def evaluate_string(self, string, tokens=False, **keywords):
    """
    Score a sequence against the transition distributions in ``self.dist``.

    Parameters
    ----------
    string : {str, list}
        The sequence to evaluate. It is segmented with ``ipa2tokens`` unless
        ``tokens`` says otherwise.
    tokens : {bool, list} (default=False)
        Either an already tokenized version of the sequence, or ``True`` to
        signal that ``string`` itself is already a sequence of tokens.
    keywords : dict
        Passed on to both ``tokens2class`` and ``prosodic_string`` when the
        prosodic string is computed.

    Returns
    -------
    score, lscore : tuple
        ``score`` is the product of the relative frequencies of each segment
        in the distribution of its predecessor (starting from ``'#'``);
        ``lscore`` is ``log10(score)`` divided by the number of tokens.

    Raises
    ------
    ValueError
        If the sequence yields no segments that could be evaluated.
    """
    if tokens is True:
        # A bare flag means the caller already passed segmented input.
        tokens = string
    elif not tokens:
        tokens = ipa2tokens(string)

    prostring = prosodic_string(
        tokens2class(tokens, model=rcParams['art'], **keywords), **keywords)

    # Pair each prosodic symbol with its sound class or with the raw token,
    # depending on whether classes were stored on the instance.
    if self.classes:
        symbols = tokens2class(tokens, model=self.model)
    else:
        symbols = tokens
    teststring = list(zip(prostring, symbols))
    if not teststring:
        raise ValueError('Cannot evaluate an empty sequence.')

    score = 1
    dist = self.dist['#']
    for segment in teststring:
        s = dist.count(segment) / len(dist)
        score = score * s
        dist = self.dist[segment]

    # NOTE(review): the probability of the last transition is multiplied in
    # a second time here; this is kept from the previous implementation --
    # confirm whether a separate end-of-word transition was intended.
    score = score * s

    lscore = np.log10(score) / len(tokens)
    return score, lscore
def __init__(self, words, tokens=False, prostrings=None, classes=False,
             class_model=rcParams['model'], **keywords):
    """
    Collect (prosodic symbol, segment) pairs for all words and initialise
    the ``MCBasic`` parent with them.

    Parameters
    ----------
    words : list
        The input words.
    tokens : bool (default=False)
        If true, each word is taken to be tokenized already and is copied
        as is; otherwise it is segmented with ``ipa2tokens``.
    prostrings : list (default=None)
        Optional prosodic strings, one per word and in the same order as
        ``words``. If empty or missing, they are computed with
        ``prosodic_string``.
    classes : bool (default=False)
        If true, prosodic symbols are paired with sound classes (which are
        also stored in ``self.classes``) instead of the raw tokens.
    class_model : Model (default=rcParams['model'])
        The sound-class model used when ``classes`` is true.
    keywords : dict
        ``stress``, ``diacritics`` and ``cldf`` (defaulting to the
        ``rcParams`` values and ``False``) are forwarded to the conversion
        functions; all keywords are forwarded to ``ipa2tokens``.
    """
    setdefaults(keywords, stress=rcParams['stress'],
                diacritics=rcParams['diacritics'], cldf=False)
    self.model = class_model
    self.words = words
    self.tokens = []
    self.bigrams = []
    self.classes = []

    # options shared by the prosodic-string and the sound-class conversion
    options = dict(
        cldf=keywords['cldf'],
        diacritics=keywords['diacritics'],
        stress=keywords['stress'])

    # start filling the dictionary
    for i, w in enumerate(words):
        # check for tokenized string
        tk = w[:] if tokens else ipa2tokens(w, **keywords)
        self.tokens += [tk]

        # create prosodic string
        if prostrings:
            p = prostrings[i]
        else:
            p = prosodic_string(tk, rcParams['art'], **options)

        # create classes or fall back to the plain tokens
        if classes:
            c = tokens2class(tk, class_model, **options)
            bigrams = list(zip(p, c))
            self.classes += [c]
        else:
            bigrams = list(zip(p, tk))

        self.bigrams += [bigrams]

    # init the mother object
    MCBasic.__init__(self, self.bigrams)
def __init__(
        self,
        words,
        tokens=False,
        prostrings=None,
        classes=False,
        class_model=rcParams['model'],
        **keywords
):
    """
    Build the list of (prosodic symbol, segment) pairs for every word and
    hand it to ``MCBasic.__init__``.

    Parameters
    ----------
    words : list
        The input words.
    tokens : bool (default=False)
        If true, the words are treated as already tokenized and copied;
        otherwise they are segmented with ``ipa2tokens``.
    prostrings : list (default=None)
        Optional prosodic strings aligned by position with ``words``. When
        empty or missing they are derived with ``prosodic_string``.
    classes : bool (default=False)
        If true, sound classes computed with ``class_model`` are paired with
        the prosodic symbols and stored in ``self.classes``; otherwise the
        raw tokens are paired.
    class_model : Model (default=rcParams['model'])
        The sound-class model used when ``classes`` is true.
    keywords : dict
        ``stress``, ``diacritics`` and ``cldf`` receive defaults and are
        forwarded to the conversion functions; all keywords are forwarded
        to ``ipa2tokens``.
    """
    setdefaults(
        keywords,
        stress=rcParams['stress'],
        diacritics=rcParams['diacritics'],
        cldf=False)
    self.model = class_model
    self.words = words
    self.tokens = []
    self.bigrams = []
    self.classes = []

    # conversion options used for every word
    conversion = {
        'cldf': keywords['cldf'],
        'diacritics': keywords['diacritics'],
        'stress': keywords['stress'],
    }

    for i, w in enumerate(words):
        # check for tokenized string
        if tokens:
            tk = w[:]
        else:
            tk = ipa2tokens(w, **keywords)
        self.tokens += [tk]

        # create prosodic string unless one was supplied for this word
        if prostrings:
            p = prostrings[i]
        else:
            p = prosodic_string(tk, rcParams['art'], **conversion)

        # pair prosody with sound classes or with the tokens themselves
        if classes:
            c = tokens2class(tk, class_model, **conversion)
            self.classes += [c]
            bigrams = list(zip(p, c))
        else:
            bigrams = list(zip(p, tk))

        self.bigrams += [bigrams]

    # init the mother object
    MCBasic.__init__(self, self.bigrams)
def test_tokens2class():
    """
    Check ``tokens2class`` for the ``dolgo`` and ``cv`` models, with and
    without ``clpa=True``, and that a two-token input raises ``IndexError``.
    """
    plain = 'tʰ ɔ x ˈth ə r A ˈI ʲ'.split(' ')
    slash_x = 'th o ?/x a'.split(' ')
    slash_empty = 'th o ?/ a'.split(' ')

    dolgo_classes = tokens2class(plain, 'dolgo')
    assert dolgo_classes == list('TVKTVR000')

    # the third token is the one written with the "?/" prefix
    cv_default = tokens2class(slash_x, 'cv')
    assert cv_default[2] == '0'
    cv_clpa = tokens2class(slash_x, 'cv', clpa=True)
    assert cv_clpa[2] == 'C'
    cv_clpa_empty = tokens2class(slash_empty, 'cv', clpa=True)
    assert cv_clpa_empty[2] == '0'

    assert_raises(IndexError, tokens2class, 'b l'.split(' '), 'dolgo')
def test_tokens2class(self):
    """
    Check ``tokens2class`` for the ``dolgo`` and ``cv`` models, with the
    ``cldf`` switch on and off, and that invalid input raises ``ValueError``.
    """
    plain = 'tʰ ɔ x ˈth ə r A ˈI ʲ'.split(' ')
    slash_x = 'th o ?/x a'.split(' ')
    slash_empty = 'th o ?/ a'.split(' ')

    dolgo_classes = tokens2class(plain, 'dolgo')
    assert dolgo_classes == list('TVKTVR000')

    # the third token is the one written with the "?/" prefix
    without_cldf = tokens2class(slash_x, 'cv', cldf=False)
    assert without_cldf[2] == '0'
    default_call = tokens2class(slash_x, 'cv')
    assert default_call[2] == 'C'
    with_cldf_empty = tokens2class(slash_empty, 'cv', cldf=True)
    assert with_cldf_empty[2] == '0'

    assert_raises(ValueError, tokens2class, ['A'], 'dolgo')
    assert_raises(ValueError, tokens2class, 'bla', 'sca')
def test_tokens2class():
    """
    Check the ``dolgo`` classes of a sample sequence and that a two-token
    input raises ``ValueError``.
    """
    sample = 'tʰ ɔ x ˈth ə r A ˈI'.split(' ')
    expected = list('TVKTVR00')

    converted = tokens2class(sample, 'dolgo')
    assert converted == expected

    assert_raises(ValueError, tokens2class, 'b l'.split(' '), 'dolgo')
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings. Each argument is tokenized with ``ipa2tokens``
        on its own if it is a string, so strings and token lists can be
        mixed.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.
    keywords : dict
        Accepted for compatibility; not used by this function.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate)
        or 1 (words are not likely to be cognate).

    Raises
    ------
    ValueError
        If ``model`` is neither a string nor an object with an ``info``
        attribute.
    """
    if isinstance(model, (text_type, str)):
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    # Decide per sequence whether tokenization is needed; checking only the
    # first one mishandles mixed string / token-list input.
    if isinstance(seqA, (text_type, str)):
        seqA = ipa2tokens(seqA)
    if isinstance(seqB, (text_type, str)):
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    # an initial vowel is replaced by the class "H"
    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    # compare the first two non-vowel classes of both words
    keyA = ''.join([k for k in classA if k not in model.vowels])[:2]
    keyB = ''.join([k for k in classB if k not in model.vowels])[:2]
    return 0 if keyA == keyB else 1
def ipa_to_asjp(w):
    """
    Lingpy IPA-to-ASJP converter plus some cleanup.

    Double quotes, hyphens and spaces are stripped from ``w``, the rest is
    tokenized and converted to the ``asjp`` sound classes, post-processed
    with ``clean_asjp`` and reduced to the characters found in ``sounds``.
    This function is called on IPA datasets.
    """
    stripped = w
    for unwanted in ('\"', '-', ' '):
        stripped = stripped.replace(unwanted, '')

    segments = ipa2tokens(stripped, merge_vowels=False)
    classes = ''.join(tokens2class(segments, 'asjp'))

    # drop the class "0" and remap "I" and "H" before cleaning
    for old, new in (('0', ''), ('I', '3'), ('H', 'N')):
        classes = classes.replace(old, new)
    cleaned = clean_asjp(classes)

    kept = [symbol for symbol in cleaned if symbol in sounds]
    return ''.join(kept)
def _set_model(self, **keywords):
    """
    Define the sequence model for the calculation.

    Parameters
    ----------
    model : { str, Model } (default=rcParams['sca'])
        The sound-class model to which the strings shall be converted. A
        string is looked up in ``rcParams``.
    stress : (default=rcParams['stress'])
        Forwarded to ``tokens2class``.
    transform : (default=rcParams['align_transform'])
        Forwarded to ``prosodic_weights`` as ``_transform``.

    Notes
    -----
    Sets ``self.model``, ``self.classes``, ``self.weights`` and
    ``self.scoredict``.
    """
    fallbacks = (
        ('model', rcParams['sca']),
        ('stress', rcParams['stress']),
        ('transform', rcParams['align_transform']),
    )
    for key, value in fallbacks:
        keywords.setdefault(key, value)

    requested = keywords['model']
    if isinstance(requested, (text_type, str)):
        requested = rcParams[requested]
    self.model = requested

    # one pair of sound-class sequences per pair of token sequences
    stress = keywords['stress']
    self.classes = []
    for pair in self.tokens:
        self.classes.append((
            tokens2class(pair[0], self.model, stress=stress),
            tokens2class(pair[1], self.model, stress=stress),
        ))

    # one pair of weight sequences per pair of prosodic strings
    transform = keywords['transform']
    self.weights = []
    for prA, prB in self.prostrings:
        self.weights.append((
            prosodic_weights(prA, _transform=transform),
            prosodic_weights(prB, _transform=transform),
        ))

    self.scoredict = self.model.scorer
def turchin(seqA, seqB, model='dolgo', **keywords):
    """
    Return cognate judgment based on the method by :evobib:`Turchin2010`.

    Parameters
    ----------
    seqA, seqB : {str, list, tuple}
        The input strings. These should be iterables, so you can use tuples,
        lists, or strings. A string argument is tokenized with
        ``ipa2tokens``; this is decided for each argument separately, so a
        string may be compared with a token list.
    model : {"asjp", "sca", "dolgo"} (default="dolgo")
        A sound-class model instance or a string that denotes one of the
        standard sound class models used in LingPy.
    keywords : dict
        Accepted for compatibility; not used by this function.

    Returns
    -------
    cognacy : {0, 1}
        The cognacy assertion which is either 0 (words are probably cognate)
        or 1 (words are not likely to be cognate).

    Raises
    ------
    ValueError
        If ``model`` is neither a string nor an object with an ``info``
        attribute.
    """
    if isinstance(model, string_types):
        model = rcParams[model]
    elif not hasattr(model, 'info'):
        raise ValueError("[!] No valid model instance selected.")

    # Tokenize each sequence based on its own type; testing only ``seqA``
    # breaks when exactly one of the two arguments is a string.
    if isinstance(seqA, string_types):
        seqA = ipa2tokens(seqA)
    if isinstance(seqB, string_types):
        seqB = ipa2tokens(seqB)

    classA = tokens2class(seqA, model)
    classB = tokens2class(seqB, model)

    # an initial vowel is replaced by the class "H"
    if classA[0] in model.vowels:
        classA[0] = 'H'
    if classB[0] in model.vowels:
        classB[0] = 'H'

    # the judgment rests on the first two non-vowel classes of each word
    consonantsA = ''.join([k for k in classA if k not in model.vowels])[:2]
    consonantsB = ''.join([k for k in classB if k not in model.vowels])[:2]
    return int(consonantsA != consonantsB)
def ipa_to_asjp(w, params):
    """
    Lingpy IPA-to-ASJP converter plus some cleanup.

    Expects the params {} to contain the key: sounds. Double quotes,
    hyphens and spaces are stripped from ``w``, the rest is tokenized,
    converted to the ``asjp`` sound classes, post-processed with
    ``clean_asjp`` and reduced to the characters in ``params['sounds']``.
    The result is asserted to be non-empty.

    This function is called on IPA datasets.
    """
    stripped = w
    for unwanted in ('\"', '-', ' '):
        stripped = stripped.replace(unwanted, '')

    segments = ipa2tokens(stripped, merge_vowels=False)
    classes = ''.join(tokens2class(segments, 'asjp'))

    # drop the class "0" and remap "I" and "H" before cleaning
    for old, new in (('0', ''), ('I', '3'), ('H', 'N')):
        classes = classes.replace(old, new)
    cleaned = clean_asjp(classes)

    asjp = ''.join([symbol for symbol in cleaned if symbol in params['sounds']])
    assert len(asjp) > 0
    return asjp
def get_structure(word, sep='+', zipped=False,
                  semi_diacritics='hsʃʂʒʐzθɕʑfvθðnmȵ'):
    """
    Yield the structure of every morpheme of ``word``.

    Parameters
    ----------
    word : {str, list, tuple}
        The word to analyse. Anything that is not a list or tuple is
        tokenized with ``lingpy.ipa2tokens`` first.
    sep : str (default='+')
        If ``sep`` occurs in the token list, each morpheme is split again
        with ``tokens2morphemes(..., sep=sep)``.
    zipped : bool (default=False)
        Selects the shape of the yielded items (see below).
    semi_diacritics : str
        Forwarded to ``lingpy.ipa2tokens``.

    Yields
    ------
    structure : {list, tuple}
        With ``zipped=False`` the list of position labels (taken from
        ``'imnct'``) whose segment is not ``'-'``. With ``zipped=True`` a
        tuple of the list of ``(label, segment)`` pairs and the morpheme.
        If a morpheme cannot be parsed, ``['NULL']`` or
        ``([('NULL', 'NULL')], morpheme)`` is yielded for it instead.

    Notes
    -----
    If ``tokens2class`` rejects the word with a ``ValueError``, a message is
    printed and nothing is yielded.
    """
    if not isinstance(word, (list, tuple)):
        word = lingpy.ipa2tokens(
            word,
            expand_nasals=True,
            merge_vowels=False,
            semi_diacritics=semi_diacritics)

    # check for unknown chars
    try:
        tokens2class(word, 'cv', cldf=True)
    except ValueError:
        print('problem with {0}'.format(''.join(word)))
        return

    # get the morphemes
    if sep in word:
        morphemes = []
        for part in tokens2morphemes(word, cldf=True):
            morphemes += tokens2morphemes(part, sep=sep)
    else:
        morphemes = tokens2morphemes(word, cldf=True)

    # get the basic structure for each morpheme
    for morpheme in morphemes:
        try:
            segments = parse_chinese_morphemes(morpheme)
        except Exception:
            # Emit the placeholder and move on: without the ``continue`` the
            # code below would run with ``segments`` undefined or left over
            # from the previous morpheme.
            if not zipped:
                yield ['NULL']
            else:
                yield ([('NULL', 'NULL')], morpheme)
            continue

        if not zipped:
            yield [x for x, y in zip('imnct', segments) if y != '-']
        else:
            yield ([x for x in zip('imnct', segments) if x[1] != '-'],
                   morpheme)
def iter_cognates(dataset, column='Segments', method='turchin',
                  threshold=0.5, **keywords):
    """
    Compute cognates automatically for a given dataset.

    With ``method='turchin'`` every row of ``dataset.rows`` is converted to
    ``dolgo`` sound classes and a cognate id is built from the slugged
    ``Parameter_name`` and the first two non-vowel classes; rows whose key
    contains ``'0'`` are skipped. With ``method`` set to ``'sca'`` or
    ``'lexstat'`` the dataset is converted with ``_cldf2lexstat`` and
    clustered. Any other method yields nothing.

    Each yielded item is a 10-tuple: id, dataset name, value, cognate id,
    an empty field, the method label and four empty fields (cognate source,
    alignment, alignment method, alignment source).
    """
    # cognate source, alignment, alignment method, alignment source
    empty_tail = ('', '', '', '')

    if method == 'turchin':
        for row in dataset.rows:
            classes = ''.join(tokens2class(row[column].split(' '), 'dolgo'))
            if classes.startswith('V'):
                classes = 'H' + classes
            consonants = [cls for cls in classes if cls != 'V']
            key = '-'.join(consonants[:2])
            cogid = slug(row['Parameter_name']) + '-' + key
            if '0' in key:
                continue
            yield (
                row['ID'],
                dataset.name,
                row['Value'],
                cogid,
                '',
                'CMM',
            ) + empty_tail

    if method in ('sca', 'lexstat'):
        lex = _cldf2lexstat(dataset)
        if method == 'lexstat':
            lex.get_scorer(**keywords)
        lex.cluster(method=method, threshold=threshold, ref='cogid')
        for k in lex:
            yield (
                lex[k, 'lid'],
                dataset.name,
                lex[k, 'value'],
                lex[k, 'cogid'],
                '',
                method + '-t{0:.2f}'.format(threshold),
            ) + empty_tail
def test_sequence(sequence, **keywords):
    """
    Test a sequence for compatibility with CLPA and LingPy.

    The sequence is cleaned with ``clean_string`` (only the first result is
    used), split on spaces and checked with ``tokens2class`` (``dolgo``
    model) and ``clpa.check_sequence``. If any of these steps raises
    ``ValueError``, ``IndexError`` or ``AttributeError``, the sequence is
    counted in ``invalid`` and no segments are analysed.

    Returns
    -------
    tuple
        ``(segments, clpa_segments, invalid, segment_count, lingpy_errors,
        clpa_errors, clpa_repl, general_errors)`` where ``clpa_segments`` is
        ``clpa.segment2clpa`` applied to every item of the CLPA analysis.
    """
    invalid = Counter()
    segment_count = Counter()
    lingpy_errors = set()
    clpa_errors = set()
    clpa_repl = defaultdict(set)
    general_errors = 0

    # clean the string at first, we only take the first item, ignore the rest
    try:
        segments = clean_string(sequence, **keywords)[0].split(' ')
        classes = tokens2class(segments, 'dolgo')
        lingpy_analysis = [
            '?' if cls == '0' else seg for seg, cls in zip(segments, classes)
        ]
        clpa_analysis, _sounds, _errors = clpa.check_sequence(segments)
        general_errors = sum(
            1 for pair in zip(lingpy_analysis, clpa_analysis) if '?' in pair)
    except (ValueError, IndexError, AttributeError):
        invalid.update([sequence])
        segments, clpa_analysis = [], []

    # the guard matters: ``lingpy_analysis`` may be unset after a failure
    if segments:
        for seg, lingpy_seg, clpa_seg in zip(
                segments, lingpy_analysis, clpa_analysis):
            # strip a leading accent mark before counting and comparing
            if seg[0] in clpa.accents:
                seg = seg[1:]
            if clpa_seg[0] in clpa.accents:
                clpa_seg = clpa_seg[1:]

            segment_count.update([seg])
            if lingpy_seg == '?':
                lingpy_errors.add(seg)
            if clpa_seg == seg:
                continue
            if clpa_seg == '?':
                clpa_errors.add(seg)
            else:
                clpa_repl[seg].add(clpa_seg)

    clpa_segments = [clpa.segment2clpa(x) for x in clpa_analysis]
    return (
        segments,
        clpa_segments,
        invalid,
        segment_count,
        lingpy_errors,
        clpa_errors,
        clpa_repl,
        general_errors,
    )
def msa2html(
        msa,
        shorttitle='',
        filename='',
        template='',
        **keywords
):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The path of a file in ``msa``-format, which is loaded with
        ``read_msa``.
        NOTE(review): this parameter used to be documented as a dictionary
        holding the MSA data, but any non-string value is rejected with a
        ``ValueError`` -- confirm the intended contract.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        if missing.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be
        found under ``lingpy/data/templates/msa2html.html``. The special
        value ``"js"`` selects ``msa2html.js.html``.
    keywords : dict
        ``pid_mode`` (default=1) is passed to ``pid``; ``pid_score`` skips
        the computation of the score if given; ``css`` and ``js`` are paths
        that replace the default ``msa.css`` / ``msa.js``; ``compact``
        (default=False) collapses whitespace in the output; ``class_sort``
        (default=True) ranks sequences by their ``asjp`` sound classes;
        ``write_to_file`` (default=True) decides whether the page is written
        or returned. All keywords are also forwarded to ``read_msa``.

    Returns
    -------
    html : {str, None}
        The rendered page if ``write_to_file`` is false, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        total = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    total += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * total / count + 0.5)
        else:
            # Fewer than two sequences: there is no pair to compare, so
            # report full identity instead of dividing by zero.
            pid_score = 100
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    width = len(msa['alignment'][0])

    # check for swaps in the alignment
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # columns listed in msa['local'] are marked '*'; without that key
    # every column is marked
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = dict(
            (pair[1], rank) for rank, pair in enumerate(ranked, 1))
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # the dolgo model returns three class names that need another css label
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # set up a store for the alignments seen so far
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))

    html = html.format(
        table=''.join(rows),
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js
    )

    if not filename:
        filename = rcParams['filename']
    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html
def msa2html(msa, shorttitle='', filename='', template='', **keywords):
    """
    Convert files in ``msa``-format into colored ``html``-format.

    Parameters
    ----------
    msa : str
        The path of a file in ``msa``-format, which is loaded with
        ``read_msa``.
        NOTE(review): this parameter used to be documented as a dictionary
        holding the MSA data, but any non-string value is rejected with a
        ``ValueError`` -- confirm the intended contract.
    shorttitle : str
        Define the shorttitle of the ``html``-page. If no title is provided,
        the default title ``SCA`` will be used.
    filename : str (default="")
        Define the name of the output file. If no name is defined,
        ``rcParams['filename']`` is used. The suffix ``.html`` is appended
        if missing.
    template : str (default="")
        The path to the template file. If no name is defined, the basic
        template will be used. The basic template currently used can be
        found under ``lingpy/data/templates/msa2html.html``. The special
        value ``"js"`` selects ``msa2html.js.html``.
    keywords : dict
        ``pid_mode`` (default=1) is passed to ``pid``; ``pid_score`` skips
        the computation of the score if given; ``css`` and ``js`` are paths
        that replace the default ``msa.css`` / ``msa.js``; ``compact``
        (default=False) collapses whitespace in the output; ``class_sort``
        (default=True) ranks sequences by their ``asjp`` sound classes;
        ``write_to_file`` (default=True) decides whether the page is written
        or returned. All keywords are also forwarded to ``read_msa``.

    Returns
    -------
    html : {str, None}
        The rendered page if ``write_to_file`` is false, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``msa`` is not a string.

    Examples
    --------
    Load the library.

    >>> from lingpy import *

    Load an ``msq``-file from the test-sets.

    >>> msa = MSA('harry.msq')

    Align the data progressively and carry out a check for swapped sites.

    >>> msa.prog_align()
    >>> msa.swap_check()
    >>> print(msa)
    w    o    l    -    d    e    m    o    r    t
    w    a    l    -    d    e    m    a    r    -
    v    -    l    a    d    i    m    i    r    -

    Save the data to the file ``harry.msa``.

    >>> msa.output('msa',filename='harry')

    Save the ``msa``-object as ``html``.

    >>> msa.output('html',filename='harry')

    Notes
    -----
    The coloring of sound segments with respect to the sound class they
    belong to is based on the definitions given in the ``color``
    :py:class:`~lingpy.data.model.Model`. It can easily be changed and
    adapted.

    See also
    --------
    lingpy.convert.html.alm2html
    """
    util.setdefaults(
        keywords,
        pid_mode=1,
        stress=rcParams['stress'],
        css=False,
        js=False,
        compact=False,
        class_sort=True,
        write_to_file=True,
    )

    # while alm-format can be read from the text-file without problems,
    # msa-format should be loaded first (once this is already provided), the
    # loss in speed won't matter much, since output of data is not a daily task

    # load templates
    template = template or template_path('msa2html.html')
    if template == 'js':
        template = template_path('msa2html.js.html')
    html = util.read_text_file(template)
    css = util.read_text_file(keywords['css'] or template_path('msa.css'))
    js = util.read_text_file(keywords['js'] or template_path('msa.js'))

    # treat the msa-object as a file and try to load the file if this is the
    # case
    if isinstance(msa, string_types):
        msa = read_msa(msa, **keywords)
    else:
        raise ValueError('[!] No filename specified.')

    # load dataset, etc.
    dataset = msa['dataset']

    # calculate pid score, if it is not passed as argument
    if 'pid_score' not in keywords:
        total = 0
        count = 0
        for i, seqA in enumerate(msa['alignment']):
            for j, seqB in enumerate(msa['alignment']):
                if i < j:
                    total += pid(seqA, seqB, mode=keywords['pid_mode'])
                    count += 1
        if count:
            pid_score = int(100 * total / count + 0.5)
        else:
            # Fewer than two sequences: there is no pair to compare, so
            # report full identity instead of dividing by zero.
            pid_score = 100
    else:
        pid_score = keywords['pid_score']

    infile = msa['infile']
    seq_id = msa['seq_id']

    # define the titles etc.
    if not shorttitle:
        shorttitle = 'SCA'

    # determine the length of the longest taxon
    taxl = max([len(t) for t in msa['taxa']])

    # format css file
    css = css.replace('TAXON_LENGTH', str(taxl * 10))

    tr = '<tr class="msa" unique="{1}" taxon={2} sequence={3}>{0}</tr>\n'
    td_taxon = '<td class="taxon">{0}</td>'
    td_residue = '<td class="residue {1}">{0}</td>'
    td_swap = '<td class="residue swap {1}">{0}</td>'
    td_unaligned = '<td class="residue noalign {1}">{0}</td>'

    width = len(msa['alignment'][0])

    # check for swaps in the alignment
    swaps = []
    if 'swaps' in msa:
        for s in msa['swaps']:
            swaps.extend(s)

    # columns listed in msa['local'] are marked '*'; without that key
    # every column is marked
    local = ['*'] * width
    if 'local' in msa:
        local = ['.'] * width
        for i in msa['local']:
            local[i] = '*'

    # get two sorting schemas for the sequences
    if keywords['class_sort']:
        classes = [
            tokens2class(ipa2tokens(seq), rcParams['asjp'])
            for seq in msa['seqs']
        ]
        ranked = sorted(zip(classes, msa['seqs']), key=lambda x: x[0])
        seqs = dict(
            (pair[1], rank) for rank, pair in enumerate(ranked, 1))
    else:
        seqs = dict(zip(sorted(msa['seqs']), range(1, len(msa['seqs']) + 1)))
    taxa = dict(zip(sorted(msa['taxa']), range(1, len(msa['taxa']) + 1)))

    # the dolgo model returns three class names that need another css label
    renamed = {
        'dolgo__': 'dolgo_X',
        'dolgo_1': 'dolgo_TONE',
        'dolgo_0': 'dolgo_ERROR',
    }

    # set up a store for the alignments seen so far
    seen = set()
    rows = []

    # start iteration
    for i, taxon in enumerate(msa['taxa']):
        cells = [td_taxon.format(taxon)]

        alignment = ''.join(msa['alignment'][i])
        sequence = msa['seqs'][i]
        if alignment in seen:
            unique = 'false'
        else:
            unique = 'true'
            seen.add(alignment)

        for j, char in enumerate(msa['alignment'][i]):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])
                d = renamed.get(d, d)

            if j in swaps:
                cells.append(td_swap.format(char, d))
            elif local[j] != '*':
                cells.append(td_unaligned.format(char, d))
            else:
                cells.append(td_residue.format(char, d))
        rows.append(
            tr.format(''.join(cells), unique, taxa[taxon], seqs[sequence]))

    html = html.format(
        table=''.join(rows),
        dataset=dataset,
        pid=pid_score,
        file=infile,
        sequence=seq_id,
        shorttitle=shorttitle,
        width=width,
        table_width='{0}'.format(width * 50 + 8 * taxl),
        taxa=len(msa['alignment']),
        uniseqs=len(set(msa['seqs'])),
        css=css,
        js=js)

    if not filename:
        filename = rcParams['filename']
    if not filename.endswith('.html'):
        filename = filename + '.html'

    if keywords['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    if keywords['write_to_file']:
        util.write_text_file(filename, html)
    else:
        return html