def iter_cognates(dataset, column='Segments', method='turchin', threshold=0.5, **kw):
    """
    Compute cognates automatically for a given dataset.

    :param dataset: Dataset providing an ``objects['FormTable']`` sequence of rows.
    :param column: Name of the column holding the segmented form.
    :param method: Detection method: 'turchin', 'sca' or 'lexstat'.
    :param threshold: Clustering threshold for the 'sca'/'lexstat' methods.
    :param kw: Extra keyword arguments passed on to ``LexStat.get_scorer``.
    :return: Generator of dicts describing one cognate judgement each.
    """
    if method == 'turchin':
        for row in dataset.objects['FormTable']:
            sounds = ''.join(lingpy.tokens2class(row[column], 'dolgo'))
            # Turchin/CMM: words starting with a vowel get a dummy 'H' onset.
            if sounds.startswith('V'):
                sounds = 'H' + sounds
            # Keep only the first two consonant classes.
            sounds = '-'.join([s for s in sounds if s != 'V'][:2])
            cogid = slug(row['Parameter_ID']) + '-' + sounds
            # '0' marks an unconvertible token; skip such forms.
            if '0' not in sounds:
                yield dict(
                    Form_ID=row['ID'],
                    Form=row['Value'],
                    Cognateset_ID=cogid,
                    Cognate_Detection_Method='CMM')

    if method in ['sca', 'lexstat']:
        lex = _cldf2lexstat(dataset)
        if method == 'lexstat':
            lex.get_scorer(**kw)
        lex.cluster(method=method, threshold=threshold, ref='cogid')
        for k in lex:
            # Yield plain dicts here too, so both branches produce the same
            # type (previously this branch yielded Cognate objects while the
            # 'turchin' branch yielded dicts, handing consumers mixed types).
            yield dict(
                Form_ID=lex[k, 'lid'],
                Form=lex[k, 'value'],
                Cognateset_ID=lex[k, 'cogid'],
                Cognate_Detection_Method=method + '-t{0:.2f}'.format(threshold))
def toASJP(w):
    """Map a word *w* to its ASJP sound-class representation."""
    # Two hard-coded special cases are resolved before any processing.
    if w == u'ũ':
        return 'u'
    if w == u'XXX':
        return ''
    stripped = w.replace('\"', '').replace('-', '')  # .replace(' ','')
    classes = ''.join(lp.tokens2class(stripped.split(), 'asjp'))
    cleaned = cleanASJP(
        classes.replace(u'0', '').replace(u'I', u'3').replace(u'H', u'N'))
    # Drop anything outside the known ASJP sound inventory.
    return ''.join(ch for ch in cleaned if ch in sounds)
def find_bad_tokens(wordlist):
    """Collect which bad symbols appear in which forms.

    Returns a dict mapping every token that converts to the Dolgopolsky
    class "0" (i.e. is unrecognized) to the list of form references in
    which it occurs.
    """
    problems = {}
    for _, segs, ref in wordlist.iter_rows('tokens', "reference"):
        for seg, sound_class in zip(segs, lingpy.tokens2class(segs, 'dolgo')):
            if sound_class == "0":
                problems.setdefault(seg, []).append(ref)
    return problems
def ipa2sca(ipa):
    """Convert an IPA string into a SCA token string.

    This function tries to preserve the len of the token string.

    :param ipa: IPA string to convert.
    :return: Concatenated SCA sound classes, asserted to be the same
        length as ``ipa``.
    """
    # For each reversibly-tokenized chunk, zip_longest pairs its SCA
    # classes with the single-character string "0", padding the shorter
    # side with None; only the class element `t` of each pair is kept.
    # NOTE(review): if tokens2class returns an empty class string for a
    # chunk, `t` is None and the ''.join below would raise a TypeError —
    # presumably inputs never trigger this; confirm upstream.
    sca_list = [
        t for x in tokenize_word_reversibly(ipa)
        for t, char in itertools.zip_longest(tokens2class(x, 'sca'), "0")
    ]
    # Length preservation is the function's contract.
    # NOTE(review): `assert` is stripped under `python -O`.
    assert len(''.join(sca_list)) == len(ipa)
    return ''.join(sca_list)
def iter_cognates(dataset, column='Segments', method='turchin', threshold=0.5, **kw):
    """
    Compute cognates automatically for a given dataset.

    :param dataset: Either a `LexibankWriter` instance or a `pycldf.Dataset`.
    """
    # Accept both writer objects (with `.objects`) and raw pycldf datasets.
    if hasattr(dataset, 'objects'):
        forms = dataset.objects['FormTable']
    else:
        forms = list(dataset['FormTable'])

    if method == 'turchin':
        for form in forms:
            classes = ''.join(lingpy.tokens2class(form[column], 'dolgo'))
            if classes.startswith('V'):
                classes = 'H' + classes
            classes = '-'.join([cls for cls in classes if cls != 'V'][:2])
            cogid = slug(form['Parameter_ID']) + '-' + classes
            if '0' not in classes:
                yield dict(
                    Form_ID=form['ID'],
                    Form=form['Value'],
                    Cognateset_ID=cogid,
                    Cognate_Detection_Method='CMM')

    if method in ['sca', 'lexstat']:
        try:
            lex = _cldf2lexstat(dataset)
        except ValueError:
            # Nothing to cluster — bail out silently.
            return
        if method == 'lexstat':
            lex.get_scorer(**kw)
        lex.cluster(method=method, threshold=threshold, ref='cogid')
        for idx in lex:
            yield dict(
                Form_ID=lex[idx, 'lid'],
                Form=lex[idx, 'value'],
                Cognateset_ID=lex[idx, 'cogid'],
                Cognate_Detection_Method=method + '-t{0:.2f}'.format(threshold))
def get_wordlist(
        self, doculect='base', profile=False, ref='crossid',
        lexstat=True, threshold=0.4):
    """
    Return a classical wordlist from the data.

    :param doculect: Doculect name used for every row when the corpus is
        monolingual; otherwise each row keeps its own language id.
    :param profile: Orthography profile; if falsy, `lingpy.ipa2tokens`
        is used for segmentation instead.
    :param ref: Column name under which cognate ids are stored.
    :param lexstat: If True, cluster with `lingpy.LexStat`; otherwise
        assign one cognate id per (concept, form) pair.
    :param threshold: Clustering threshold passed to `LexStat.cluster`.
    :return: A `lingpy.Wordlist` (or `LexStat`) instance.
    """
    if profile:
        profile = segments.Tokenizer(profile)
        # Anchor the form with ^…$ so the profile can match word boundaries.
        tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
    else:
        tokenize = lingpy.ipa2tokens

    # Row 0 holds the header expected by lingpy's wordlist constructor.
    D = {
        0: [
            'doculect', 'concept', 'concept_in_source', 'concept_type',
            'form', 'tokens', 'occurrences', 'word_forms', 'gloss_forms',
            'phrase_example', 'gloss_example', 'references',
        ]
    }
    idx = 1
    for ctype in ['lexicon', 'grammar']:
        concepts = self.get_concepts(ctype=ctype)
        concordance = self._concordances[ctype]
        for concept, entries in concepts.items():
            for form, lid, cis, freq in entries:
                # retrieve the concordance; the first occurrence supplies
                # the example phrase/gloss shown for this form
                pidx, sA, sB = concordance[form, concept, cis, lid][0]
                txt = self[pidx].phrase
                gls = self[pidx].gloss
                word, fgls = self[pidx, sA]
                tokens = tokenize(form)
                # all occurrences, serialized as "phrase:word:morpheme"
                references = ' '.join(
                    ['{0}:{1}:{2}'.format(a, b, c) for a, b, c in
                     concordance[form, concept, cis, lid]])
                # check tokens: a form is usable only if every token can be
                # converted to a SCA sound class without raising
                try:
                    lingpy.tokens2class(tokens, 'sca')
                    check = True
                except:  # noqa: E722, # pragma: no cover
                    check = False
                if concept.strip() and check:
                    D[idx] = [
                        doculect if self.monolingual else lid,
                        concept, cis, ctype, form, tokens, freq, word,
                        fgls, txt, gls, references]
                    idx += 1
                else:
                    # report skipped forms instead of failing the export
                    print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                        concept,
                        form,
                        tokens,
                        pidx,
                        sA,
                        sB,
                    ))
    wl = lingpy.Wordlist(D)
    if lexstat:
        # NOTE(review): the Wordlist built above is discarded here and D is
        # re-wrapped as a LexStat — presumably intentional; confirm.
        wl = lingpy.LexStat(D)
        wl.cluster(method='sca', threshold=threshold, ref=ref)
    else:
        # cheap fallback: one cognate class per (concept, form) pair
        wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
        wl.renumber('cog', ref)
    return wl
def parse_chinese_morphemes(seq, context=False): """ Parse a Chinese syllable and return its basic structure. """ # get the tokens if isinstance(seq, list): tokens = [s for s in seq] else: tokens = lingpy.ipa2tokens(seq, merge_vowels=False) # get the sound classes according to the art-model arts = [int(x) for x in lingpy.tokens2class(tokens, _art, cldf=True)] # get the pro-string prostring = lingpy.prosodic_string(arts) # parse the zip of tokens and arts I, M, N, C, T = '', '', '', '', '' ini = False med = False nuc = False cod = False ton = False triples = [('?', '?', '?')] + list(zip(tokens, arts, prostring)) + [('?', '?', '?')] for i in range(1, len(triples) - 1): #enumerate(triples[1:-1]): #zip(tokens,arts,prostring): t, c, p = triples[i] _t, _c, _p = triples[i - 1] t_, c_, p_ = triples[i + 1] # check for initial entry first if p == 'A' and _t == '?': # now, if we have a j-sound and a vowel follows, we go directly to # medial environment if t[0] in 'jɥw': med = True ini, nuc, cod, ton = False, False, False, False else: ini = True med, nuc, doc, ton = False, False, False, False # check for initial vowel elif p == 'X' and _t == '?': if t[0] in 'iuy' and c_ == '7': med = True ini, nuc, cod, ton = False, False, False, False else: nuc = True ini, med, cod, ton = False, False, False, False # check for medial after initial elif p == 'C': med = True ini, nuc, cod, ton = False, False, False, False # check for vowel medial elif p == 'X' and p_ == 'Y': # if we have a medial vowel, we classify it as medial if t in 'iyu': med = True ini, nuc, cod, ton = False, False, False, False else: nuc = True ini, med, cod, ton = False, False, False, False # check for vowel without medial elif p == 'X' or p == 'Y': if p_ in 'LTY' or p_ == '?': nuc = True ini, med, cod, ton = False, False, False, False elif p == 'Y': nuc = True ini, med, cod, ton = 4 * [False] else: cod = True ini, med, nuc, ton = 4 * [False] # check for consonant elif p == 'L': cod = True ini, med, nuc, ton = 4 * 
[False] # check for tone elif p == 'T': ton = True ini, med, nuc, cod = 4 * [False] if ini: I += t elif med: M += t elif nuc: N += t elif cod: C += t else: T += t # bad conversion for output, but makes what it is supposed to do out = [I, M, N, C, T] tf = lambda x: x if x else '-' out = [tf(x) for x in out] # transform tones to normal letters tones = dict(zip('¹²³⁴⁵⁶⁷⁸⁹⁰₁₂₃₄₅₆₇₈₉₀', '1234567890123456789')) # now, if context is wanted, we'll yield that ic = '1' if [x for x in I if x in 'bdgmnŋȵɳɴ'] else '0' mc = '1' if [m for m in M + N if m in 'ijyɥ'] else '0' cc = '1' if C in 'ptkʔ' else '0' tc = ''.join([tones.get(x, x) for x in T]) IC = '/'.join(['I', ic, mc, cc, tc]) if I else '' MC = '/'.join(['M', ic, mc, cc, tc]) if M else '' NC = '/'.join(['N', ic, mc, cc, tc]) if N else '' CC = '/'.join(['C', ic, mc, cc, tc]) if C else '' TC = '/'.join(['T', ic, mc, cc, tc]) if T else '' if context: return out, [x for x in [IC, MC, NC, CC, TC] if x]
def to_asjp(segments):
    """Convert a sequence of segments to ASJP sound classes via *model*."""
    classes = lingpy.tokens2class(segments, model, cldf=False)
    return classes