def test_align(self):
    self.alm.add_entries('cugid', self.alm._ref, lambda x: text_type(x))
    self.alm.add_alignments(ref="cugid")

    # align all sequences using standard params
    self.alm.align(ref="cugid", alignment="alignment2")
    # cugid keys are strings (created via text_type), cogid keys are ints
    assert self.alm.msa["cugid"]["1"]["ID"] == self.alm.msa["cogid"][1]["ID"]

    # iterate and align using the multiple function
    for key, value in self.alm.msa['cogid'].items():
        # first compare simple alignments
        msaA = lp.SCA(value)
        msaB = lp.Multiple(value['seqs'])
        msaB.prog_align()
        assert msaA == msaB

        # now compare with different flag
        msaA = lp.Multiple([self.alm[idx, 'tokensb'] for idx in value['ID']])
        msaB = lp.Multiple([''.join(s) for s in value['seqs']],
                           merge_vowels=False)
        msaA.lib_align()
        msaB.lib_align()
        assert msaA == msaB
def test_align(self):
    # align all sequences using standard params
    self.alm.align()

    # iterate and align using the multiple function
    for key, value in self.alm.msa['cogid'].items():
        # first compare simple alignments
        msaA = lp.SCA(value)
        msaB = lp.Multiple(value['seqs'])
        msaB.prog_align()
        assert msaA == msaB

        # now compare with different flag
        msaA = lp.Multiple([self.alm[idx, 'tokensb'] for idx in value['ID']])
        msaB = lp.Multiple([''.join(s) for s in value['seqs']],
                           merge_vowels=False)
        msaA.lib_align()
        msaB.lib_align()
        assert msaA == msaB
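# A minimal sketch (not from the original suite) of the fixture both tests
# above assume: `self.alm` is a lingpy Alignments object over a wordlist with
# `tokens`, `tokensb`, and `cogid` columns; `lp` is lingpy and `text_type`
# comes from six. The file name below is hypothetical.
import lingpy as lp

def make_alignments_fixture():
    # Alignments wraps a wordlist and exposes one MSA per cognate set
    # under .msa[ref]
    alm = lp.Alignments('wordlist.tsv', ref='cogid')
    alm.align()  # default SCA progressive alignment
    return alm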
def cmd_makecldf(self, args):
    # import under an alias so the module is not shadowed by the `errors`
    # set defined below
    from pybtex import errors as pybtex_errors, database
    pybtex_errors.strict = False
    bibdata = database.parse_file(
        str(self.raw_dir.joinpath('bibliography', 'sources.bib')))
    args.writer.add_sources(bibdata)

    args.writer["FormTable", "Segments"].datatype = Datatype.fromvalue({
        "base": "string",
        "format": "([\\S]+)( [\\S]+)*"
    })
    args.writer["FormTable", "Morphemes"].separator = " "
    args.writer["FormTable", "PartialCognates"].separator = " "

    concepts = {}
    errors, blacklist = set(), set()
    for concept in self.conceptlists[0].concepts.values():
        idx = '{0}_{1}'.format(concept.number, slug(concept.english))
        args.writer.add_concept(
            ID=idx,
            Name=concept.english,
            Portuguese_Gloss=concept.attributes["portuguese"],
            Concepticon_ID=concept.concepticon_id,
            Concepticon_Gloss=concept.concepticon_gloss,
            EOL_ID=concept.attributes["eol"],
            Semantic_Field=concept.attributes["semantic_field"])
        concepts[concept.english] = idx

    languages = {}
    sources = {}
    for row in self.languages:
        if not -90 < float(row['Latitude']) < 90:
            errors.add('LATITUDE {0}'.format(row['Name']))
        elif not -180 < float(row['Longitude']) < 180:
            errors.add('LONGITUDE {0}'.format(row['Name']))
        else:
            try:
                args.writer.add_language(
                    ID=row['ID'],
                    Name=row['Name'],
                    SubGroup=row['SubGroup'],
                    Latitude=row['Latitude'],
                    Longitude=row['Longitude'],
                    Glottocode=row['Glottocode']
                    if row['Glottocode'] != '???' else None,
                )
                languages[row['Name']] = row['ID']
                sources[row['Name']] = []
                for source in row['Sources'].split(','):
                    if source in bibdata.entries:
                        sources[row['Name']] += [source]
                    else:
                        errors.add('BIBTEX MISSING {0}'.format(source))
            except ValueError:
                errors.add('LANGUAGE ID {0}'.format(row['ID']))
                args.log.warn('Invalid Language ID {0}'.format(row['ID']))

    wl = lingpy.Wordlist(self.raw_dir.joinpath('tuled.tsv').as_posix())
    etd = wl.get_etymdict(ref='cogids')
    alignments, problems = {}, set()
    for cogid, vals in progressbar(etd.items(), desc='aligning data'):
        idxs = []
        for idx in vals:
            if idx:
                idxs += idx
        positions = [wl[idx, 'cogids'].index(cogid) for idx in idxs]
        alms, new_idxs = [], []
        for idx, pos in zip(idxs, positions):
            try:
                tks = lingpy.basictypes.lists(wl[idx, 'tokens']).n[pos]
                if not ' '.join(tks).strip():
                    raise IndexError
                alms += [tks]
                new_idxs += [(idx, pos)]
            except IndexError:
                problems.add((idx, pos))
        if alms:
            msa = lingpy.Multiple(alms)
            msa.prog_align()
            for i, alm in enumerate(msa.alm_matrix):
                alignments[new_idxs[i][0], new_idxs[i][1], cogid] = \
                    ' '.join(alm)
        else:
            errors.add('ALIGNMENT empty {0}'.format(cogid))

    bipa = CLTS(args.clts.dir).bipa
    for idx, tokens, glosses, cogids, alignment in wl.iter_rows(
            'tokens', 'morphemes', 'cogids', 'alignment'):
        tl, gl, cl, al = (
            len(lingpy.basictypes.lists(tokens).n),
            len(glosses),
            len(cogids),
            len(lingpy.basictypes.lists(alignment).n))
        # morpheme, gloss, cognate-ID, and alignment counts must all agree
        if not (tl == gl == cl == al):
            errors.add('LENGTH: {0} {1} {2}'.format(
                idx, wl[idx, 'language'], wl[idx, 'concept']))
            blacklist.add(idx)
        for token in tokens:
            if bipa[token].type == 'unknownsound':
                errors.add('SOUND: {0}'.format(token))
                blacklist.add(idx)

    visited = set()
    for idx in wl:
        if wl[idx, 'concept'] not in concepts:
            if wl[idx, 'concept'] not in visited:
                args.log.warn('Missing concept {0}'.format(wl[idx, 'concept']))
                visited.add(wl[idx, 'concept'])
                errors.add('CONCEPT {0}'.format(wl[idx, 'concept']))
        elif wl[idx, 'doculect'] not in languages:
            if wl[idx, 'doculect'] not in visited:
                args.log.warn(
                    'Missing language {0}'.format(wl[idx, 'doculect']))
                visited.add(wl[idx, 'doculect'])
                errors.add('LANGUAGE {0}'.format(wl[idx, 'doculect']))
        else:
            if ''.join(wl[idx, 'tokens']).strip() and idx not in blacklist:
                lex = args.writer.add_form_with_segments(
                    Language_ID=languages[wl[idx, 'doculect']],
                    Parameter_ID=concepts[wl[idx, 'concept']],
                    Value=wl[idx, 'value'] or ''.join(wl[idx, 'tokens']),
                    Form=wl[idx, 'form'] or ''.join(wl[idx, 'tokens']),
                    Segments=wl[idx, 'tokens'],
                    Morphemes=wl[idx, 'morphemes'],
                    SimpleCognate=wl[idx, 'cogid'],
                    PartialCognates=wl[idx, 'cogids'],
                    Source=sources[wl[idx, 'doculect']],
                )
                for gloss_index, cogid in enumerate(wl[idx, 'cogids']):
                    args.writer.add_cognate(
                        lexeme=lex,
                        Cognateset_ID=cogid,
                        Segment_Slice=gloss_index + 1,
                        Alignment=alignments.get(
                            (idx, gloss_index, cogid), ''),
                        Alignment_Method='SCA')
            else:
                args.log.warn(
                    'Entry ID={0}, concept={1}, language={2} is empty'.format(
                        idx, wl[idx, 'concept'], wl[idx, 'doculect']))

    with open(self.dir.joinpath('errors.md'), 'w', encoding="utf-8") as f:
        f.write('# Error Analysis for TULED\n')
        for error in sorted(errors):
            f.write('* ' + error + '\n')
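# A small illustration (with made-up tokens) of the lingpy.basictypes idiom
# the alignment loop above relies on: `lists(...).n` splits a space-segmented
# string at the `+` morpheme boundary marker, which is what lets each partial
# cognate ID in `cogids` be matched to one morpheme.
from lingpy import basictypes

morphemes = basictypes.lists('t a k + m a ɾ a').n
for m in morphemes:
    print(str(m))  # 't a k', then 'm a ɾ a'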
import csv
from collections import defaultdict

import lingpy

# `forms` is assumed to be defined upstream: a mapping from form IDs to lists
# of (language, segmented-string) pairs; `fout` is presumably written to in
# the part of the script not shown here.

# map language IDs to names from the CLDF language table
name = {}
with open('cldf/languages.csv') as fin:
    reader = csv.reader(fin)
    for i, row in enumerate(reader):
        if i == 0:  # skip the header row
            continue
        name[row[0]] = row[1]

res = defaultdict(lambda: defaultdict(int))
with open('test.out', 'a') as fout:
    for counter, form in enumerate(forms):
        print(counter)
        if 'kṣ' not in forms[form][0][1]:
            continue
        try:
            if len(forms[form]) == 1:
                continue
            # forms[i].sort()
            m = lingpy.Multiple([x[1] for x in forms[form]])
            m.prog_align()
            strs = [['#'] + list(x.split()) + ['#']
                    for x in str(m).split('\n')]
            print(strs)
            # find a k…ṣ span in the first aligned row, allowing only gaps
            # in between
            l, r = -1, -1
            for i in range(len(strs[0])):
                if strs[0][i] == 'k':
                    l = i
                    for j in range(i + 1, len(strs[0])):
                        if strs[0][j] == 'ṣ':
                            r = j
                            break
                        elif strs[0][j] != '-':
                            l = -1
                            break
        except IndexError:
            # the original except clause is not shown; skipping the form
            # here is an assumption
            continue
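# The parsing above depends on str(m) for an aligned lingpy.Multiple printing
# one row per sequence with whitespace-separated segments; a quick
# self-contained check with lingpy's stock example words:
import lingpy

demo = lingpy.Multiple(['woldemort', 'waldemar', 'vladimir'])
demo.prog_align()
print(str(demo).split('\n'))  # one aligned row per element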
import lingpy as lp
from lingpy.algorithm import squareform

# align three Germanic words for 'woman/queen' with progressive alignment
seqs = ["kona", "kvinne", "queen"]
msa = lp.Multiple(seqs)
msa.prog_align()
print(msa)

# build a Neighbor-Joining tree from a condensed distance vector
# (10 pairwise distances for 5 languages)
languages = ['Norwegian', 'Swedish', 'Icelandic', 'Dutch', 'English']
distances = squareform(
    [0.5, 0.67, 0.8, 0.2, 0.4, 0.7, 0.6, 0.8, 0.8, 0.3])
tree = lp.neighbor(distances, languages)
print(tree)

tree = lp.Tree(tree)
print(tree.asciiArt())
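# The same distance matrix can also be clustered with UPGMA instead of
# neighbor-joining; lp.upgma takes the same arguments as lp.neighbor and
# likewise returns a Newick string.
tree2 = lp.Tree(lp.upgma(distances, languages))
print(tree2.asciiArt())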