def nexus(ds):
    # NOTE: Wordlist comes from lingpy, defaultdict from collections;
    # iter_rows and write_nexus are helpers assumed to be imported into the
    # surrounding module -- this write_nexus takes a taxon list plus a
    # ready-made matrix, unlike lingpy's wordlist-based exporter.
    commands = [
        'set autoclose=yes nowarn=yes;',
        'lset coding=noabsencesites rates=gamma;',
        'constraint root=1-.;',
        'prset brlenspr=clock:uniform;',
        'prset clockvarpr=igr igrvarpr=exp(200);',
        'prset sampleprob=0.2 samplestrat=random;',
        'prset speciationpr=exp(1);',
        'prset extinctionpr=beta(1,1);',
        'mcmcp ngen=2000000 printfreq=10000 samplefreq=2000 nruns=2'
        ' nchains=4 savebrlens=yes filename=chinese-hou;',
        'mcmc;',
        'sumt;',
        'sump;'
    ]
    wl = Wordlist(ds.raw('Hou-2004-180-lexemes.tsv'))

    # One character per cognate-set/character pair; count how often each
    # doculect attests it.
    chars = defaultdict(lambda: defaultdict(int))
    for k, d, cid, css in iter_rows(wl, 'doculect', 'cognate_sets', 'characters'):
        for c1, c2 in zip(cid.split(' '), css):
            chars[c1 + ':' + c2][d] += 1
    all_chars = sorted(chars, key=lambda x: sum(chars[x].values()))

    # Binary presence/absence matrix (taxa x characters).
    matrix = []
    for t in wl.taxa:
        matrix += [[]]
        for char in all_chars:
            if t in chars[char]:
                matrix[-1] += ['1']
            else:
                matrix[-1] += ['0']
    write_nexus(
        [t.replace("'", '_') for t in wl.taxa], matrix,
        commands=commands, filename=ds.raw('chinese-hou.nex'))

    # Same analysis, but with lexeme-based characters (paps).
    commands = [
        'set autoclose=yes nowarn=yes;',
        'lset coding=noabsencesites rates=gamma;',
        'constraint root=1-.;',
        'prset brlenspr=clock:uniform;',
        'prset clockvarpr=igr igrvarpr=exp(200);',
        'prset sampleprob=0.2 samplestrat=random;',
        'prset speciationpr=exp(1);',
        'prset extinctionpr=beta(1,1);',
        'mcmcp ngen=2000000 printfreq=10000 samplefreq=2000 nruns=2'
        ' nchains=4 savebrlens=yes filename=chinese-hou-lex;',
        'mcmc;',
        'sumt;',
        'sump;'
    ]
    paps = wl.get_paps(ref='characters', entry='concept', missing='?')
    matrix = []
    for i, t in enumerate(wl.taxa):
        matrix += [[]]
        for p in paps:
            matrix[-1] += [str(paps[p][i])]
    write_nexus(
        [t.replace("'", '_') for t in wl.taxa], matrix,
        commands=commands, filename=ds.raw('chinese-hou-lstat.nex'))
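# The write_nexus used above takes (taxa, matrix) directly. A minimal, purely
# illustrative sketch of such a writer -- not the helper actually used by this
# dataset -- would be:

def write_nexus_sketch(taxa, matrix, commands=None, filename='out.nex'):
    """Write a plain NEXUS DATA block plus an optional MRBAYES block."""
    lines = [
        '#NEXUS', '',
        'BEGIN DATA;',
        'DIMENSIONS NTAX={0} NCHAR={1};'.format(len(taxa), len(matrix[0])),
        'FORMAT DATATYPE=STANDARD GAP=- MISSING=? SYMBOLS="01";',
        'MATRIX',
    ]
    for taxon, row in zip(taxa, matrix):
        lines.append('{0:20} {1}'.format(taxon, ''.join(row)))
    lines += [';', 'END;']
    if commands:
        lines += ['', 'BEGIN MRBAYES;'] + commands + ['END;']
    with open(filename, 'w', encoding='utf8') as handle:
        handle.write('\n'.join(lines) + '\n')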
def run(args):
    ds = lexibank_lieberherrkhobwa.Dataset(args)
    wl = lingpy.Wordlist.from_cldf(
        ds.cldf_dir / "cldf-metadata.json",
        col="language_id",
        row="parameter_id")
    wl.add_entries("cogid", "cognacy", lambda x: int(x), override=False)
    write_nexus(
        wl, mode="SPLITSTREE",
        filename=OUT_PATH + "lieberherrkhobwa-splitstree.nex")
    write_nexus(
        wl, mode="BEAST",
        filename=OUT_PATH + "lieberherrkhobwa-beast.nex")
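# write_nexus here is lingpy's exporter (lingpy.convert.strings.write_nexus),
# which codes binary characters off the wordlist's cogid column. The same two
# calls on a self-contained toy wordlist (forms and cognate ids invented for
# illustration):
from lingpy import Wordlist
from lingpy.convert.strings import write_nexus

D = {
    0: ['doculect', 'concept', 'ipa', 'cogid'],
    1: ['German', 'hand', 'hant', 1],
    2: ['English', 'hand', 'hænd', 1],
    3: ['German', 'woman', 'frau', 2],
    4: ['English', 'woman', 'wʊmən', 3],
}
toy = Wordlist(D)
write_nexus(toy, mode='SPLITSTREE', filename='toy-splitstree.nex')
write_nexus(toy, mode='BEAST', filename='toy-beast.nex')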
def test_beastwords(self):
    # Use missing="X" parameter to avoid \? in the assertRegex calls below
    nex = write_nexus(self.wordlist, mode='BEASTWORDS', missing="X",
                      filename=text_type(self.tmp_path('test')))
    # added three characters for ascertainment
    self.assertIn("NTAX=5 NCHAR=10", nex)
    # beast output should have datatype=standard
    self.assertIn("DATATYPE=STANDARD", nex)
    # check charblock:
    self.assertRegexWorkaround(nex, r"1 I_ascertainment,")
    self.assertRegexWorkaround(nex, r"2 I,")
    self.assertRegexWorkaround(nex, r"3 all_ascertainment,")
    self.assertRegexWorkaround(nex, r"4 all,")
    self.assertRegexWorkaround(nex, r"5 all,")
    self.assertRegexWorkaround(nex, r"6 all,")
    self.assertRegexWorkaround(nex, r"7 ash_ascertainment,")
    self.assertRegexWorkaround(nex, r"8 ash,")
    self.assertRegexWorkaround(nex, r"9 ash,")
    self.assertRegexWorkaround(nex, r"10 ash")
    # check data:
    self.assertRegexWorkaround(nex, r"German\s+0101000100")
    self.assertRegexWorkaround(nex, r"English\s+010100XXXX")
    self.assertRegexWorkaround(nex, r"Swedish\s+0100100010")
    self.assertRegexWorkaround(nex, r"Icelandic\s+010001XXXX")
    self.assertRegexWorkaround(nex, r"Norwegian\s+0100010001")
    # assumptions block
    self.assertRegexWorkaround(nex, r"charset I = 1\-2;")
    self.assertRegexWorkaround(nex, r"charset all = 3\-6;")
    self.assertRegexWorkaround(nex, r"charset ash = 7\-10;")
def test_beast(self):
    # Use missing="X" parameter to avoid \? in the assertRegex calls below
    nex = write_nexus(self.wordlist, mode='BEAST', missing="X",
                      filename=text_type(self.tmp_path('test')))
    # added one character for ascertainment
    self.assertIn("NTAX=5 NCHAR=8", nex)
    # beast output should have datatype=standard
    self.assertIn("DATATYPE=STANDARD", nex)
    # check charblock:
    self.assertRegexWorkaround(nex, r"1 _ascertainment,")
    self.assertRegexWorkaround(nex, r"2 I,")
    self.assertRegexWorkaround(nex, r"3 all,")
    self.assertRegexWorkaround(nex, r"4 all,")
    self.assertRegexWorkaround(nex, r"5 all,")
    self.assertRegexWorkaround(nex, r"6 ash,")
    self.assertRegexWorkaround(nex, r"7 ash,")
    self.assertRegexWorkaround(nex, r"8 ash")
    # check data:
    self.assertRegexWorkaround(nex, r"German\s+01100100")
    self.assertRegexWorkaround(nex, r"English\s+01100XXX")
    self.assertRegexWorkaround(nex, r"Swedish\s+01010010")
    self.assertRegexWorkaround(nex, r"Icelandic\s+01001XXX")
    self.assertRegexWorkaround(nex, r"Norwegian\s+01001001")
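# The BEAST modes prepend all-zero "ascertainment" characters -- one per
# concept charset in BEASTWORDS, a single global one in BEAST -- so that
# downstream tools can correct for never observing all-absent cognate sets.
# That is why the two tests above expect NCHAR=10 (7 + 3) and NCHAR=8 (7 + 1).
# A quick way to inspect the labelled columns on any wordlist (here `wl`,
# assumed to be defined):
nex = write_nexus(wl, mode='BEASTWORDS', missing='X', filename='check.nex')
print(nex[:nex.index('MATRIX')])  # header including the *_ascertainment labels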
def run(args):
    # use lingpy's nexus template for splitstree:
    _template = pathlib.Path(
        template_path('splitstree.nex')).read_text(encoding='utf-8')

    ds = Dataset(args)
    structure_dataset = ds.cldf_reader('structure')
    existing_taxa = set(
        row['Language_ID'] for row in structure_dataset['ValueTable'])

    idx = 0
    taxa = {}
    for t in structure_dataset['LanguageTable']:
        if t['ID'] in existing_taxa:
            taxa[t['ID']] = (idx, t['Name'])
            idx += 1
    params = {
        t['ID']: (i, t['Name'])
        for i, t in enumerate(structure_dataset['ParameterTable'])
    }

    matrix = [[0 for p in params] for t in taxa]
    for row in structure_dataset['ValueTable']:
        tidx, tname = taxa[row['Language_ID']]
        pidx, pname = params[row['Parameter_ID']]
        if row['Value'] == '+':
            matrix[tidx][pidx] = 1

    alpha = string.ascii_letters + string.digits
    matrix_string = ''
    tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
    for i, line in enumerate(matrix):
        matrix_string += '{0:12}'.format(''.join(
            [x for x in tax_list[i][1] if x in alpha])[:11])
        matrix_string += ''.join([str(x) for x in line]) + '\n'
    pathlib.Path('chinese-structure.nex').write_text(
        _template.format(
            matrix=matrix_string,
            ntax=len(tax_list),
            dtype='STANDARD',
            nchar=len(params),
            gap='-',
            missing='?'),
        encoding='utf8')

    lex = LexStat.from_cldf(str(ds.cldf_specs()[None].metadata_path))
    lex.cluster(method='sca', threshold=0.45, ref='cogid')
    write_nexus(lex, mode='splitstree', filename='chinese-lexemes.nex')
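# The splitstree.nex template is filled via str.format with exactly the
# placeholders passed above: {matrix}, {ntax}, {nchar}, {dtype}, {gap} and
# {missing}. A stripped-down stand-in template (lingpy's real one is more
# elaborate) makes the mechanics visible:
_toy_template = """#NEXUS
BEGIN DATA;
DIMENSIONS NTAX={ntax} NCHAR={nchar};
FORMAT DATATYPE={dtype} GAP={gap} MISSING={missing};
MATRIX
{matrix};
END;
"""
print(_toy_template.format(
    matrix='LanguageA   101\nLanguageB   110\n',
    ntax=2, nchar=3, dtype='STANDARD', gap='-', missing='?'))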
def test_merge_custom_statements(self):
    # this tests for the bug in https://github.com/lingpy/lingpy/issues/340
    import re
    nex = write_nexus(self.wordlist, mode='mrbayes', commands=['test'])
    if len(re.findall(r"BEGIN MRBAYES;", nex, flags=re.IGNORECASE)) == 2:
        raise AssertionError('Duplicate mrbayes block found')
    self.assertRegexWorkaround(nex, r"charset I = 1\-1;")
    self.assertRegexWorkaround(nex, r"charset all = 2\-4;")
    self.assertRegexWorkaround(nex, r"charset ash = 5\-7;")
    self.assertRegexWorkaround(nex, r"test")
def test_splitstree(self):
    # Use missing="X" parameter to avoid \? in the assertRegex calls below
    nex = write_nexus(self.wordlist, mode='SPLITSTREE', missing="X",
                      filename=text_type(self.tmp_path('test')))
    self.assertIn("NTAX=5 NCHAR=7", nex)
    # splitstree should have datatype=standard
    self.assertIn("DATATYPE=STANDARD", nex)
    # NO charblock
    assert 'charset' not in nex
    assert 'ASSUMPTIONS' not in nex
    # NO symbols
    assert 'SYMBOLS' not in nex
    # check data:
    self.assertRegexWorkaround(nex, r"German\s+1100100")
    self.assertRegexWorkaround(nex, r"English\s+1100XXX")
    self.assertRegexWorkaround(nex, r"Swedish\s+1010010")
    self.assertRegexWorkaround(nex, r"Icelandic\s+1001XXX")
    self.assertRegexWorkaround(nex, r"Norwegian\s+1001001")
def test_traitlab(self):
    nex = write_nexus(self.wordlist, mode='traitlab', missing="X",
                      filename=text_type(self.tmp_path('test')))
    # we should lose the FIRST character
    self.assertIn("NTAX=5 NCHAR=6", nex)
    # traitlab output should have datatype=standard
    self.assertIn("DATATYPE=STANDARD", nex)
    # NO charblock
    assert 'charset' not in nex
    assert 'ASSUMPTIONS' not in nex
    # NO symbols
    assert 'SYMBOLS' not in nex
    # check data:
    self.assertRegexWorkaround(nex, r"German\s+100100")
    self.assertRegexWorkaround(nex, r"English\s+100XXX")
    self.assertRegexWorkaround(nex, r"Swedish\s+010010")
    self.assertRegexWorkaround(nex, r"Icelandic\s+001XXX")
    self.assertRegexWorkaround(nex, r"Norwegian\s+001001")
def test_mrbayes(self):
    # Use missing="X" parameter to avoid \? in the assertRegex calls below
    nex = write_nexus(self.wordlist, mode='MRBAYES', missing="X",
                      filename=text_type(self.tmp_path('test')))
    self.assertIn("NTAX=5 NCHAR=7", nex)
    # mrbayes should have datatype=restriction
    self.assertIn("DATATYPE=RESTRICTION", nex)
    # check charblock:
    self.assertRegexWorkaround(nex, r"charset I = 1\-1;")
    self.assertRegexWorkaround(nex, r"charset all = 2\-4;")
    self.assertRegexWorkaround(nex, r"charset ash = 5\-7;")
    # check data:
    self.assertRegexWorkaround(nex, r"German\s+1100100")
    self.assertRegexWorkaround(nex, r"English\s+1100XXX")
    self.assertRegexWorkaround(nex, r"Swedish\s+1010010")
    self.assertRegexWorkaround(nex, r"Icelandic\s+1001XXX")
    self.assertRegexWorkaround(nex, r"Norwegian\s+1001001")
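# Taken together, the assertions above pin down the shape of the MRBAYES
# export; reconstructed from the checks (not copied from lingpy's actual
# output), the file looks roughly like:
EXPECTED_MRBAYES_SHAPE = """#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=5 NCHAR=7;
FORMAT DATATYPE=RESTRICTION GAP=- MISSING=X;
MATRIX
German     1100100
English    1100XXX
Swedish    1010010
Icelandic  1001XXX
Norwegian  1001001
;
END;
BEGIN MRBAYES;
charset I = 1-1;
charset all = 2-4;
charset ash = 5-7;
END;
"""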
def make_nexus(filename, exclude_borrowings=True, concept_rank=226):
    wl = load_sinotibetan(tsv=True)

    # Re-assign fresh cognate ids to borrowings (if excluded) and to entries
    # without a cognate id, so they cannot cluster with anything else.
    if exclude_borrowings:
        ncog = max([int(wl[k, 'cogid']) for k in wl]) + 1
        for k in wl:
            if wl[k, 'borrowing'].strip():
                wl[k][wl.header['cogid']] = str(ncog)
                ncog += 1
            elif wl[k, 'cogid'] in ['0', '']:
                wl[k][wl.header['cogid']] = str(ncog)
                ncog += 1

    concepts_ = [
        k for k, v in stdb_concepts().items() if int(v['rank']) < concept_rank]

    cogids = defaultdict(lambda: defaultdict(list))
    cogid2concept = {}
    wl.add_entries('paps', 'concept,cogid', lambda x, y: x[y[0]] + ':' + x[y[1]])
    uncertainties = defaultdict(list)
    # note: `cogid` below holds the concept:cogid 'paps' string
    for k, doculect, cogid, concept, borrowing in iter_rows(
            wl, 'doculect', 'paps', 'concept', 'borrowing'):
        if borrowing.strip() and not exclude_borrowings:
            uncertainties[cogid].append(doculect)
        cogids[concept][cogid].append(doculect)
        cogid2concept[cogid] = concept

    # One charset per concept; characters are numbered consecutively.
    blocks = []
    concepts = sorted(cogids)
    characters = {}
    ccount = 1
    cstrings = []
    for concept in [c for c in concepts if c in concepts_]:
        cstring = '_'.join([slug(c) for c in concept.split(' ')])
        blocks += ['charset ' + cstring + ' = ']
        cstrings += [cstring]
        start = ccount
        for cogid in sorted(cogids[concept]):
            characters[cogid] = ccount
            ccount += 1
        blocks[-1] += '{0}-{1};'.format(start, ccount - 1)

    matrix = []
    print(len(characters), ccount, len(cstrings))
    for taxon in wl.taxa:
        tcids_ = wl.get_list(doculect=taxon, entry='paps', flat=True)
        tcons_ = wl.get_list(doculect=taxon, entry='concept', flat=True)
        # transform data, only take things with the same concept, so we check
        # for each datapoint, whether we find it
        tcids, tcons = [], []
        for a, b in zip(tcids_, tcons_):
            if b in concepts:
                tcids += [a]
                tcons += [b]
        matrix += [[]]
        for cogid, idx in sorted(characters.items(), key=lambda x: x[1]):
            if cogid not in tcids:
                concept = cogid2concept[cogid]
                if concept in tcons:
                    matrix[-1] += ['0']
                else:
                    matrix[-1] += ['?']
            else:
                if taxon in uncertainties[cogid]:
                    matrix[-1] += ['10']
                else:
                    matrix[-1] += ['1']

    partition = 'partition favored = {0}: {1}'.format(
        len(blocks), ', '.join(cstrings) + ';')
    commands = [
        'set autoclose=yes nowarn=yes;',
        'lset coding=noabsencesites rates=gamma;'
    ] + blocks + [partition] + [
        'taxset fossils = Old_Chinese Old_Tibetan Old_Burmese;',
        'constraint root = 1-.;',
        # 'prest clockratepr = normal(1E-5,1);',
        'calibrate Old_Chinese = uniform(2200, 3000);',
        'calibrate Old_Tibetan = fixed(1200);',
        'calibrate Old_Burmese = fixed(800);'
    ] + [
        # 'taxset problematic = Naxi Pumi_Lanping Qiang_Mawo Xumi Lyuzu Bai_Jianchuan Tujia;',
        # 'delete problematic;',
        'prset clockratepr=exponential(3e5);',
        'prset treeagepr=uniform(4000,20000);',
        'prset sampleprob=0.2 samplestrat=random speciationpr=exp(1);',
        'prset extinctionpr=beta(1,1) nodeagepr=calibrated;',
        'prset brlenspr=clock:fossilization clockvarpr=igr;',
        'mcmcp ngen=10000000 printfreq=10000 samplefreq=2500 nruns=2 '
        'nchains=4 savebrlens=yes filename={0};'.format(filename)]
    print(filename)
    write_nexus(taxa=wl.taxa, matrix=matrix, commands=commands,
                filename=filename + '.nex')
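# The charset/partition bookkeeping above is easiest to see in isolation.
# A toy run with two concepts ("hand": 2 cognate sets, "eye": 3) -- names
# invented for illustration -- reproduces the strings the function builds:
toy_cogids = {'hand': ['h1', 'h2'], 'eye': ['e1', 'e2', 'e3']}
toy_blocks, toy_names, ccount = [], [], 1
for concept in sorted(toy_cogids):
    start = ccount
    ccount += len(toy_cogids[concept])
    toy_blocks.append('charset {0} = {1}-{2};'.format(concept, start, ccount - 1))
    toy_names.append(concept)
print('\n'.join(toy_blocks))
print('partition favored = {0}: {1};'.format(
    len(toy_blocks), ', '.join(toy_names)))
# -> charset eye = 1-3;
#    charset hand = 4-5;
#    partition favored = 2: eye, hand;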
    ('segments', 'tokens'),
    ('cogid_cognateset_id', 'cogid'),
))
taxa = [t for t in wl_.cols if not t.startswith('Proto')] + ['ProtoNahua']
D = {0: wl_.columns}
for idx in wl_:
    if wl_[idx, 'doculect'] in taxa:
        D[idx] = wl_[idx]
wl = Wordlist(D)

table = []
for t1, t2 in combinations(taxa, r=2):
    cog1 = wl.get_list(col=t1, entry='cogid', flat=True)
    cog2 = wl.get_list(col=t2, entry='cogid', flat=True)
    table += [[t1, t2, len([c for c in cog1 if c in cog2])]]
print(tabulate(table, tablefmt='pipe'))

wl.calculate('distances', ref='cogid')
wl.output('dst', filename='distances')
wl.calculate('tre', tree_calc='neighbor')
wl.output('tre', filename='tree')
wl.output('tsv', filename='wordlist', ignore='all', prettify=False)
write_nexus(wl, mode='splitstree', filename='coracholaztecan.nex')

alms = Alignments(wl_, transcription='form', ref='cogid')
alms.align()
alms.output('tsv', filename='wordlist', ignore='all', prettify=False)
from lingpy.convert.strings import write_nexus
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree

# Load the necessary data
part = Partial.from_cldf('cldf/cldf-metadata.json')

# Compute cognate sets according to SCA and calculate the distance matrix
part.partial_cluster(method='sca', threshold=0.45, ref="cogids",
                     cluster_method="upgma")
part.add_cognate_ids('cogids', 'cogid', idtype='strict')
part.calculate('tree', ref='cogid', tree_calc='upgma')

out = write_nexus(part, mode='splitstree', filename='distance_matrix.nex')
part.output('dst', filename='distance_matrix')

plot_tree(str(part.tree))
print(part.tree.asciiArt())

# Compute cognate sets according to LexStat and calculate the distance matrix
# part.get_partial_scorer(runs=1000)
# part.partial_cluster(method='lexstat', threshold=0.55,
#                      cluster_method='upgma', ref="lexstatids")
# part.add_cognate_ids('lexstatids', 'lexstatid', idtype='strict')
# part.calculate('tree', ref='lexstatid', tree_calc='upgma', force=True)
# part.output('dst', filename='distance_matrix')
# plot_tree(str(part.tree))
# print(part.tree.asciiArt())
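# A quick way to sanity-check the exported file is to parse it back in with
# the python-nexus package (an extra dependency, not used elsewhere here;
# depending on the installed version, NexusReader('distance_matrix.nex')
# works as well):
from nexus import NexusReader

n = NexusReader.from_file('distance_matrix.nex')
print(n.data.ntaxa, n.data.nchar)  # taxa and binary characters in the export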
Spanish_1818	LSE_1818
Spanish_1845	LSE_1845
Spanish_1859	LSE_1859
Swedish_1866	STS_1866"""
converter = dict([line.split('\t') for line in data.split('\n')])

lex = LexiBase.from_dbase(
    'signalphabets',
    dbase='signalphabets.sqlite3',
    # url='signalphabets.sqlite3'
)
abbrs = csv2list('signlgs.txt', sep=", ")
count = 0
N = {0: lex.columns}
for idx, concept, tokens in lex.iter_rows('concept', 'tokens'):
    tokens = basictypes.lists(tokens)
    N[idx] = lex[idx]
    N[idx][lex.header['doculect']] = unidecode(converter.get(
        lex[idx, 'doculect'], lex[idx, 'doculect']))
lex = Wordlist(N)
write_nexus(lex, mode='splitstree', ref='cogid', filename='signs-cogid.nex')
def f(**kw):
    # Use missing="X" parameter to avoid \? in the assertRegex calls below
    return write_nexus(wordlist, missing="X",
                       filename=str(tmppath / 'test'), **kw)
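# These pytest-style tests assume `wordlist` and `tmppath` fixtures. lingpy's
# suite builds the wordlist from bundled test data; a minimal stand-in that is
# consistent with the assertions in the unittest variants above (five Germanic
# doculects, concepts "I", "all", "ash"; forms invented) would be:
import pytest
from lingpy import Wordlist

@pytest.fixture
def wordlist():
    D = {
        0: ['doculect', 'concept', 'ipa', 'cogid'],
        1: ['German', 'I', 'ɪç', 1],
        2: ['English', 'I', 'aɪ', 1],
        3: ['Swedish', 'I', 'jɑːɡ', 1],
        4: ['Icelandic', 'I', 'ɛːɣ', 1],
        5: ['Norwegian', 'I', 'jæɪ', 1],
        6: ['German', 'all', 'alə', 2],
        7: ['English', 'all', 'ɔːl', 2],
        8: ['Swedish', 'all', 'ala', 3],
        9: ['Icelandic', 'all', 'adlir', 4],
        10: ['Norwegian', 'all', 'ɑlə', 4],
        11: ['German', 'ash', 'aʃə', 5],
        12: ['Swedish', 'ash', 'aska', 6],
        13: ['Norwegian', 'ash', 'ɑskə', 7],
    }
    return Wordlist(D)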
def test_error_on_unknown_mode(wordlist):
    with pytest.raises(ValueError):
        write_nexus(wordlist, mode='xx')
method = 'upgma'
threshold = 0.6
table = []
for f in tqdm.tqdm(sorted(glob('data/test2/*.csv'))):
    lex = LexStat(f)
    lex.get_scorer(runs=10000)
    lex.cluster(method='lexstat', cluster_method=method, ref='autocog',
                threshold=threshold)
    lex.add_entries('cogidn', 'cogid,concept',
                    lambda x, y: str(x[y[0]]) + '-' + x[y[1]])
    lex.renumber('cogidn')
    if 'nex' in argv:
        write_nexus(lex, ref='autocog',
                    filename='nexus/' + method + f.split('/')[-1][4:-4] + '.nex')
    p, r, fs = bcubes(lex, 'cogidnid', 'autocog', pprint=False)
    table += [[f[16:-4], round(p, 2), round(r, 2), round(fs, 4)]]
# averages below assume five input files
table += [[
    'total',
    round(sum([line[1] for line in table]) / 5, 4),
    round(sum([line[2] for line in table]) / 5, 4),
    round(sum([line[3] for line in table]) / 5, 4)
]]
print(tabulate(table, tablefmt='latex', headers=['data', 'p', 'r', 'fs']))

if 'sca2' in argv:
    if 'infomap' in argv:
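
# bcubes (from lingpy.evaluate.acd) returns B-cubed precision, recall and
# F-score for the inferred clustering ('autocog') against the gold standard
# ('cogidnid'). The measures average, over all words, the proportion of the
# word's inferred (resp. gold) cluster that is correct. A minimal sketch of
# the idea -- not lingpy's implementation:

def bcubed_precision(gold, inferred):
    """gold, inferred: lists assigning a cluster label to each item."""
    score = 0.0
    for i, ci in enumerate(inferred):
        cluster = [j for j, cj in enumerate(inferred) if cj == ci]
        correct = [j for j in cluster if gold[j] == gold[i]]
        score += len(correct) / len(cluster)
    return score / len(inferred)

# recall is the same computation with gold and inferred swapped;
# the F-score is their harmonic mean.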
def test_error_on_unknown_mode(self):
    with self.assertRaises(ValueError):
        write_nexus(self.wordlist, mode='xx')
def test_error_on_unknown_ref(self):
    with self.assertRaises(KeyError):
        write_nexus(self.wordlist, mode='mrbayes', ref='magic')