Example #1
def nexus(ds):
    commands = [
        'set autoclose=yes nowarn=yes;',
        'lset coding=noabsencesites rates=gamma;', 'constraint root=1-.;',
        'prset brlenspr=clock:uniform;',
        'prset clockvarpr=igr igrvarpr=exp(200);',
        'prset sampleprob=0.2 samplestrat=random;',
        'prset speciationpr=exp(1);', 'prset extinctionpr=beta(1,1);',
        'mcmcp ngen=2000000 printfreq=10000 samplefreq=2000 nruns=2'
        ' nchains=4 savebrlens=yes filename=chinese-hou;', 'mcmc;', 'sumt;',
        'sump;'
    ]
    wl = Wordlist(ds.raw('Hou-2004-180-lexemes.tsv'))
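    # count, per doculect, each cognate-set:character pair as a presence/absence character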
    chars = defaultdict(lambda: defaultdict(int))
    for k, d, cid, css in iter_rows(wl, 'doculect', 'cognate_sets',
                                    'characters'):
        for c1, c2 in zip(cid.split(' '), css):
            chars[c1 + ':' + c2][d] += 1
    all_chars = sorted(chars, key=lambda x: sum(chars[x].values()))
    matrix = []
    for t in wl.taxa:
        matrix += [[]]
        for char in all_chars:
            if t in chars[char]:
                matrix[-1] += ['1']
            else:
                matrix[-1] += ['0']
    write_nexus([t.replace("'", '_') for t in wl.taxa],
                matrix,
                commands=commands,
                filename=ds.raw('chinese-hou.nex'))

    commands = [
        'set autoclose=yes nowarn=yes;',
        'lset coding=noabsencesites rates=gamma;', 'constraint root=1-.;',
        'prset brlenspr=clock:uniform;',
        'prset clockvarpr=igr igrvarpr=exp(200);',
        'prset sampleprob=0.2 samplestrat=random;',
        'prset speciationpr=exp(1);', 'prset extinctionpr=beta(1,1);',
        'mcmcp ngen=2000000 printfreq=10000 samplefreq=2000 nruns=2'
        ' nchains=4 savebrlens=yes filename=chinese-hou-lex;', 'mcmc;',
        'sumt;', 'sump;'
    ]
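    # second matrix: presence/absence patterns (paps) of the 'characters' cognate sets per concept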
    paps = wl.get_paps(ref='characters', entry='concept', missing='?')
    matrix = []
    for i, t in enumerate(wl.taxa):
        matrix += [[]]
        for p in paps:
            matrix[-1] += [str(paps[p][i])]
    write_nexus([t.replace("'", '_') for t in wl.taxa],
                matrix,
                commands=commands,
                filename=ds.raw('chinese-hou-lstat.nex'))
Example #2
def run(args):
    ds = lexibank_lieberherrkhobwa.Dataset(args)
    wl = lingpy.Wordlist.from_cldf(ds.cldf_dir / "cldf-metadata.json",
                                   col="language_id",
                                   row="parameter_id")

    wl.add_entries("cogid", "cognacy", lambda x: int(x), override=False)

    write_nexus(wl,
                mode="SPLITSTREE",
                filename=OUT_PATH + "lieberherrkhobwa-splitstree.nex")
    write_nexus(wl,
                mode="BEAST",
                filename=OUT_PATH + "lieberherrkhobwa-beast.nex")
Example #3
    def test_beastwords(self):
        # Use missing="X" parameter to avoid \? in the assertRegex calls below
        nex = write_nexus(self.wordlist,
                          mode='BEASTWORDS',
                          missing="X",
                          filename=text_type(self.tmp_path('test')))
        # added three characters for ascertainment
        self.assertIn("NTAX=5 NCHAR=10", nex)
        # beastwords output should have datatype=standard
        self.assertIn("DATATYPE=STANDARD", nex)

        # check charblock:
        self.assertRegexWorkaround(nex, r"1 I_ascertainment,")
        self.assertRegexWorkaround(nex, r"2 I,")
        self.assertRegexWorkaround(nex, r"3 all_ascertainment,")
        self.assertRegexWorkaround(nex, r"4 all,")
        self.assertRegexWorkaround(nex, r"5 all,")
        self.assertRegexWorkaround(nex, r"6 all,")
        self.assertRegexWorkaround(nex, r"7 ash_ascertainment,")
        self.assertRegexWorkaround(nex, r"8 ash,")
        self.assertRegexWorkaround(nex, r"9 ash,")
        self.assertRegexWorkaround(nex, r"10 ash")

        # check data:
        self.assertRegexWorkaround(nex, r"German\s+0101000100")
        self.assertRegexWorkaround(nex, r"English\s+010100XXXX")
        self.assertRegexWorkaround(nex, r"Swedish\s+0100100010")
        self.assertRegexWorkaround(nex, r"Icelandic\s+010001XXXX")
        self.assertRegexWorkaround(nex, r"Norwegian\s+0100010001")

        # assumptions block
        self.assertRegexWorkaround(nex, r"charset I = 1\-2;")
        self.assertRegexWorkaround(nex, r"charset all = 3\-6;")
        self.assertRegexWorkaround(nex, r"charset ash = 7\-10;")
Example #4
    def test_beast(self):
        # Use missing="X" parameter to avoid \? in the assertRegex calls below
        nex = write_nexus(self.wordlist,
                          mode='BEAST',
                          missing="X",
                          filename=text_type(self.tmp_path('test')))

        # added one character for ascertainment
        self.assertIn("NTAX=5 NCHAR=8", nex)

        # beast output should have datatype=standard
        self.assertIn("DATATYPE=STANDARD", nex)

        # check charblock:
        self.assertRegexWorkaround(nex, r"1 _ascertainment,")
        self.assertRegexWorkaround(nex, r"2 I,")
        self.assertRegexWorkaround(nex, r"3 all,")
        self.assertRegexWorkaround(nex, r"4 all,")
        self.assertRegexWorkaround(nex, r"5 all,")
        self.assertRegexWorkaround(nex, r"6 ash,")
        self.assertRegexWorkaround(nex, r"7 ash,")
        self.assertRegexWorkaround(nex, r"8 ash")

        # check data:
        self.assertRegexWorkaround(nex, r"German\s+01100100")
        self.assertRegexWorkaround(nex, r"English\s+01100XXX")
        self.assertRegexWorkaround(nex, r"Swedish\s+01010010")
        self.assertRegexWorkaround(nex, r"Icelandic\s+01001XXX")
        self.assertRegexWorkaround(nex, r"Norwegian\s+01001001")
Example #5
def run(args):
    # use lingpy's nexus template for splitstree:
    _template = pathlib.Path(
        template_path('splitstree.nex')).read_text(encoding='utf-8')

    ds = Dataset(args)
    structure_dataset = ds.cldf_reader('structure')
    existing_taxa = set(row['Language_ID']
                        for row in structure_dataset['ValueTable'])
    idx = 0
    taxa = {}
    for t in structure_dataset['LanguageTable']:
        if t['ID'] in existing_taxa:
            taxa[t['ID']] = (idx, t['Name'])
            idx += 1
    params = {
        t['ID']: (i, t['Name'])
        for i, t in enumerate(structure_dataset['ParameterTable'])
    }

    matrix = [[0 for p in params] for t in taxa]
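    # mark a feature as present (1) when the value table codes it as '+'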
    for row in structure_dataset['ValueTable']:
        tidx, tname = taxa[row['Language_ID']]
        pidx, pname = params[row['Parameter_ID']]
        if row['Value'] == '+':
            matrix[tidx][pidx] = 1

    alpha = string.ascii_letters + string.digits
    matrix_string = ''
    tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
    for i, line in enumerate(matrix):
        matrix_string += '{0:12}'.format(''.join(
            [x for x in tax_list[i][1] if x in alpha])[:11])
        matrix_string += ''.join([str(x) for x in line]) + '\n'

    pathlib.Path('chinese-structure.nex').write_text(_template.format(
        matrix=matrix_string,
        ntax=len(tax_list),
        dtype='STANDARD',
        nchar=len(params),
        gap='-',
        missing='?'),
                                                     encoding='utf8')

    lex = LexStat.from_cldf(str(ds.cldf_specs()[None].metadata_path))
    lex.cluster(method='sca', threshold=0.45, ref='cogid')
    write_nexus(lex, mode='splitstree', filename='chinese-lexemes.nex')
Example #6
    def test_merge_custom_statements(self):
        # this tests for the bug in https://github.com/lingpy/lingpy/issues/340
        import re
        nex = write_nexus(self.wordlist, mode='mrbayes', commands=['test'])
        if len(re.findall(r"BEGIN MRBAYES;", nex, flags=re.IGNORECASE)) == 2:
            raise AssertionError('Duplicate mrbayes block found')

        self.assertRegexWorkaround(nex, r"charset I = 1\-1;")
        self.assertRegexWorkaround(nex, r"charset all = 2\-4;")
        self.assertRegexWorkaround(nex, r"charset ash = 5\-7;")
        self.assertRegexWorkaround(nex, r"test")
Example #7
    def test_splitstree(self):
        # Use missing="X" parameter to avoid \? in the assertRegex calls below
        nex = write_nexus(self.wordlist,
                          mode='SPLITSTREE',
                          missing="X",
                          filename=text_type(self.tmp_path('test')))
        self.assertIn("NTAX=5 NCHAR=7", nex)
        # splitstree should have datatype=standard
        self.assertIn("DATATYPE=STANDARD", nex)
        # NO charblock
        assert 'charset' not in nex
        assert 'ASSUMPTIONS' not in nex
        # NO symbols
        assert 'SYMBOLS' not in nex
        # check data:
        self.assertRegexWorkaround(nex, r"German\s+1100100")
        self.assertRegexWorkaround(nex, r"English\s+1100XXX")
        self.assertRegexWorkaround(nex, r"Swedish\s+1010010")
        self.assertRegexWorkaround(nex, r"Icelandic\s+1001XXX")
        self.assertRegexWorkaround(nex, r"Norwegian\s+1001001")
Example #8
    def test_traitlab(self):
        nex = write_nexus(self.wordlist,
                          mode='traitlab',
                          missing="X",
                          filename=text_type(self.tmp_path('test')))
        # we should lose the FIRST character
        self.assertIn("NTAX=5 NCHAR=6", nex)
        # traitlab should have datatype=standard
        self.assertIn("DATATYPE=STANDARD", nex)
        # NO charblock
        assert 'charset' not in nex
        assert 'ASSUMPTIONS' not in nex
        # NO symbols
        assert 'SYMBOLS' not in nex
        # check data:
        self.assertRegexWorkaround(nex, r"German\s+100100")
        self.assertRegexWorkaround(nex, r"English\s+100XXX")
        self.assertRegexWorkaround(nex, r"Swedish\s+010010")
        self.assertRegexWorkaround(nex, r"Icelandic\s+001XXX")
        self.assertRegexWorkaround(nex, r"Norwegian\s+001001")
Example #9
    def test_mrbayes(self):
        # Use missing="X" parameter to avoid \? in the assertRegex calls below
        nex = write_nexus(self.wordlist,
                          mode='MRBAYES',
                          missing="X",
                          filename=text_type(self.tmp_path('test')))
        self.assertIn("NTAX=5 NCHAR=7", nex)
        # mrbayes should have datatype=restriction
        self.assertIn("DATATYPE=RESTRICTION", nex)

        # check charblock:
        self.assertRegexWorkaround(nex, r"charset I = 1\-1;")
        self.assertRegexWorkaround(nex, r"charset all = 2\-4;")
        self.assertRegexWorkaround(nex, r"charset ash = 5\-7;")

        # check data:
        self.assertRegexWorkaround(nex, r"German\s+1100100")
        self.assertRegexWorkaround(nex, r"English\s+1100XXX")
        self.assertRegexWorkaround(nex, r"Swedish\s+1010010")
        self.assertRegexWorkaround(nex, r"Icelandic\s+1001XXX")
        self.assertRegexWorkaround(nex, r"Norwegian\s+1001001")
Example #10
def make_nexus(filename, exclude_borrowings=True, concept_rank=226):

    wl = load_sinotibetan(tsv=True)

    if exclude_borrowings:
        ncog = max([int(wl[k, 'cogid']) for k in wl])+1
        for k in wl:
            if wl[k, 'borrowing'].strip():
                wl[k][wl.header['cogid']] = str(ncog)
                ncog += 1
            elif wl[k, 'cogid'] in ['0', '']:
                wl[k][wl.header['cogid']] = str(ncog)
                ncog += 1

    concepts_ = [
        k for k, v in stdb_concepts().items()
        if int(v['rank']) < concept_rank
    ]

    cogids = defaultdict(lambda : defaultdict(list))
    cogid2concept = {}
    wl.add_entries('paps', 'concept,cogid',
            lambda x, y: x[y[0]] + ':' + x[y[1]])
    uncertainties = defaultdict(list)
    for k, doculect, cogid, concept, borrowing in iter_rows(
            wl, 'doculect', 'paps', 'concept', 'borrowing'):
        if borrowing.strip() and not exclude_borrowings:
            uncertainties[cogid].append(doculect)
        cogids[concept][cogid].append(doculect)
        cogid2concept[cogid] = concept

    blocks = []
    concepts = sorted(cogids)
    characters = {}
    ccount = 1
    cstrings = []
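    # one charset per retained concept, spanning the columns of its cognate sets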
    for concept in [c for c in concepts if c in concepts_]:
        cstring = '_'.join([slug(c) for c in concept.split(' ')])
        blocks += [
                'charset '+cstring+' = '
                ]
        cstrings += [cstring]
        start = ccount
        for cogid in sorted(cogids[concept]):
            characters[cogid] = ccount
            ccount += 1
        blocks[-1] += '{0}-{1};'.format(start, ccount-1)
    matrix = []
    print(len(characters), ccount, len(cstrings))
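    # encode each taxon: 1 = cognate present, 0 = concept attested but cognate absent,
    # ? = concept not attested, 10 = uncertain state for possible borrowings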
    for taxon in wl.taxa:
        tcids_ = wl.get_list(doculect=taxon, entry='paps', flat=True)
        tcons_ = wl.get_list(doculect=taxon, entry='concept', flat=True)
        
        # transform data, only take things with the same concept, so we check
        # for each datapoint, whether we find it
        tcids, tcons = [], []
        for a, b in zip(tcids_, tcons_):
            if b in concepts:
                tcids += [a]
                tcons += [b]

        matrix += [[]]
        for cogid, idx in sorted(characters.items(), key=lambda x: x[1]):
            if cogid not in tcids:
                concept = cogid2concept[cogid]
                if concept in tcons:
                    matrix[-1] += ['0']
                else:
                    matrix[-1] += ['?']
            else:
                if taxon in uncertainties[cogid]:
                    matrix[-1] += ['10']
                else:
                    matrix[-1] += ['1']

    partition = 'partition favored = {0}: {1}'.format(
            len(blocks),
            ', '.join(cstrings)+';')
    commands = [
            'set autoclose=yes nowarn=yes;',
            'lset coding=noabsencesites rates=gamma;'
            ] + blocks+[partition] + [
                    'taxset fossils = Old_Chinese Old_Tibetan Old_Burmese;',
                    'constraint root = 1-.;',
                    #'prest clockratepr = normal(1E-5,1);',
                    'calibrate Old_Chinese = uniform(2200, 3000);',
                    'calibrate Old_Tibetan = fixed(1200);',
                    'calibrate Old_Burmese = fixed(800);'
                    ] +[
            #'taxset problematic = Naxi Pumi_Lanping Qiang_Mawo Xumi Lyuzu Bai_Jianchuan Tujia;',
            #'delete problematic;',
            'prset clockratepr=exponential(3e5);',
            'prset treeagepr=uniform(4000,20000);',
            'prset sampleprob=0.2 samplestrat=random speciationpr=exp(1);',
            'prset extinctionpr=beta(1,1) nodeagepr=calibrated;',
            'prset brlenspr=clock:fossilization clockvarpr=igr;',
            'mcmcp ngen=10000000 printfreq=10000 samplefreq=2500 nruns=2 ' 
            'nchains=4 savebrlens=yes filename={0};'.format(filename)]
    print(filename)
    write_nexus(taxa=wl.taxa, matrix=matrix, commands=commands,
            filename=filename+'.nex')
Example #11
            ('segments', 'tokens'),
            ('cogid_cognateset_id', 'cogid'),
            ))

taxa = [t for t in wl_.cols if not t.startswith('Proto')] + ['ProtoNahua']
D = {0: wl_.columns}
for idx in wl_:
    if wl_[idx, 'doculect'] in taxa:
        D[idx] = wl_[idx]
wl = Wordlist(D)
table = []
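# count the cognate sets shared by every pair of languages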
for t1, t2 in combinations(taxa, r=2):
    cog1 = wl.get_list(col=t1, entry='cogid', flat=True)
    cog2 = wl.get_list(col=t2, entry='cogid', flat=True)
    table += [[t1, t2, len([c for c in cog1 if c in cog2])]]
print(tabulate(table, tablefmt='pipe'))

wl.calculate('distances', ref='cogid')
wl.output('dst', filename='distances')
wl.calculate('tre', tree_calc='neighbor')
wl.output('tre', filename='tree')
wl.output('tsv', filename='wordlist', ignore='all', prettify=False)

write_nexus(wl, mode='splitstree', filename='coracholaztecan.nex')


alms = Alignments(wl_, transcription='form', ref='cogid')
alms.align()

alms.output('tsv', filename='wordlist', ignore='all', prettify=False)
Example #12
from lingpy.convert.strings import write_nexus
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree

# Load the necessary data
part = Partial.from_cldf('cldf/cldf-metadata.json')

# Compute cognate sets according to SCA and calculate the distance matrix
part.partial_cluster(method='sca',
                     threshold=0.45,
                     ref="cogids",
                     cluster_method="upgma")
part.add_cognate_ids('cogids', 'cogid', idtype='strict')
part.calculate('tree', ref='cogid', tree_calc='upgma')
out = write_nexus(part, mode='splitstree', filename='distance_matrix.nex')
part.output('dst', filename='distance_matrix')
plot_tree(str(part.tree))
print(part.tree.asciiArt())

# Compute cognate sets according to LexStat and calculate the distance matrix
# part.get_partial_scorer(runs=1000)
# part.partial_cluster(method='lexstat', threshold=0.55, cluster_method='upgma', ref="lexstatids")
# part.add_cognate_ids('lexstatids', 'lexstatid', idtype='strict')
# part.calculate('tree', ref='lexstatid', tree_calc='upgma', force=True)
# part.output('dst', filename='distance_matrix')
# plot_tree(str(part.tree))
# print(part.tree.asciiArt())
Example #13
Spanish_1818	LSE_1818
Spanish_1845	LSE_1845
Spanish_1859	LSE_1859
Swedish_1866	STS_1866"""

converter = dict([line.split('\t') for line in data.split('\n')])

lex = LexiBase.from_dbase(
        'signalphabets', 
        dbase='signalphabets.sqlite3',
        # url='signalphabets.sqlite3'
        )

abbrs = csv2list('signlgs.txt', sep=", ")

count = 0

N = {0: lex.columns}

for idx, concept, tokens in lex.iter_rows('concept',
        'tokens'):
    tokens = basictypes.lists(tokens)
    N[idx] = lex[idx]
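    # rename doculects via the converter table and strip diacritics with unidecode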
    N[idx][lex.header['doculect']] = unidecode(converter.get(
            lex[idx, 'doculect'], 
            lex[idx, 'doculect']))

lex = Wordlist(N)

write_nexus(lex, mode='splitstree', ref='cogid', filename='signs-cogid.nex')
Example #14
    def f(**kw):
        # Use missing="X" parameter to avoid \? in the assertRegex calls below
        return write_nexus(wordlist, missing="X", filename=str(tmppath / 'test'), **kw)
Example #15
def test_error_on_unknown_mode(wordlist):
    with pytest.raises(ValueError):
        write_nexus(wordlist, mode='xx')
Example #16
        method = 'upgma'
        threshold = 0.6
    table = []
    for f in tqdm.tqdm(sorted(glob('data/test2/*.csv'))):
        lex = LexStat(f)
        lex.get_scorer(runs=10000)
        lex.cluster(method='lexstat',
                    cluster_method=method,
                    ref='autocog',
                    threshold=threshold)
        lex.add_entries('cogidn', 'cogid,concept',
                        lambda x, y: str(x[y[0]]) + '-' + x[y[1]])
        lex.renumber('cogidn')
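        # renumber() adds a numeric 'cogidnid' column, compared against 'autocog' below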
        if 'nex' in argv:
            write_nexus(lex,
                        ref='autocog',
                        filename='nexus/' + method + f.split('/')[-1][4:-4] +
                        '.nex')

        p, r, fs = bcubes(lex, 'cogidnid', 'autocog', pprint=False)
        table += [[f[16:-4], round(p, 2), round(r, 2), round(fs, 4)]]
    table += [[
        'total',
        round(sum([line[1] for line in table]) / 5, 4),
        round(sum([line[2] for line in table]) / 5, 4),
        round(sum([line[3] for line in table]) / 5, 4)
    ]]

    print(tabulate(table, tablefmt='latex', headers=['data', 'p', 'r', 'fs']))

if 'sca2' in argv:
    if 'infomap' in argv:
Example #17
    def test_error_on_unknown_mode(self):
        with self.assertRaises(ValueError):
            write_nexus(self.wordlist, mode='xx')
Example #18
    def test_error_on_unknown_ref(self):
        with self.assertRaises(KeyError):
            write_nexus(self.wordlist, mode='mrbayes', ref='magic')