Example #1
import codecs

from lingpy import LexStat
from lingpy.basic.wordlist import get_wordlist
from lingpy.evaluate.acd import bcubes

def writeToFile():
    # loadAnnotatedWordList and getRidOfValidationSet are project-specific
    # helpers defined elsewhere in this repository.
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"

    print("LOAD WORDLIST")
    # pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}

    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids,
                                 cognate_classes):
            f.write(str(gi) + "\t" + w + "\t" + l + "\t" + cog + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")

    # 'cogid' is the gold column written to the file above; 'cognate_class_pred'
    # holds the LexStat predictions.
    print(bcubes(lex, "cogid", "cognate_class_pred"))
Example #3
    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes

        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        res = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
Example #5
from lingpy import LexStat
from lingpy.evaluate.acd import bcubes

def main():
    for name in [
            "ROM", "BAI", "GER", "JAP", "OUG", "PIE", "SLV", "IEL", "KSL",
            "PAN"
    ]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                results.append((
                    i,
                    bcubes(
                        lex,
                        "cogid",
                        "newcogid",
                        pprint=False,
                        modify_ref=lambda x: abs(int(x)),
                    ),
                ))
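                # modify_ref transforms every cognate id before scoring;
                # abs(int(x)) folds negative ids onto their positive values.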

            for r in results:
                print(r)
        print()
Example #6
from lingpy import Wordlist
from lingpy.evaluate.acd import bcubes

def evaluate():
    wl = Wordlist('mikronesian-lexstat.tsv')

    for res in ['turchinid', 'scaid', 'lexstatid', 'infomap']:
        print('{0:10}\t{1[0]:.2f}\t{1[1]:.2f}\t{1[2]:.2f}'.format(
            res, bcubes(wl, 'cogid', res, pprint=False)))
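    # The {1[0]}, {1[1]}, {1[2]} format fields index into the
    # (precision, recall, F-score) tuple returned by bcubes.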
Example #7
from lingpy import LexStat
from lingpy.evaluate.acd import bcubes

def main():
    results = []
    """for i in range(20):
        lex = LexStat("../PIE_scored_{}_og.csv".format(i))
        # lex.get_scorer()
        # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
        print(".", end="", flush=True)
        results.append( (
                i,
                bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                ),
            )
        )
:
    print()
    print("OG")
    for r in results:
        print(r)
"""
    for name in ["ARM_GRE"]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                results.append(
                    (
                        i,
                        bcubes(
                            lex,
                            "cogid",
                            "newcogid",
                            pprint=False,
                            modify_ref=lambda x: abs(int(x)),
                        ),
                    )
                )

            for r in results:
                print(r)
        print()
Example #8
from itertools import permutations

from lingpy import Wordlist
from lingpy.evaluate.acd import bcubes

###########
# Data input
###########

# manually annotate the partial cognates
WLmp = Wordlist('hm-111-17_16feb.tsv')

# auto detect the partial cognates
WLap = Wordlist('HM-March4-partial-final.tsv')

##############
# Compare the cognates : bcubes
##############

WLmp_bcube = bcubes(WLmp, 'cogid', 'cogids')
WLap_bcube = bcubes(WLap, 'cogid', 'cogids')

###################
# Not sure if this makes sense; just trying it out for now:
# get the edit distance for every concept.
###################
Lan = WLmp.language

lanD = dict()
for i in Lan:
    temp = WLmp.get_list(language=i, entry='SEGMENTS_IS')
    lanD[i] = temp

lanpair = list(permutations(Lan, 2))
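# A minimal sketch of the step the comments above announce (an assumption:
# get_list returns one entry per concept, in the same order, for every
# language), scoring one language pair with lingpy's normalized edit distance.
from lingpy import edit_dist

a, b = lanpair[0]
dists = [edit_dist(x, y, normalized=True)
         for x, y in zip(lanD[a], lanD[b])]
print(a, b, sum(dists) / len(dists))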
Example #9
            # Excerpt: the enclosing loops over methods (m), thresholds (t, ts)
            # and the open results file (f) are truncated above.
            msf = 'f_' + m
            for cm in cluster_methods:
                ms = '{0}_{1}_{2}'.format('p', m, cm)
                msf = '{0}_{1}_{2}'.format('f', m, cm)
                msp = ms + '_' + ts

                lex.partial_cluster(method=m, cluster_method=cm, threshold=t,
                                    ref=msp)

                # get loose and strict cognate ids for this method
                lex.add_cognate_ids(msp, ms + '_strict' + '_' + ts, 'strict')
                lex.add_cognate_ids(msp, ms + '_loose' + '_' + ts, 'loose')

                # get the bcubes
                for mode in ['strict', 'loose']:
                    msm = ms + '_' + mode + '_' + ts
                    p, r, fs = bcubes(lex, mode + '_cogid', msm, pprint=False)
                    pprint_result(f, msm, ts, p, r, fs)
                    ccubes += [[msm, f, t, ts, p, r, fs]]
                p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
                pprint_result(f, msp, ts, p, r, fs)
                ccubes += [[msp, f, t, ts, p, r, fs]]

                lex.cluster(method=m, cluster_method=cm, threshold=t,
                            ref=msf + '_' + ts)
                for mode in ['strict', 'loose']:
                    p, r, fs = bcubes(lex, mode + '_cogid', msf + '_' + ts,
                                      pprint=False)
                    pprint_result(f, msf + '_' + mode + '_' + ts, ts, p, r, fs)
                    ccubes += [[msf + '_' + mode + '_' + ts, f, t, ts,
                                p, r, fs]]

with open('results.tsv', 'w') as f:
    ...  # the writing of 'ccubes' to results.tsv is truncated in the excerpt
Example #10
import pandas as pd
from numpy import around, array, concatenate, vstack

import lingpy as lp
from lingpy.algorithm.extra import infomap_clustering
from lingpy.evaluate.acd import bcubes

# Excerpt: 'lex', 'taxa', 'inferredData' and 'dataset' are defined earlier in
# the original script; the opening of this cluster call is truncated.
lex.cluster(
    method='lexstat',
    threshold=0.57,
    external_function=lambda x, y: infomap_clustering(y, x, revert=True),
    ref="lexstat_infomap")

partition = vstack([
    array([
        concatenate(lex.get_dict(col=l, entry=entry).values())
        for entry in ['lexstat_infomap', 'index']
    ]).T for l in taxa
])
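# 'partition' pairs each word's inferred cluster label with its wordlist row
# id: one (lpCC, id) row per word across all doculects.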

partition = pd.DataFrame(partition, columns=['lpCC', 'id'])

inferredData = pd.merge(inferredData, partition)

inferredData.to_csv('../results/' + dataset + '.clustered.tsv',
                    sep='\t',
                    encoding='utf-8',
                    index=False)

clustered = lp.Wordlist('../results/' + dataset + '.clustered.tsv')

with open('../results/f-scores.csv', 'a') as f:
    lsFscores = bcubes(clustered, gold='COGID', test='lpcc')
    f.write(dataset + ',LexStat,' +
            ','.join(map(str, around(array(lsFscores), 3))) + '\n')
    svmFscores = bcubes(clustered, gold='COGID', test='svmcc')
    f.write(dataset + ',SVM,' +
            ','.join(map(str, around(array(svmFscores), 3))) + '\n')
Example #11
from lingpy import *
from lingpy.evaluate.acd import bcubes
lex = LexStat('../lingpy/output/Lingpy_vs_Final_cognates.tsv')
b = bcubes(lex, gold='COGNATES', test='ORIG')
print(b)
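# b is the (precision, recall, F-score) tuple returned by bcubes; with the
# default pprint=True, bcubes also prints its score table.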
Example #12
    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
Example #13
# Excerpt: 'part' (a lingpy Partial wordlist) is created earlier in the
# original script.
from lingpy.evaluate.acd import bcubes

part.renumber('cog')


method = input('method: ')

# type 'cogid' or 'cog' for method to see a tree based on Deepadung et al.'s
# cognate judgements

if method == 'lexstatcogids':
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', ref="lexstatcogids", threshold=0.55)
elif method == 'lexstatcogid':
    part.get_scorer(runs=10000)
    part.cluster(method='lexstat', ref="lexstatcogid", threshold=0.55)
    bcubes(part, 'cogid', 'lexstatcogid')
elif method == 'scacogids':
    part.partial_cluster(method='sca', threshold=0.45, ref='scacogids')
elif method == 'scacogid':
    part.cluster(method='sca', ref="scacogid", threshold=0.45)
    bcubes(part, 'cogid', 'scacogid')

part.calculate('tree', ref=method)
print(method)
print(part.tree.asciiArt())

import csv

x = part.distances
taxa = part.taxa
filename = '../output/distmat_' + method + '.csv'
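# A minimal sketch of the presumably following step (the excerpt ends here):
# write the distance matrix to the CSV path built above.
with open(filename, 'w', newline='') as fh:
    writer = csv.writer(fh)
    writer.writerow([''] + list(taxa))
    for taxon, row in zip(taxa, x):
        writer.writerow([taxon] + list(row))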
Example #14
    def score(self):
        print(bcubes(self.lex, "COGNACY", "CUSTOMID"))
Example #15
import re

import pandas as pd
import lingpy as lp
from lingpy.evaluate.acd import bcubes

def cleanASJP(word):
    # Earlier substitution steps of this helper are truncated in the excerpt.
    word = re.sub(r"(.)(.)(.)\$", r"\2", word)
    word = re.sub(r"\$", "", word)
    word = re.sub(r"\s+", "", word)
    return word.replace('~', '')


d1 = pd.read_csv('IELex+ASJP.csv', sep='\t')
d1 = d1[~d1.ASJP_transcription.isnull()]
d1 = d1[d1.loan == 0]
d1 = d1[d1['class'].notnull()]
d1['word'] = [cleanASJP(x) for x in d1.ASJP_transcription.values]
d1['language'] = [x.replace('-', '_') for x in d1.ASJP_language.values]

d2 = pd.read_csv('../albanoRomanceCC.csv', index_col=0)

d = pd.merge(d1, d2)[['concept', 'language', 'word', 'class', 'cc']]

d.to_csv('mergedData.tsv', sep='\t')

wl = lp.Wordlist('mergedData.tsv')

bcubes(wl, gold="class", test="cc")

# *************************
# * B-Cubed-Scores        *
# * --------------------- *
# * Precision:     0.9934 *
# * Recall:        0.6194 *
# * F-Scores:      0.7630 *
# *************************
input('all fine')

Example #16
# Excerpt: 'Partial', 'Alignments' (lingpy) and the lexibank 'Dataset' class
# are imported earlier in the original script.
columns = ('concept_name', 'language_id', 'value', 'form', 'segments',
           'language_glottocode', 'cogid_cognateset_id')
namespace = (('concept_name', 'concept'), ('language_id', 'doculect'),
             ('segments', 'tokens'), ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'),
             ('language_latitude', 'latitude'),
             ('language_longitude', 'longitude'), ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cog'))

part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns, namespace=namespace)

input('loaded data')

part.cluster(method='sca', ref="scacogid", threshold=0.45)
part.get_scorer(runs=100)
part.cluster(method='lexstat', ref="lexstatcogid", threshold=0.55)

from lingpy.evaluate.acd import bcubes

bcubes(part, "cogid", "scacogid")
bcubes(part, "cogid", "lexstatcogid")

alms = Alignments(part, ref='cogids')
alms.align()
alms.output('tsv', filename='deepadung-wordlist-sca', ignore='all', prettify=False)
Example #19
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from lingpy import LexStat
from lingpy.evaluate.acd import bcubes

def main():
    results = []
    """for i in range(20):
        lex = LexStat("../PIE_scored_{}_og.csv".format(i))
        # lex.get_scorer()
        # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
        print(".", end="", flush=True)
        results.append( (
                i,
                bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                ),
            )
        )
:
    print()
    print("OG")
    for r in results:
        print(r)
"""
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL", "BAI", "JAP", "PIE", "IEL", "PAN"
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")

                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])

        print()

    f, axes = plt.subplots(1, 2, figsize=(20, 8))

    # markers = {i: "${}$".format(i) for i in range(1, 7)}
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Recall",
                 y="Precision",
                 hue="Partition",
                 data=df,
                 marker="o",
                 ax=axes[0])
    # plt.subplots_adjust(right=0.7)
    # plt.legend(bbox_to_anchor=(1.02, 1.02), loc="upper left")

    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))

    # markers = {i: "${}$".format(i) for i in range(1, 7)}
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Iteration",
                 y="F-score",
                 hue="Partition",
                 data=df,
                 ax=axes[1])
    # plt.subplots_adjust(right=0.7)
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")

    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
Example #20
from lingpy import LexStat, Wordlist
from lingpy.evaluate.acd import bcubes

# Excerpt: 'ds' is the lexibank dataset object, created earlier in the
# original script.
wl = Wordlist.from_cldf(ds.dir.joinpath('cldf', 'cldf-metadata.json'),
                        columns=[
                            'language_id', 'concept_name', 'value', 'form',
                            'segments', 'cogid_cognateset_id'
                        ],
                        namespace=dict([['language_id', 'doculect'],
                                        ['concept_name', 'concept'],
                                        ['value', 'value'], ['form', 'form'],
                                        ['segments', 'tokens'],
                                        ['cogid_cognateset_id', 'cog']]))
wl.renumber('cog')

lex = LexStat(wl)
lex.get_scorer(runs=10000)
for i in range(1, 20):
    t = i * 0.05
    ts = '{0}'.format(int(t * 100 + 0.5))
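    # 'ts' is an integer percent label for the threshold, e.g. 0.55 -> '55'.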
    lex.cluster(method='sca',
                threshold=t,
                ref='sca_' + ts,
                restricted_chars='')
    lex.cluster(method='lexstat',
                threshold=t,
                ref='ls_' + ts,
                restricted_chars='')
    p1, r1, f1 = bcubes(lex, 'cogid', 'sca_' + ts, pprint=False)
    p2, r2, f2 = bcubes(lex, 'cogid', 'ls_' + ts, pprint=False)
    print('\t'.join(['{0:.2f}'.format(x)
                     for x in [t, p1, r1, f1, p2, r2, f2]]))
Example #21
    # Excerpt: the enclosing loop "for i, headA in enumerate(header[1:]):" and
    # the definitions of 'header', 'data' and 'matrix' are truncated above.
    for j, headB in enumerate(header[1:]):
        if i < j:
            idx = 1
            D = {0: ['doculect', 'concept', 'ipa', headA, headB]}
            for line in data[1:]:
                row = dict(zip(header, line))
                vow1 = row[headA].split(' / ')[0]
                vow2 = row[headB].split(' / ')[0]
                if '?' not in (vow1, vow2):
                    D[idx] = ['doculect', 'concept', line[0], vow1, vow2]
                idx += 1
            wl = Wordlist(D)
            wl.renumber(headA)
            wl.renumber(headB)
            print(wl.height, wl.width, headA, headB)
            print(bcubes(wl, headA + 'id', headB + 'id', pprint=False,
                         per_concept=True))
            matrix[i][j] = 1 - bcubes(wl, headA + 'id', headB + 'id',
                                      pprint=False, per_concept=True)[-1]
            matrix[j][i] = matrix[i][j]

            print('{0:20}'.format(headA), '\t', '{0:20}'.format(headB), '\t',
                  '{0:.4f}'.format(matrix[i][j]))

text = ' 8\n'  # PHYLIP-style header: number of taxa in the distance matrix
for i, line in enumerate(matrix):
    text += '{0:10}'.format(header[1:][i]) + '  ' + ' '.join(
        ['{0:.2f}'.format(x) for x in line]) + '\n'

print(Tree(upgma(matrix, header[1:])).asciiArt())
with open('matrix.dst', 'w') as f:
    f.write(text)
Example #22
def test_bcubes(lex):
    res = bcubes(lex, test='cogid', pprint=False)
    assert res == pytest.approx((1.0, 1.0, 1.0))
    _ = bcubes(lex, 'cogid', 'cogid', pprint=True, per_concept=True)
Example #23
# Excerpt: 'Partial', 'Alignments' and the lexibank 'Dataset' class are
# imported earlier; the opening of the 'columns' tuple is truncated here and
# matches the one in Example #16.
columns = ('concept_name', 'language_id', 'value', 'form', 'segments',
           'language_glottocode', 'cogid_cognateset_id')
namespace = (('concept_name', 'concept'), ('language_id', 'doculect'),
             ('segments', 'tokens'), ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'),
             ('language_latitude', 'latitude'),
             ('language_longitude', 'longitude'), ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cog'))

part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns, namespace=namespace)

input('loaded data')

part.renumber('cog')  #8

from lingpy.evaluate.acd import bcubes #10

part.partial_cluster(method='sca', threshold=0.45, ref='scaids') #12

part.add_cognate_ids('scaids', 'scaid', idtype='strict') #13

bcubes(part, 'cogid', 'scaid') #15

part.add_cognate_ids('scaids', 'scalooseid', idtype='loose') #16

bcubes(part, 'cogid', 'scalooseid') #17

alms = Alignments(part, ref='cogid')
alms.align()
alms.output('tsv', filename='../output/deepadung-wordlist-new', ignore='all', prettify=False)
Example #24
    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes

        res = bcubes(self.lex, test='cogid')
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
Example #25
# Excerpt: the opening of this helper (which reads 'taxa' and 'data' from a
# distance file) is truncated above; make_matrix, nwk2tree_matrix and
# plot_heatmap are defined or imported earlier in the original script.
        for j in range(1, len(taxa)+1):
            matrix[i][j-1] = float(data[i][j])
    return taxa, matrix


for i in [100, 200, 300]:
    try:
        lex = LexStat('D_subset-{0}-22.tsv.bin.tsv'.format(i))
    except:
        lex = LexStat('D_subset-{0}-22.tsv'.format(i))
        lex.get_scorer(runs=10000, restricted_chars='_')
        lex.output('tsv', filename=lex.filename+'.bin', ignore='')
    lex.cluster(method='sca', threshold=0.45, ref='scaid', restricted_chars='_')
    lex.cluster(method='lexstat', threshold=0.55, restricted_chars='_',
            ref='lexstatid', cluster_method='infomap')
    p, r, f = bcubes(lex, 'lexstatid', 'scaid', pprint=False)
    print('SuSe{0} {1:.2f} {2:.2f} {3:.2f}'.format(i, p, r, f))

    lex.output('tsv', filename=lex.filename+'-cognates', ignore='all')
    lex.calculate('tree', ref='lexstatid')
    
    tm, tree_taxa = nwk2tree_matrix(lex.tree)
    matrix1 = make_matrix('lexstatid', lex, lex.tree, tree_taxa)
    matrix2 = make_matrix('scaid', lex, lex.tree, tree_taxa)


    plot_heatmap(lex, ref='lexstatid', filename='O_lexstat_{0}'.format(i), vmax=1,
            tree=lex.tree, colorbar_label='lexical cognates',
            normalized='swadesh',
            )
    plot_heatmap(lex, ref='scaid', filename='O_sca_{0}'.format(i), vmax=1,
                 # (call truncated in the excerpt; presumably it mirrors the
                 # plot_heatmap call above)
                 )
Example #26
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import bcubes
from lexibank_gaotb import Dataset

part = Partial.from_cldf(
    Dataset().cldf_dir.joinpath('cldf-metadata.json'),
    columns=[
        'concept_id', 'concept_name', 'language_id', 'language_name', 'value',
        'form', 'segments', 'cogid_cognateset_id'
    ],
    namespace=(('concept_name', 'concept'), ('language_id', 'doculect'),
               ('cogid_cognateset_id', 'cog'), ('segments', 'tokens')))
part.renumber('cog')
part.partial_cluster(method='sca',
                     ref='cogids',
                     threshold=0.45,
                     cluster_method='upgma')
part.add_cognate_ids("cogids", "autocogid", idtype="strict")

bcubes(part, "cogid", "autocogid")

part.add_cognate_ids('cogids', 'looseid', idtype='loose')
bcubes(part, "cogid", "looseid")
Example #27
# Excerpt: 'columns', 'namespace' and 'Alignments' are presumably defined or
# imported earlier in the original script (cf. Example #16).
part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns,
                         namespace=namespace)  #25

input('loaded data')

part.renumber('cog')  #26

from lingpy.evaluate.acd import bcubes  #10

for i in range(20):  #27
    t = 0.05 * i
    ts = 't_' + str(i)
    part.partial_cluster(method='sca', threshold=t, ref=ts)
    part.add_cognate_ids(ts, ts + 'id', idtype='strict')
    p, r, f = bcubes(part, 'cogid', ts + 'id', pprint=False)
    print('{0:.2f}   {1:.4}   {2:.4f}   {3:.2f}'.format(t, p, r, f))

for i in range(20):  #30
    t = 0.05 * i
    ts = 't_' + str(i)
    part.partial_cluster(method='sca', threshold=t, ref=ts)
    part.add_cognate_ids(ts, ts + 'id', idtype='loose')
    p, r, f = bcubes(part, 'cogid', ts + 'id', pprint=False)
    print('{0:.2f}   {1:.4}   {2:.4f}   {3:.2f}'.format(t, p, r, f))

alms = Alignments(part, ref='cogids')
alms.align()
alms.output('tsv',
            filename='../output/deepadung-wordlist-new2',
            ignore='all',
            prettify=False)  # (assumed closing argument; the excerpt is cut
                             # off here, cf. the same call in Example #16)
Example #28
from lingpy import *
from glob import glob
from collections import defaultdict
from lingpy.evaluate.acd import bcubes

files = sorted(glob('data/data-*.tsv'))
for f in files:
    print('[i] Analyzing...', f)
    wl = LexStat(f)
    wl.cluster(method='edit-dist', threshold=0.75, ref='editid')
    wl.add_entries('inferred_class', 'concept,editid',
                   lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
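    # add_entries passes each row (x) plus the indices of the two source
    # columns (y), building 'concept:editid' strings as inferred class labels.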
    wlx = Wordlist(f)
    cols = wlx.columns + ['inferred_class', 'editid']

    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'edit-')[:-4],
              ignore='all',
              prettify=False,
              subset=True,
              cols=cols)
    wl.output('paps.nex',
              ref='editid',
              missing='?',
              filename='nexus/' + f.split('/')[1].replace('data', 'edit')[:-4])
    p, r, fs = bcubes(wl, 'cogid', 'editid', pprint=False)
    print('... {0:.2f} {1:.2f} {2:.2f}'.format(p, r, fs))
Example #29
from lingpy import *
from glob import glob
from collections import defaultdict
from lingpy.evaluate.acd import bcubes

files = sorted(glob('data/data-*.tsv'))
for f in files:
    print('[i] Analyzing...', f)
    wl = LexStat(f)
    wl.cluster(method='sca', threshold=0.45, ref='scaid')
    wl.add_entries('inferred_class', 'concept,scaid',
                   lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
    wlx = Wordlist(f)
    cols = wlx.columns + ['inferred_class', 'scaid']

    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'sca-')[:-4],
              ignore='all',
              prettify=False,
              subset=True,
              cols=cols)
    wl.output('paps.nex',
              ref='scaid',
              missing='?',
              filename='nexus/' + f.split('/')[1].replace('data', 'sca')[:-4])
    p, r, fs = bcubes(wl, 'cogid', 'scaid', pprint=False)
    print('... {0:.2f} {1:.2f} {2:.2f}'.format(p, r, fs))
Example #30
    # Excerpt: the opening of this wl.cluster(...) call, which produces the
    # infomap clusters, is truncated above.
               ref='infomapid')
    wl.cluster(method='lexstat',
               cluster_method='upgma',
               threshold=0.6,
               ref='lexstatid')
    wl.add_entries('inferred_class', 'concept,infomapid',
                   lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
    wlx = Wordlist(f)
    cols = wlx.columns + ['inferred_class', 'infomapid']

    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'infomap-')[:-4],
              ignore='all',
              prettify=False,
              subset=True,
              cols=cols)
    wl.output('paps.nex',
              ref='infomapid',
              missing='?',
              filename='nexus/' +
              f.split('/')[1].replace('data', 'infomap')[:-4])
    p, r, fc = bcubes(wl, 'cogid', 'lexstatid', pprint=False)
    print('*',
          f.split('/')[1].replace('data-', ''),
          '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))
    p, r, fc = bcubes(wl, 'cogid', 'infomapid', pprint=False)
    print(
        f.split('/')[1].replace('data-', ''),
        '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))
Example #31
from lingpy import *
from lingpy.evaluate.acd import bcubes
from sys import argv

if len(argv) < 2:
    print('usage: python cognates.py')

try:
    lex = LexStat('wordlist.bin.tsv', segments='segments')
except:
    lex = LexStat('wordlist-short.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='wordlist.bin')


lex.cluster(method='lexstat', cluster_method='infomap', threshold=0.55)

p, r, f = bcubes(lex, 'cogid', 'lexstatid', pprint=True)
print('{0:.2f}\t{1:.2f}\t{2:.2f}'.format(p, r, f))

alm = Alignments(lex, ref='lexstatid')
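# lex.cscorer holds the correspondence scorer computed by get_scorer(), so the
# alignments reuse the same segment scores as the LexStat clustering.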
alm.align(scoredict=lex.cscorer)
alm.output('tsv', filename='wordlist-aligned', ignore='all', prettify=False)
Example #32
# Excerpt: this fragment opens inside the turchin(wordlist) helper called
# below; the loop over concepts and the 'dolgo' sound-class strings are
# truncated above.
            elif len([c for c in dolgo if c != 'V']) == 1:
                cls = dolgo[0] + 'H'
            else:
                cls = ''.join([c for c in dolgo if c != 'V'][:2])
            cogs[cls] += [w]
        for i, (cog, words) in enumerate(cogs.items()):
            for word in words:
                CC[word] = concept + ':' + cog
    wordlist.add_entries('inferred_class', CC, lambda x: x)
    wordlist.renumber('inferred_class', 'turchinid')


files = sorted(glob('data/data-*.tsv'))
for f in files:
    wl = Wordlist(f)
    turchin(wl)
    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'turchin-')[:-4],
              ignore='all',
              prettify=False)
    wl.output('paps.nex',
              ref='turchinid',
              missing='?',
              filename='nexus/' +
              f.split('/')[1].replace('data', 'turchin')[:-4])
    p, r, fc = bcubes(wl, 'cogid', 'turchinid', pprint=False)
    print(
        f.split('/')[1][:-4].replace('data-', ''),
        '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))