def writeToFile():
    """Build a LexStat-formatted wordlist file from the IELex data, run
    LexStat cognate clustering on it, and print B-cubed evaluation scores.

    NOTE(review): despite its name, this function also runs the full
    LexStat analysis, not only the file write — confirm before renaming.
    """
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"
    print("LOAD WORDLIST")
    # pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    # Concept ids held out for validation; removed before clustering.
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}
    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    # Write a tab-separated wordlist in the column layout LexStat expects.
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids, cognate_classes):
            f.write(str(gi) + "\t" + w + "\t" + l + "\t" + cog + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")
    # FIX: `diff` was imported here but never used; import only `bcubes`.
    from lingpy.evaluate.acd import bcubes
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
def writeToFile():
    """Export the IELex data as a LexStat wordlist, cluster it with
    LexStat, and print B-cubed scores for the predicted cognate classes."""
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"
    print("LOAD WORDLIST")
    # pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    # Held-out concept ids, excluded before clustering.
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}
    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    # Dump the data in the tab-separated layout LexStat expects.
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as handle:
        handle.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        rows = zip(languages, words, global_ids, cognate_classes)
        for lang, word, gid, cog in rows:
            handle.write("\t".join([str(gid), word, lang, cog]) + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")
    from lingpy.evaluate.acd import bcubes, diff
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
def test_bcubes(self):
    """B-cubed comparison of a cognate column with itself is perfect (1, 1, 1)."""
    from lingpy.evaluate.acd import bcubes
    res = bcubes(self.lex, test='cogid', pprint=False)
    # FIX: assertAlmostEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertAlmostEqual.
    self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
    # Also exercise the pretty-printed, per-concept code path.
    res = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
def test_bcubes(self):
    """Self-comparison of 'cogid' must yield perfect B-cubed scores."""
    from lingpy.evaluate.acd import bcubes
    res = bcubes(self.lex, test='cogid', pprint=False)
    # FIX: use assertAlmostEqual — assertAlmostEquals is a deprecated alias
    # that Python 3.12 removed.
    self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
    # Exercise the verbose per-concept branch as well.
    res = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
def main():
    """Print B-cubed (precision, recall, F) tuples for each dataset and
    iteration of the "updating" experiment files."""
    # FIX: the original initialised `results = []` once at the top, but that
    # value was always discarded by the re-initialisation inside the loop.
    for name in [
            "ROM", "BAI", "GER", "JAP", "OUG", "PIE", "SLV", "IEL", "KSL",
            "PAN"
    ]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # Scoring/clustering is assumed to be precomputed in the file.
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                results.append((
                    i,
                    bcubes(
                        lex,
                        "cogid",
                        "newcogid",
                        pprint=False,
                        # cognate ids may be negative; compare absolute values
                        modify_ref=lambda x: abs(int(x)),
                    ),
                ))
            for r in results:
                print(r)
            print()
def evaluate():
    """Print B-cubed precision/recall/F-score for each detection method
    stored in the Micronesian LexStat wordlist."""
    wordlist = Wordlist('mikronesian-lexstat.tsv')
    methods = ['turchinid', 'scaid', 'lexstatid', 'infomap']
    for method in methods:
        scores = bcubes(wordlist, 'cogid', method, pprint=False)
        print('{0:10}\t{1[0]:.2f}\t{1[1]:.2f}\t{1[2]:.2f}'.format(
            method, scores))
def main():
    """Evaluate B-cubed scores for the ARM_GRE "updating" experiment files.

    FIX: a dead block evaluating "../PIE_scored_{i}_og.csv" files was kept as
    a malformed multi-line string literal (it even contained a stray `:`);
    it was removed, along with a dead top-level `results = []`.
    """
    for name in ["ARM_GRE"]:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_{}_{}.csv".format(name, i, n))
                # Scoring/clustering is assumed precomputed in the input file.
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                results.append(
                    (
                        i,
                        bcubes(
                            lex,
                            "cogid",
                            "newcogid",
                            pprint=False,
                            # cognate ids may be negative; compare magnitudes
                            modify_ref=lambda x: abs(int(x)),
                        ),
                    )
                )
            for r in results:
                print(r)
            print()
########### # Data input ############ # manually annotate the partial cognates WLmp = Wordlist('hm-111-17_16feb.tsv') # auto detect the partial cognates WLap = Wordlist('HM-March4-partial-final.tsv') ############## # Compare the cognates : bcubes ############## WLmp_bcube = bcubes(WLmp, 'cogid', 'cogids') WLap_bcube = bcubes(WLap, 'cogid', 'cogids') ################### # Not sure if this make sence, but just trying out at this moment # get edit distance for every concept ################### Lan = WLmp.language lanD = dict() for i in Lan: temp = WLmp.get_list(language=i, entry='SEGMENTS_IS') lanD[i] = temp lanpair = list(permutations(Lan, 2))
# NOTE(review): fragment — `m`, `cluster_methods`, `lex`, `f`, `t`, `ts`
# and `ccubes` are defined before this excerpt; the indentation below is
# reconstructed from the loop semantics.
msf = 'f_'+m
for cm in cluster_methods:
    # Reference-column names: 'p' = partial, 'f' = full clustering.
    ms = '{0}_{1}_{2}'.format('p', m, cm)
    msf = '{0}_{1}_{2}'.format('f', m, cm)
    msp = ms +'_'+ts
    lex.partial_cluster(method=m, cluster_method=cm, threshold=t, ref=msp)
    # get loose and strict cognate ids for this method
    lex.add_cognate_ids(msp, ms+'_strict'+'_'+ts, 'strict')
    lex.add_cognate_ids(msp, ms+'_loose'+'_'+ts, 'loose')
    # get the bcubes for both conversion modes
    for mode in ['strict', 'loose']:
        msm = ms+'_'+mode+'_'+ts
        p, r, fs = bcubes(lex, mode+'_cogid', msm, pprint=False)
        pprint_result(f, msm, ts, p, r, fs)
        ccubes += [[msm, f, t, ts, p, r, fs]]
    # Evaluate the partial cognate sets directly as well.
    p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
    pprint_result(f, msp, ts, p, r, fs)
    ccubes += [[msp, f, t, ts, p, r, fs]]
    # Full (non-partial) clustering with the same parameters.
    lex.cluster(method=m, cluster_method=cm, threshold=t, ref=msf+'_'+ts)
    for mode in ['strict', 'loose']:
        p, r, fs = bcubes(lex, mode+'_cogid', msf+'_'+ts, pprint=False)
        pprint_result(f,msf+'_'+mode+'_'+ts, ts, p, r, fs)
        ccubes += [[msf+'_'+mode+'_'+ts, f, t, ts, p, r, fs]]
# NOTE(review): truncated — the body of this `with` lies past the excerpt.
with open('results.tsv', 'w') as f:
# NOTE(review): fragment — the keyword arguments below belong to a
# clustering call whose opening (and the definitions of `lex`, `taxa`,
# `inferredData`, `dataset`) precede this excerpt.
              method='lexstat', threshold=0.57,
              external_function=lambda x, y: infomap_clustering(y, x, revert=True),
              ref="lexstat_infomap")
# Build a (cluster-id, word-id) table, one row per word, across all taxa.
partition = vstack([
    array([
        concatenate(lex.get_dict(col=l, entry=entry).values())
        for entry in ['lexstat_infomap', 'index']
    ]).T for l in taxa
])
partition = pd.DataFrame(partition, columns=['lpCC', 'id'])
# Attach the inferred cluster column to the main data frame and persist it.
inferredData = pd.merge(inferredData, partition)
inferredData.to_csv('../results/' + dataset + '.clustered.tsv',
                    sep='\t', encoding='utf-8', index=False)
clustered = lp.Wordlist('../results/' + dataset + '.clustered.tsv')
# Append LexStat and SVM B-cubed scores (rounded to 3 digits) to f-scores.csv.
with open('../results/f-scores.csv', 'a') as f:
    lsFscores = bcubes(clustered, gold='COGID', test='lpcc')
    f.write(dataset + ',LexStat,' +
            ','.join(map(str, around(array(lsFscores), 3))) + '\n')
    svmFscores = bcubes(clustered, gold='COGID', test='svmcc')
    f.write(dataset + ',SVM,' +
            ','.join(map(str, around(array(svmFscores), 3))) + '\n')
from lingpy import *
from lingpy.evaluate.acd import bcubes

# Score the original cognate judgements ('ORIG') against the final gold
# judgements ('COGNATES') with the B-cubed measures and print the result.
lexstat_wordlist = LexStat('../lingpy/output/Lingpy_vs_Final_cognates.tsv')
scores = bcubes(lexstat_wordlist, gold='COGNATES', test='ORIG')
print(scores)
def test_bcubes(self):
    """Self-comparison of 'cogid' must yield perfect B-cubed scores."""
    scores = bcubes(self.lex, test='cogid', pprint=False)
    self.assertAlmostEqual(scores, (1.0, 1.0, 1.0))
    # Also run the pretty-printed, per-concept code path for coverage.
    _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
# NOTE(review): fragment — `part` is loaded before this excerpt.
part.renumber('cog')
method = input('method: ')
# type 'cogid' or 'cog' for method to see a tree based on Deepadung et al.'s
# cognate judgements
if method == 'lexstatcogids':
    # partial cognates via LexStat (scorer computed first)
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', ref="lexstatcogids", threshold=0.55)
elif method == 'lexstatcogid':
    # full cognates via LexStat, evaluated against the gold 'cogid'
    part.get_scorer(runs=10000)
    part.cluster(method='lexstat', ref="lexstatcogid", threshold=0.55)
    bcubes(part, 'cogid', 'lexstatcogid')
elif method == 'scacogids':
    part.partial_cluster(method='sca', threshold=0.45, ref='scacogids')
elif method == 'scacogid':
    part.cluster(method='sca', ref="scacogid", threshold=0.45)
    bcubes(part, 'cogid', 'scacogid')
# Build and display a tree from the chosen reference column.
part.calculate('tree', ref= method)
print(method)
print(part.tree.asciiArt())
import csv
# Distance matrix and taxa, written out past this excerpt (see `filename`).
x = part.distances
taxa = part.taxa
filename = '../output/distmat_'+ method+'.csv'
def score(self):
    """Print B-cubed scores of CUSTOMID against the gold COGNACY column."""
    result = bcubes(self.lex, "COGNACY", "CUSTOMID")
    print(result)
# NOTE(review): fragment — the lines below are the tail of cleanASJP(word),
# whose `def` precedes this excerpt.
    word = re.sub(r"(.)(.)(.)\$", r"\2", word)
    word = re.sub(r"\$", "", word)
    word = re.sub(r"\s+", "", word)
    return word.replace('~', '')

# Load the IELex data with ASJP transcriptions; drop rows without a
# transcription, loanwords, and rows without a cognate class.
d1 = pd.read_csv('IELex+ASJP.csv', sep='\t')
d1 = d1[~d1.ASJP_transcription.isnull()]
d1 = d1[d1.loan == 0]
d1 = d1[d1['class'].notnull()]
# Normalise the transcriptions and the language names.
d1['word'] = [cleanASJP(x) for x in d1.ASJP_transcription.values]
d1['language'] = [x.replace('-', '_') for x in d1.ASJP_language.values]
# Merge with the Albano-Romance cognate-class data and evaluate the
# 'cc' column against the gold 'class' column.
d2 = pd.read_csv('../albanoRomanceCC.csv', index_col=0)
d = pd.merge(d1, d2)[['concept', 'language', 'word', 'class', 'cc']]
d.to_csv('mergedData.tsv', sep='\t')
wl = lp.Wordlist('mergedData.tsv')
bcubes(wl, gold="class", test="cc")
# Recorded output of the run above:
# *************************
# * B-Cubed-Scores *
# * --------------------- *
# * Precision: 0.9934 *
# * Recall: 0.6194 *
# * F-Scores: 0.7630 *
# *************************'
# Interactive checkpoint before loading (press Enter to continue).
input('all fine')

# CLDF columns to load, and their mapping onto LingPy's internal names.
columns=('concept_name', 'language_id',
         'value', 'form', 'segments', 'language_glottocode',
         'cogid_cognateset_id'
         )
namespace=(('concept_name', 'concept'),
           ('language_id', 'doculect'),
           ('segments', 'tokens'),
           ('language_glottocode', 'glottolog'),
           ('concept_concepticon_id', 'concepticon'),
           ('language_latitude', 'latitude'),
           ('language_longitude', 'longitude'),
           ('cognacy', 'cognacy'),
           ('cogid_cognateset_id', 'cog'))

part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns, namespace=namespace)
input('loaded data')

# Cluster with SCA, then with LexStat (scorer computed first).
part.cluster(method='sca', ref="scacogid", threshold=0.45)
part.get_scorer(runs=100)
part.cluster(method='lexstat', ref="lexstatcogid", threshold=0.55)

# B-cubed evaluation of both clusterings against the gold 'cogid'.
from lingpy.evaluate.acd import bcubes
bcubes(part, "cogid", "scacogid")
bcubes(part, "cogid", "lexstatcogid")

# Align the partial cognate sets and write the result out.
alms = Alignments(part, ref='cogids')
alms.align()
alms.output('tsv', filename='deepadung-wordlist-sca', ignore='all',
            prettify=False)
# NOTE(review): fragment — `m`, `cm`, `lex`, `f`, `t`, `ts`, `ms` and
# `ccubes` come from an enclosing loop preceding this excerpt; the final
# line is truncated. Indentation is reconstructed.
msf = '{0}_{1}_{2}'.format('f', m, cm)
msp = ms + '_' + ts
lex.partial_cluster(method=m, cluster_method=cm, threshold=t, ref=msp)
# get loose and strict cognate ids for this method
lex.add_cognate_ids(msp, ms + '_strict' + '_' + ts, 'strict')
lex.add_cognate_ids(msp, ms + '_loose' + '_' + ts, 'loose')
# get the bcubes for both conversion modes
for mode in ['strict', 'loose']:
    msm = ms + '_' + mode + '_' + ts
    p, r, fs = bcubes(lex, mode + '_cogid', msm, pprint=False)
    pprint_result(f, msm, ts, p, r, fs)
    ccubes += [[msm, f, t, ts, p, r, fs]]
# Evaluate the partial cognate sets directly as well.
p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
pprint_result(f, msp, ts, p, r, fs)
ccubes += [[msp, f, t, ts, p, r, fs]]
# Full (non-partial) clustering with the same parameters.
lex.cluster(method=m, cluster_method=cm, threshold=t, ref=msf + '_' + ts)
for mode in ['strict', 'loose']:
    p, r, fs = bcubes(lex, mode + '_cogid', msf + '_' + ts, pprint=False)
def test_bcubes(self):
    """Comparing 'cogid' with itself gives perfect B-cubed scores."""
    res = bcubes(self.lex, test='cogid', pprint=False)
    # FIX: assertAlmostEquals is a deprecated alias, removed in Python 3.12;
    # use assertAlmostEqual.
    self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
    # Exercise the pretty-printed, per-concept branch as well.
    _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)
def main():
    """Evaluate B-cubed scores across datasets and iterations, then plot a
    precision/recall curve and an F-score-per-iteration line plot.

    FIX: removed a dead block kept as a malformed string literal, a dead
    top-level `results = []`, an inner `results` list that was never
    appended to (its print loop could never output anything), and a second,
    byte-identical construction of the same DataFrame.
    """
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL", "BAI", "JAP", "PIE", "IEL", "PAN"
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # Scoring/clustering is assumed precomputed in the file.
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    # cognate ids may be negative; compare magnitudes
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])
            print()
    f, axes = plt.subplots(1, 2, figsize=(20, 8))
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    # Left panel: precision vs. recall per partition.
    sns.lineplot(x="Recall", y="Precision", hue="Partition", data=df,
                 marker="o", ax=axes[0])
    # Label the first and last iterations on the precision/recall curve.
    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))
    # Right panel: F-score over iterations per partition.
    sns.lineplot(x="Iteration", y="F-score", hue="Partition", data=df,
                 ax=axes[1])
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")
    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
# Load the CLDF dataset into a LingPy wordlist, renaming CLDF columns to
# the names LingPy expects internally.
wl = Wordlist.from_cldf(
    ds.dir.joinpath('cldf', 'cldf-metadata.json'),
    columns=[
        'language_id', 'concept_name', 'value', 'form', 'segments',
        'cogid_cognateset_id'
    ],
    namespace=dict([['language_id', 'doculect'],
                    ['concept_name', 'concept'], ['value', 'value'],
                    ['form', 'form'], ['segments', 'tokens'],
                    ['cogid_cognateset_id', 'cog']]))
wl.renumber('cog')

lex = LexStat(wl)
lex.get_scorer(runs=10000)

# Sweep clustering thresholds 0.05 .. 0.95 and report B-cubed scores for
# SCA- and LexStat-based clustering at every threshold.
for step in range(1, 20):
    threshold = step * 0.05
    label = '{0}'.format(int(threshold * 100 + 0.5))
    lex.cluster(method='sca', threshold=threshold, ref='sca_' + label,
                restricted_chars='')
    lex.cluster(method='lexstat', threshold=threshold, ref='ls_' + label,
                restricted_chars='')
    p1, r1, f1 = bcubes(lex, 'cogid', 'sca_' + label, pprint=False)
    p2, r2, f2 = bcubes(lex, 'cogid', 'ls_' + label, pprint=False)
    row = [threshold, p1, r1, f1, p2, r2, f2]
    print('\t'.join('{0:.2f}'.format(x) for x in row))
# NOTE(review): fragment — `i`, `headA`, `header`, `data` and `matrix`
# come from an enclosing loop that precedes this excerpt; indentation is
# reconstructed.
for j, headB in enumerate(header[1:]):
    if i < j:
        # Build a small wordlist pairing the two annotation columns.
        idx = 1
        D = {0: ['doculect', 'concept', 'ipa', headA, headB]}
        for line in data[1:]:
            row = dict(zip(header, line))
            vow1 = row[headA].split(' / ')[0]
            vow2 = row[headB].split(' / ')[0]
            # skip uncertain ('?') annotations
            if not '?' in (vow1, vow2):
                D[idx] = ['doculect', 'concept', line[0], vow1, vow2]
                idx += 1
        wl = Wordlist(D)
        wl.renumber(headA)
        wl.renumber(headB)
        print(wl.height, wl.width, headA, headB)
        print(bcubes(wl, headA+'id', headB+'id', pprint=False,
                     per_concept=True))
        # Distance = 1 - B-cubed F-score between the two annotators.
        matrix[i][j] = 1 - bcubes(wl, headA+'id', headB+'id', pprint=False,
                                  per_concept=True)[-1]
        matrix[j][i] = matrix[i][j]
        print('{0:20}'.format(headA), '\t', '{0:20}'.format(headB), '\t',
              '{0:.4f}'.format(matrix[i][j]))
# Write the distance matrix in PHYLIP format and print a UPGMA tree.
text = ' 8\n'
for i, line in enumerate(matrix):
    text += '{0:10}'.format(header[1:][i])+' '+' '.join(
        ['{0:.2f}'.format(x) for x in line])+'\n'
print(Tree(upgma(matrix, header[1:])).asciiArt())
with open('matrix.dst', 'w') as f:
    f.write(text)
def test_bcubes(lex):
    """A cognate column compared with itself yields perfect B-cubed scores."""
    scores = bcubes(lex, test='cogid', pprint=False)
    assert scores == pytest.approx((1.0, 1.0, 1.0))
    # Exercise the pretty-printed, per-concept branch as well.
    _ = bcubes(lex, 'cogid', 'cogid', pprint=True, per_concept=True)
# NOTE(review): fragment — this opens with the closing paren of a
# `columns=(...)` tuple whose start precedes this excerpt.
           )
namespace=(('concept_name', 'concept'),
           ('language_id', 'doculect'),
           ('segments', 'tokens'),
           ('language_glottocode', 'glottolog'),
           ('concept_concepticon_id', 'concepticon'),
           ('language_latitude', 'latitude'),
           ('language_longitude', 'longitude'),
           ('cognacy', 'cognacy'),
           ('cogid_cognateset_id', 'cog'))
part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns, namespace=namespace)
input('loaded data')
part.renumber('cog')  # 8
from lingpy.evaluate.acd import bcubes  # 10
# Detect partial cognates with SCA.
part.partial_cluster(method='sca', threshold=0.45, ref='scaids')  # 12
# Strict conversion to full cognate ids, evaluated against gold 'cogid'.
part.add_cognate_ids('scaids', 'scaid', idtype='strict')  # 13
bcubes(part, 'cogid', 'scaid')  # 15
# Loose conversion, evaluated the same way.
part.add_cognate_ids('scaids', 'scalooseid', idtype='loose')  # 16
bcubes(part, 'cogid', 'scalooseid')  # 17
# Align the cognate sets and write the result out.
alms = Alignments(part, ref='cogid')
alms.align()
alms.output('tsv', filename='../output/deepadung-wordlist-new', ignore='all',
            prettify=False)
def test_bcubes(self):
    """Self-comparison must give perfect (1.0, 1.0, 1.0) B-cubed scores."""
    from lingpy.evaluate.acd import bcubes
    res = bcubes(self.lex, test='cogid')
    # FIX: assertAlmostEquals is deprecated (removed in Python 3.12);
    # the canonical name is assertAlmostEqual.
    self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
# NOTE(review): fragment — the first lines are the tail of a helper that
# parses a distance-matrix file (its `def` precedes this excerpt), and the
# final plot_heatmap call is truncated. Indentation is reconstructed.
        for j in range(1, len(taxa)+1):
            matrix[i][j-1] = float(data[i][j])
    return taxa, matrix

# Analyse subsets of 100/200/300 concepts.
for i in [100, 200, 300]:
    # Reuse a previously saved wordlist with precomputed scorer if present;
    # otherwise compute the scorer and save it.
    # NOTE(review): bare `except:` also swallows KeyboardInterrupt — worth
    # narrowing to the actual load error.
    try:
        lex = LexStat('D_subset-{0}-22.tsv.bin.tsv'.format(i))
    except:
        lex = LexStat('D_subset-{0}-22.tsv'.format(i))
        lex.get_scorer(runs=10000, restricted_chars='_')
        lex.output('tsv', filename=lex.filename+'.bin', ignore='')
    lex.cluster(method='sca', threshold=0.45, ref='scaid',
                restricted_chars='_')
    lex.cluster(method='lexstat', threshold=0.55, restricted_chars='_',
                ref='lexstatid', cluster_method='infomap')
    # Agreement between the LexStat and SCA clusterings (B-cubed).
    p, r, f = bcubes(lex, 'lexstatid', 'scaid', pprint=False)
    print('SuSe{0} {1:.2f} {2:.2f} {3:.2f}'.format(i, p, r, f))
    lex.output('tsv', filename=lex.filename+'-cognates', ignore='all')
    lex.calculate('tree', ref='lexstatid')
    tm, tree_taxa = nwk2tree_matrix(lex.tree)
    matrix1 = make_matrix('lexstatid', lex, lex.tree, tree_taxa)
    matrix2 = make_matrix('scaid', lex, lex.tree, tree_taxa)
    plot_heatmap(lex, ref='lexstatid', filename='O_lexstat_{0}'.format(i),
                 vmax=1, tree=lex.tree, colorbar_label='lexical cognates',
                 normalized='swadesh',
                 )
    # NOTE(review): truncated — this call is cut off mid-argument-list.
    plot_heatmap(lex, ref='scaid', filename='O_sca_{0}'.format(i), vmax=1,
from lingpy.compare.partial import *
from lingpy.evaluate.acd import bcubes
from lexibank_gaotb import Dataset

# Load the CLDF dataset as a Partial wordlist, mapping the CLDF column
# names onto the names LingPy expects.
part = Partial.from_cldf(
    Dataset().cldf_dir.joinpath('cldf-metadata.json'),
    columns=[
        'concept_id', 'concept_name', 'language_id', 'language_name',
        'value', 'form', 'segments', 'cogid_cognateset_id'
    ],
    namespace=(('concept_name', 'concept'), ('language_id', 'doculect'),
               ('cogid_cognateset_id', 'cog'), ('segments', 'tokens')))
part.renumber('cog')

# Detect partial cognates with the SCA method (UPGMA clustering).
part.partial_cluster(method='sca', ref='cogids', threshold=0.45,
                     cluster_method='upgma')

# Convert the partial sets to full cognate ids — strict and loose — and
# print B-cubed scores against the gold 'cogid' for each.
part.add_cognate_ids("cogids", "autocogid", idtype="strict")
bcubes(part, "cogid", "autocogid")
part.add_cognate_ids('cogids', 'looseid', idtype='loose')
bcubes(part, "cogid", "looseid")
part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'), columns=columns, namespace=namespace) #25 input('loaded data') part.renumber('cog') #26 from lingpy.evaluate.acd import bcubes #10 for i in range(20): #27 t = 0.05 * i ts = 't_' + str(i) part.partial_cluster(method='sca', threshold=t, ref=ts) part.add_cognate_ids(ts, ts + 'id', idtype='strict') p, r, f = bcubes(part, 'cogid', ts + 'id', pprint=False) print('{0:.2f} {1:.4} {2:.4f} {3:.2f}'.format(t, p, r, f)) for i in range(20): #30 t = 0.05 * i ts = 't_' + str(i) part.partial_cluster(method='sca', threshold=t, ref=ts) part.add_cognate_ids(ts, ts + 'id', idtype='loose') p, r, f = bcubes(part, 'cogid', ts + 'id', pprint=False) print('{0:.2f} {1:.4} {2:.4f} {3:.2f}'.format(t, p, r, f)) alms = Alignments(part, ref='cogids') alms.align() alms.output('tsv', filename='../output/deepadung-wordlist-new2', ignore='all',
from lingpy import *
from glob import glob
from collections import defaultdict
from lingpy.evaluate.acd import bcubes

files = sorted(glob('data/data-*.tsv'))

# For each data file: cluster by normalized edit distance, export the
# inferred classes (tsv + nexus), and print B-cubed scores vs. gold 'cogid'.
for f in files:
    print('[i] Analyzing...', f)
    wl = LexStat(f)
    wl.cluster(method='edit-dist', threshold=0.75, ref='editid')
    # concept-qualified label, e.g. "hand:3"
    wl.add_entries('inferred_class', 'concept,editid',
                   lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
    wlx = Wordlist(f)
    cols = wlx.columns + ['inferred_class', 'editid']
    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'edit-')[:-4],
              ignore='all', prettify=False, subset=True, cols=cols)
    wl.output('paps.nex', ref='editid', missing='?',
              filename='nexus/' +
              f.split('/')[1].replace('data', 'edit')[:-4])
    # FIX: the original unpacked the F-score into `f`, clobbering the
    # current filename variable; use a distinct name for the score.
    p, r, fs = bcubes(wl, 'cogid', 'editid', pprint=False)
    print('... {0:.2f} {1:.2f} {2:.2f}'.format(p, r, fs))
from lingpy import *
from glob import glob
from collections import defaultdict
from lingpy.evaluate.acd import bcubes

files = sorted(glob('data/data-*.tsv'))

# For each data file: cluster with SCA, export the inferred classes
# (tsv + nexus), and print B-cubed scores against the gold 'cogid'.
for f in files:
    print('[i] Analyzing...', f)
    wl = LexStat(f)
    wl.cluster(method='sca', threshold=0.45, ref='scaid')
    # concept-qualified label, e.g. "hand:3"
    wl.add_entries('inferred_class', 'concept,scaid',
                   lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
    wlx = Wordlist(f)
    cols = wlx.columns + ['inferred_class', 'scaid']
    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'sca-')[:-4],
              ignore='all', prettify=False, subset=True, cols=cols)
    wl.output('paps.nex', ref='scaid', missing='?',
              filename='nexus/' +
              f.split('/')[1].replace('data', 'sca')[:-4])
    # FIX: the original unpacked the F-score into `f`, clobbering the
    # current filename variable; use a distinct name for the score.
    p, r, fs = bcubes(wl, 'cogid', 'scaid', pprint=False)
    print('... {0:.2f} {1:.2f} {2:.2f}'.format(p, r, fs))
# NOTE(review): fragment — the first line closes a clustering call
# (presumably wl.cluster with infomap) that begins before this excerpt;
# `f` is the current data file from an enclosing loop.
           ref='infomapid')
wl.cluster(method='lexstat', cluster_method='upgma', threshold=0.6,
           ref='lexstatid')
# concept-qualified label, e.g. "hand:3"
wl.add_entries('inferred_class', 'concept,infomapid',
               lambda x, y: x[y[0]] + ':' + str(x[y[1]]))
wlx = Wordlist(f)
cols = wlx.columns + ['inferred_class', 'infomapid']
wl.output('tsv',
          filename='computed/' +
          f.split('/')[1].replace('data-', 'infomap-')[:-4],
          ignore='all', prettify=False, subset=True, cols=cols)
wl.output('paps.nex', ref='infomapid', missing='?',
          filename='nexus/' +
          f.split('/')[1].replace('data', 'infomap')[:-4])
# B-cubed scores of each clustering against the gold 'cogid';
# the '*' marks the lexstat (upgma) row.
p, r, fc = bcubes(wl, 'cogid', 'lexstatid', pprint=False)
print('*', f.split('/')[1].replace('data-', ''),
      '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))
p, r, fc = bcubes(wl, 'cogid', 'infomapid', pprint=False)
print(
    f.split('/')[1].replace('data-', ''),
    '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))
from lingpy import *
from lingpy.evaluate.acd import bcubes
from sys import argv

# NOTE(review): this only prints the usage string and then continues
# regardless — confirm whether an exit was intended here.
if len(argv) < 2:
    print('usage python cognates.py')

# Prefer the precomputed binary wordlist; fall back to the short one.
# FIX: a bare `except:` also swallows KeyboardInterrupt/SystemExit —
# catch Exception instead so the fallback stays a deliberate best-effort.
try:
    lex = LexStat('wordlist.bin.tsv', segments='segments')
except Exception:
    lex = LexStat('wordlist-short.tsv', segments='segments')

lex.get_scorer(runs=10000)
lex.output('tsv', filename='wordlist.bin')
lex.cluster(method='lexstat', cluster_method='infomap', threshold=0.55)

# B-cubed evaluation of the LexStat clustering against the gold 'cogid'.
p, r, f = bcubes(lex, 'cogid', 'lexstatid', pprint=True)
print('{0:.2f}\t{1:.2f}\t{2:.2f}'.format(p, r, f))

# Align the clusters with the LexStat scorer and write the result out.
alm = Alignments(lex, ref='lexstatid')
alm.align(scoredict=lex.cscorer)
alm.output('tsv', filename='wordlist-aligned', ignore='all', prettify=False)
# NOTE(review): fragment — the `elif`/`else` below close an `if` chain
# inside the turchin() function, whose `def` starts before this excerpt;
# the indentation/nesting is reconstructed and should be verified.
    elif len([c for c in dolgo if c != 'V']) == 1:
        # single consonant class: pad with a dummy 'H'
        cls = dolgo[0] + 'H'
    else:
        # first two consonant classes identify the word
        cls = ''.join([c for c in dolgo if c != 'V'][:2])
    cogs[cls] += [w]
    # Assign every word its concept-qualified sound-class label.
    for i, (cog, words) in enumerate(cogs.items()):
        for word in words:
            CC[word] = concept + ':' + cog
    wordlist.add_entries('inferred_class', CC, lambda x: x)
    wordlist.renumber('inferred_class', 'turchinid')

# Run the Turchin detector over every data file, export the results
# (tsv + nexus), and print B-cubed scores against the gold 'cogid'.
files = sorted(glob('data/data-*.tsv'))
for f in files:
    wl = Wordlist(f)
    turchin(wl)
    wl.output('tsv',
              filename='computed/' +
              f.split('/')[1].replace('data-', 'turchin-')[:-4],
              ignore='all', prettify=False)
    wl.output('paps.nex', ref='turchinid', missing='?',
              filename='nexus/' +
              f.split('/')[1].replace('data', 'turchin')[:-4])
    p, r, fc = bcubes(wl, 'cogid', 'turchinid', pprint=False)
    print(
        f.split('/')[1][:-4].replace('data-', ''),
        '| {0:.2f} | {1:.2f} | {2:.2f}'.format(p, r, fc))