def calculate_scores(*languages):
    """Compute and store pairwise similarity scores for the given languages.

    Every ``Form`` found for each language is collected into a wordlist
    dictionary (row 0 is the header), which is handed to ``Partial``.  After
    ``get_scorer`` has run, one ``Scorer`` row and one ``Similarity`` row per
    pair of forms in each matrix are added to the session, which is then
    committed.

    Only ``languages[0]`` and ``languages[1]`` are written to the ``Scorer``
    row, so the function expects to be called with two languages.
    """
    header = ["backreference", "doculect", "concept", "ipa", "tokens"]
    wordlist = {0: header}

    for language in languages:
        # New row keys continue after the rows collected so far.
        offset = len(wordlist)
        rows = {}
        query = session.query(Form).filter_by(language=language)
        for position, form in enumerate(query):
            rows[position + offset] = [
                form.id,
                form.language_id,
                form.concept_id,
                form.transcription,
                form.soundclasses.split(),
            ]
        wordlist.update(rows)

    sound_class_model = lingpy.data.model.Model("asjp")
    lex = Partial(
        wordlist,
        model=sound_class_model,
        check=True,
        apply_checks=True,
    )
    lex.get_scorer(runs=10000, ratio=(3, 1), threshold=0.7)

    # This does not generalize to non-two languages yet
    scorer_row = Scorer(
        language1=languages[0],
        language2=languages[1],
        scorer=scorer2str(lex.bscorer),
    )
    session.add(scorer_row)

    matrices = lex._get_matrices(
        method='lexstat',
        scale=0.5,
        factor=0.3,
        restricted_chars="_T",
        mode="overlap",
        gop=-2,
        restriction="",
    )
    for _concept, forms, matrix in matrices:
        indexed_forms = enumerate(forms)
        for (row, key1), (col, key2) in itertools.combinations(indexed_forms, 2):
            # Column 0 of a wordlist row is the back-reference, i.e. our
            # own form ID.
            first_id = lex[key1][0]
            second_id = lex[key2][0]
            session.add(
                Similarity(
                    form1_id=first_id,
                    form2_id=second_id,
                    score=matrix[row][col],
                )
            )

    session.commit()
from lingpy import *
from lingpy.compare.partial import Partial

# Load the wordlist.  'hm-111-17.bin.tsv' is tried first (presumably the file
# written by the ``output`` call below on an earlier run -- TODO confirm); if
# that fails, the plain wordlist is loaded, the scorer is computed and the
# result is written out.
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except Exception:
    # Deliberately broad so that any ordinary failure to load the first file
    # still falls back to recomputation, but -- unlike a bare ``except:`` --
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data

# Partial cognate clustering; the resulting IDs go to the 'cogids' column.
part.partial_cluster(
    method='lexstat',
    cluster_method='infomap',
    threshold=0.6,
    ref='cogids',
)

# Add 'note' and 'morphemes' entries whose value is the empty string for
# every row (the lambda ignores its input).
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')

part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

# Try to take the scorer from 'hm-jerry-scored.bin.tsv' and attach it to the
# freshly loaded wordlist; if anything in that path fails, compute the scorer
# from scratch and write it out.
try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except Exception:
    # Deliberately broad so that any ordinary failure still triggers the
    # recomputation, but -- unlike a bare ``except:`` -- KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# we test several thresholds
for i in range(2, 8):
    # Column that receives the clustering for this threshold: 't2' ... 't7'.
    ref = 't' + str(i)
    lex.partial_cluster(
        method='lexstat',
        cluster_method='infomap',
        threshold=i * 0.1,
        ref=ref,
    )
    # Compare the new clustering against the 'cogids' column; the three
    # returned values are presumably precision, recall and F-score --
    # TODO confirm against partial_bcubes.
    a, b, c = partial_bcubes(lex, 'cogids', ref, pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, a, b, c))
def pprint_result(f, mode, t, p, r, fs): print('{0:15} {1:30} {2} {3:.2f} {4:.2f} {5:.2f}'.format( f, mode, ts, p, r, fs)) methods = ['sca', 'lexstat'] cluster_methods = ['infomap', 'mcl', 'upgma'] measures = ['partial', 'strict', 'loose'] for f in infiles: try: lex = Partial(pcd_path('data', 'BIN_'+f+'.tsv')) except IOError: lex = Partial(pcd_path('data', f+'.tsv')) lex.get_scorer( preprocessing=False, runs=10000, ) lex.output('tsv', filename=pcd_path('data', 'BIN_'+f[2:])) # create new reference ids for cogantes from partial cognates if not 'strict_cogid' in lex.header: lex.add_cognate_ids('partialids', 'strict_cogid', 'strict') if not 'loose_cogid' in lex.header: lex.add_cognate_ids('partialids', 'loose_cogid', 'loose') for i in range(1,20): print("Analyzing {0} with t={1}...".format(f, i)) t = 0.05 * i ts = '{0:.2f}'.format(t).replace('0.','') for m in methods:
def pprint_result(f, mode, t, p, r, fs): print('{0:15} {1:30} {2} {3:.2f} {4:.2f} {5:.2f}'.format( f, mode, ts, p, r, fs)) methods = ['sca', 'lexstat'] cluster_methods = ['infomap', 'mcl', 'upgma'] measures = ['partial', 'strict', 'loose'] for f in infiles: try: lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv')) except IOError: lex = Partial(pcd_path('data', f + '.tsv')) lex.get_scorer( preprocessing=False, runs=10000, ) lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:])) # create new reference ids for cogantes from partial cognates if not 'strict_cogid' in lex.header: lex.add_cognate_ids('partialids', 'strict_cogid', 'strict') if not 'loose_cogid' in lex.header: lex.add_cognate_ids('partialids', 'loose_cogid', 'loose') for i in range(1, 20): print("Analyzing {0} with t={1}...".format(f, i)) t = 0.05 * i ts = '{0:.2f}'.format(t).replace('0.', '') for m in methods: