Code example #1
File: autocode_big.py Project: Anaphory/autocode_big
import itertools

import lingpy
from lingpy.compare.partial import Partial

# `session`, the ORM models (`Form`, `Scorer`, `Similarity`), and
# `scorer2str` are assumed to come from the surrounding project's imports.


def calculate_scores(*languages):
    pair_of_languages = {
        0: ["backreference", "doculect", "concept", "ipa", "tokens"]
    }
    for l in languages:
        pair_of_languages.update({
            i + len(pair_of_languages): [
                form.id, form.language_id, form.concept_id, form.transcription,
                form.soundclasses.split()
            ]
            for i, form in enumerate(
                session.query(Form).filter_by(language=l))
        })

    # Build a Partial (LexStat) wordlist from the in-memory dict and
    # compute language-specific sound-correspondence scores.
    lex = Partial(pair_of_languages,
                  model=lingpy.data.model.Model("asjp"),
                  check=True,
                  apply_checks=True)
    lex.get_scorer(runs=10000, ratio=(3, 1), threshold=0.7)

    # This does not yet generalize beyond exactly two languages
    session.add(
        Scorer(language1=languages[0],
               language2=languages[1],
               scorer=scorer2str(lex.bscorer)))

    # Iterate over the per-concept distance matrices (via lingpy's private
    # _get_matrices) and store every pairwise score in the database.
    for concept, forms, matrix in lex._get_matrices(method='lexstat',
                                                    scale=0.5,
                                                    factor=0.3,
                                                    restricted_chars="_T",
                                                    mode="overlap",
                                                    gop=-2,
                                                    restriction=""):
        for (i1, f1), (i2, f2) in itertools.combinations(enumerate(forms), 2):
            f1 = lex[f1][0]  # index 0 holds the 'backref', i.e. our ID
            f2 = lex[f2][0]  # index 0 holds the 'backref', i.e. our ID
            session.add(
                Similarity(form1_id=f1, form2_id=f2, score=matrix[i1][i2]))

    session.commit()
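For reference, here is a self-contained toy sketch of the input layout built above (the data is invented, not from the project): key 0 holds the column header, and every further integer key is one word form.

import lingpy
from lingpy.compare.partial import Partial

toy = {
    0: ["backreference", "doculect", "concept", "ipa", "tokens"],
    1: [1, "German", "hand", "hant", "h a n t".split()],
    2: [2, "English", "hand", "hænd", "h æ n d".split()],
}
toy_lex = Partial(toy, model=lingpy.data.model.Model("asjp"), check=True)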
Code example #2
from lingpy import *
from lingpy.compare.partial import Partial

# Load the cached wordlist (with precomputed scorer) if it exists;
# otherwise compute the scorer from scratch and cache it.
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except IOError:
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# Partial cognate detection: LexStat scores, Infomap clustering, t=0.6.
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

# Add empty columns in which errors in the data can be manually corrected.
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
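A minimal follow-up sketch (assuming the file written above exists): read the clustered wordlist back in and check its dimensions.

from lingpy import Wordlist

wl = Wordlist('hm-111-17-t06.tsv')
print('{0} concepts x {1} doculects'.format(wl.height, wl.width))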
Code example #3
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

# Reuse a cached scorer if present; otherwise compute one and cache it.
try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except IOError:
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# Test several thresholds (0.2-0.7) and report B-cubed precision,
# recall, and F-score against the gold 'cogids' annotation.
for i in range(2, 8):
    lex.partial_cluster(method='lexstat',
                        cluster_method='infomap',
                        threshold=i * 0.1,
                        ref='t' + str(i))

    p, r, f = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, p, r, f))
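A small follow-up sketch, reusing lex and the t2-t7 columns computed above, that picks the threshold with the best B-cubed F-score:

scores = {}
for i in range(2, 8):
    p, r, f = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    scores[i * 0.1] = f
best = max(scores, key=scores.get)
print('best threshold: {0:.1f} (F = {1:.2f})'.format(best, scores[best]))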
Code example #4
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

# `infiles` (the datasets to analyze) and `pcd_path` (a path helper) are
# assumed to be defined elsewhere in the project.


def pprint_result(f, mode, t, p, r, fs):
    print('{0:15}   {1:30}   {2:.2f}   {3:.2f}   {4:.2f}   {5:.2f}'.format(
        f, mode, t, p, r, fs))

methods = ['sca', 'lexstat']
cluster_methods = ['infomap', 'mcl', 'upgma']
measures = ['partial', 'strict', 'loose']

for f in infiles:
    # Reuse a cached scorer if present; otherwise compute one and cache it.
    try:
        lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv'))
    except IOError:
        lex = Partial(pcd_path('data', f + '.tsv'))
        lex.get_scorer(
            preprocessing=False,
            runs=10000,
        )
        lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:]))

    # Create new reference IDs for cognates from the partial cognates.
    if 'strict_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'strict_cogid', 'strict')
    if 'loose_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    for i in range(1, 20):
        t = 0.05 * i
        print("Analyzing {0} with t={1:.2f}...".format(f, t))
        ts = '{0:.2f}'.format(t).replace('0.', '')

        for m in methods:
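            # The source is truncated here: the body of this loop is missing.
            # A hedged sketch of what one iteration plausibly does, modeled
            # on code example #3 (the actual body is not in the source, and
            # only the 'partial' measure is shown):
            for cm in cluster_methods:
                ref = m + '_' + cm + '_' + ts
                lex.partial_cluster(method=m, cluster_method=cm,
                                    threshold=t, ref=ref)
                p, r, fs = partial_bcubes(lex, 'partialids', ref,
                                          pprint=False)
                pprint_result(f, m + '/' + cm, t, p, r, fs)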