Example #1
# Assumed imports for this snippet: Partial comes from lingpy; Dataset is
# the repository's cldfbench dataset class (its import is not shown here).
from lingpy.compare.partial import Partial


def run(args):

    ds = Dataset()

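    # Load the cached partial scorer if one exists; otherwise compute it
    # (runs=10000 is slow) and save it for later runs.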
    try:
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_partial.bin.tsv').as_posix())
    except Exception:  # no cached scorer yet
        part = Partial(ds.dir.joinpath('workflow',
                                       'D_Chen_subset.tsv').as_posix(),
                       segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output('tsv',
                    filename=ds.dir.joinpath('workflow',
                                             'D_Chen_partial.bin').as_posix(),
                    ignore=[],
                    prettify=False)
        args.log.info('[i] saved the scorer')
    finally:
        # cluster partial cognates with the scorer, whether it was just
        # loaded or freshly computed
        part.partial_cluster(method='lexstat',
                             threshold=0.55,
                             ref='cogids',
                             mode='global',
                             gop=-2,
                             cluster_method='infomap')

    part.output('tsv',
                filename=ds.dir.joinpath('workflow',
                                         'D_Chen_partial').as_posix(),
                prettify=False)
Example #2
# Assumed imports for this snippet: Dataset is the repository's cldfbench
# dataset class (its import is not shown here).
from lingpy import Wordlist, Alignments
from lingpy.compare.partial import Partial


def run(args):
    ds = Dataset(args)
    wl = Wordlist.from_cldf(
        str(ds.cldf_specs().dir.joinpath('cldf-metadata.json')))
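    # Build a wordlist dict: row 0 holds the header, and only rows with
    # non-empty token strings are kept.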
    D = {0: list(wl.columns)}
    for idx in wl:
        if wl[idx, 'tokens']:
            D[idx] = wl[idx]
    part = Partial(D, check=True)
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat',
                         threshold=0.45,
                         ref="cogids",
                         cluster_method='infomap')
    # fuzzy=True tells Alignments that 'cogids' holds several ids per word
    alms = Alignments(part, ref='cogids', fuzzy=True)
    alms.align()
    alms.output('tsv', filename="chin-aligned")
Example #3
from lingpy import *
from lingpy.compare.partial import Partial

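# Load a cached scorer if available; otherwise compute and cache it.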
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except Exception:  # no cached scorer yet
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

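# add empty NOTE and MORPHEMES columns for later manual annotation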
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
Example #4
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

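# Reuse the sound-correspondence scorer from a cached LexStat analysis if
# present; otherwise compute it once and save it.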
try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except Exception:  # no cached scorer yet
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# test several thresholds (0.2-0.7) and report B-cubed scores for each
for i in range(2, 8):
    lex.partial_cluster(method='lexstat',
                        cluster_method='infomap',
                        threshold=i * 0.1,
                        ref='t' + str(i))

    # B-cubed precision, recall, and F-score against the gold 'cogids'
    a, b, c = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, a, b, c))
Example #5
# Assumed test scaffolding: WithTempDir and test_data come from lingpy's
# test utilities; assert_raises from nose.tools; text_type from six.
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                             segments='segments')

    def test__get_slices(self):
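        # _get_slices splits a segmented word into morpheme slices; by
        # default a tone character closes a slice ('ba²' -> 0:3), while
        # split_on_tones=False keeps the whole string as one slice (0:6).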
        a = _get_slices(list('ba²te²'))
        b = _get_slices(list('ba²te²'), split_on_tones=False)
        assert a[0][1] == 3
        assert b[0][1] == 6

    def test_get_partial_matrices(self):

        for method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            matrix = list(
                self.part._get_partial_matrices(cluster_method=method,
                                                concept="bird"))[0]
            assert isinstance(matrix[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [x[0] for x in tracer]

    def test_partial_cluster(self):

        assert_raises(ValueError,
                      self.part.partial_cluster,
                      cluster_method='upgmu')
        cluster_method = ('infomap' if lingpy.algorithm.extra.igraph
                          else 'upgma')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method=cluster_method,
                                  ref='parts1')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='mcl',
                                  ref='parts2')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='upgma',
                                  ref='parts3')

        self.part2.partial_cluster(method='lexstat',
                                   threshold=0.6,
                                   cluster_method='single',
                                   post_processing=True,
                                   imap_mode=False,
                                   ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(method='sca',
                                  threshold=0.9,
                                  cluster_method='single',
                                  post_processing=True,
                                  imap_mode=False,
                                  ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='upgma',
                                  ref='parts3')
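        # 'strict' derives one full cognate id per identical tuple of
        # partial ids; 'loose' merges words that share any partial id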
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        assert_raises(ValueError,
                      self.part.add_cognate_ids,
                      'parts3',
                      'cogs1',
                      idtype='dummy')
Example #6
class Tests(WithTempDir):

    def setUp(self):
        WithTempDir.setUp(self)
        self.part = Partial(test_data('partial_cognates.tsv'), segments='segments')
        self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                             segments='segments')

    def test_get_partial_matrices(self):

        for method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            matrix = list(
                self.part._get_partial_matrices(cluster_method=method,
                                                concept="bird"))[0]
            assert isinstance(matrix[0][0], (float, int))
        
        if lingpy.algorithm.extra.igraph:
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [x[0] for x in tracer]


    def test_partial_cluster(self):
        
        assert_raises(ValueError, self.part.partial_cluster,
                      cluster_method='upgmu')
        cluster_method = ('infomap' if lingpy.algorithm.extra.igraph
                          else 'upgma')
        self.part.partial_cluster(
                method='sca', threshold=0.45,
                cluster_method=cluster_method, ref='parts1')
        self.part.partial_cluster(
                method='sca', threshold=0.45, cluster_method='mcl',
                ref='parts2')
        self.part.partial_cluster(
                method='sca', threshold=0.45, cluster_method='upgma',
                ref='parts3')

        self.part2.partial_cluster(
                method='lexstat', threshold=0.6, cluster_method='single',
                post_processing=True, imap_mode=False, ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(
                method='sca', threshold=0.9, cluster_method='single',
                post_processing=True, imap_mode=False, ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(
                method='sca', threshold=0.45, cluster_method='upgma',
                ref='parts3')
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        assert_raises(ValueError, self.part.add_cognate_ids,
                      'parts3', 'cogs1', idtype='dummy')
Example #7
    if 'loose_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

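    # Sweep thresholds 0.05-0.95 for every combination of alignment method
    # and cluster method, scoring each run against the 'partialids'
    # annotation (and the strict/loose cognate ids derived from it).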
    for i in range(1, 20):
        print("Analyzing {0} with t={1}...".format(f, i))
        t = 0.05 * i
        ts = '{0:.2f}'.format(t).replace('0.', '')

        for m in methods:
            msf = 'f_' + m
            for cm in cluster_methods:
                ms = '{0}_{1}_{2}'.format('p', m, cm)
                msf = '{0}_{1}_{2}'.format('f', m, cm)
                msp = ms + '_' + ts

                lex.partial_cluster(method=m, cluster_method=cm, threshold=t, ref=msp)

                # get loose and strict cognate ids for this method
                lex.add_cognate_ids(msp, ms + '_strict_' + ts, 'strict')
                lex.add_cognate_ids(msp, ms + '_loose_' + ts, 'loose')

                # get the bcubes
                for mode in ['strict', 'loose']:
                    msm = ms + '_' + mode + '_' + ts
                    p, r, fs = bcubes(lex, mode + '_cogid', msm,
                                      pprint=False)
                    pprint_result(f, msm, ts, p, r, fs)
                    ccubes += [[msm, f, t, ts, p, r, fs]]
                p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
                pprint_result(f, msp, ts, p, r, fs)
                ccubes += [[msp, f, t, ts, p, r, fs]]
Example #8
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

# pick the dataset prefix from the command line ('all' selects A_Chen_)
if 'all' in argv:
    fname = 'A_Chen_'
else:
    fname = 'D_Chen_'

try:
    # load the cached scorer if it exists
    part = Partial(fname + 'partial.bin.tsv')
except Exception:
    # otherwise compute it from scratch (slow) and cache it
    part = Partial(fname + 'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output('tsv', filename=fname + 'partial.bin', ignore=[],
                prettify=False)
    print('[i] saved the scorer')
finally:
    part.partial_cluster(
            method='lexstat',
            threshold=0.55,
            ref='cogids',
            mode='global',
            gop=-2,
            cluster_method='infomap'
            )

part.output('tsv', filename=fname+'partial', prettify=False)
Example #9
    if 'loose_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    for i in range(1, 20):
        print("Analyzing {0} with t={1}...".format(f, i))
        t = 0.05 * i
        ts = '{0:.2f}'.format(t).replace('0.', '')

        for m in methods:
            msf = 'f_' + m
            for cm in cluster_methods:
                ms = '{0}_{1}_{2}'.format('p', m, cm)
                msf = '{0}_{1}_{2}'.format('f', m, cm)
                msp = ms + '_' + ts

                lex.partial_cluster(method=m,
                                    cluster_method=cm,
                                    threshold=t,
                                    ref=msp)

                # get loose and strict cognate ids for this method
                lex.add_cognate_ids(msp, ms + '_strict' + '_' + ts, 'strict')
                lex.add_cognate_ids(msp, ms + '_loose' + '_' + ts, 'loose')

                # get the bcubes
                for mode in ['strict', 'loose']:
                    msm = ms + '_' + mode + '_' + ts
                    p, r, fs = bcubes(lex, mode + '_cogid', msm, pprint=False)
                    pprint_result(f, msm, ts, p, r, fs)
                    ccubes += [[msm, f, t, ts, p, r, fs]]
                p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
                pprint_result(f, msp, ts, p, r, fs)
                ccubes += [[msp, f, t, ts, p, r, fs]]
Example #10
namespace = (('concept_name', 'concept'),
             ('language_id', 'doculect'),
             ('segments', 'tokens'),
             ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'),
             ('language_latitude', 'latitude'),
             ('language_longitude', 'longitude'),
             ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cognacy'))

# 'columns' is defined earlier in the original script (clipped here)
wl = Wordlist.from_cldf('../cldf/cldf-metadata.json',
                        columns=columns,
                        namespace=namespace)
D = {0: wl.columns}
for idx in wl:
    if wl[idx, 'tokens']:
        D[idx] = wl[idx]
part = Partial(D)
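# method='sca' clusters on sound-class alignments, so no LexStat scorer
# needs to be computed first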
part.partial_cluster(method='sca', threshold=0.45, ref='cogids')
alms = Alignments(part, ref='cogids')
alms.align()
alms.add_entries('note', 'form', lambda x: '')

#part.add_entries('cog', 'cognacy', lambda x: x)
#for idx in wl:
#    if wl[idx, 'cog'].strip():
#        wl[idx, 'cog'] += '-'+wl[idx, 'concept']
#    else:
#        wl[idx, 'cog'] += str(idx)
#
#wl.renumber('cog')

cogids2cogid(alms)