Ejemplo n.º 1
0
def run(args):
    """Cluster partial cognates for the ``D_Chen`` data and write the result.

    A ``Partial`` object is first loaded from ``D_Chen_partial.bin.tsv`` in
    the dataset's ``workflow`` directory.  If that fails, the object is built
    from ``D_Chen_subset.tsv`` instead, a partial scorer is computed, and the
    object is written out under the name ``D_Chen_partial.bin`` (presumably
    the file the first attempt looks for on the next run -- confirm that
    ``output`` appends the ``.tsv`` suffix).  Afterwards partial cognates are
    clustered into the ``cogids`` column and the wordlist is written out as
    ``D_Chen_partial``.

    :param args: command-line namespace; only ``args.log`` is used here.
    """
    ds = Dataset()

    def workflow_path(name):
        # All files live in the dataset's ``workflow`` directory and are
        # passed to lingpy as POSIX-style strings.
        return ds.dir.joinpath('workflow', name).as_posix()

    try:
        part = Partial(workflow_path('D_Chen_partial.bin.tsv'))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C or
        # ``SystemExit`` during loading aborts the run instead of silently
        # starting the expensive scorer computation below.
        part = Partial(workflow_path('D_Chen_subset.tsv'), segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output('tsv',
                    filename=workflow_path('D_Chen_partial.bin'),
                    ignore=[],
                    prettify=False)
        args.log.info('[i] saved the scorer')

    # Clustering is deliberately NOT in a ``finally`` clause: if the fallback
    # above fails, the original error must propagate instead of being masked
    # by a NameError on an unbound ``part`` (or by clustering a half-built
    # object).
    part.partial_cluster(method='lexstat',
                         threshold=0.55,
                         ref='cogids',
                         mode='global',
                         gop=-2,
                         cluster_method='infomap')

    part.output('tsv',
                filename=workflow_path('D_Chen_partial'),
                prettify=False)
Ejemplo n.º 2
0
def run(args):
    """Detect and align partial cognates for the dataset's CLDF wordlist.

    The wordlist is read from the dataset's ``cldf-metadata.json``, reduced
    to the rows that have a truthy ``tokens`` value, clustered with the
    lexstat method (results in ``cogids``), aligned, and written out under
    the name ``chin-aligned``.

    :param args: command-line namespace, passed on to ``Dataset``.
    """
    dataset = Dataset(args)
    metadata = dataset.cldf_specs().dir.joinpath('cldf-metadata.json')
    wordlist = Wordlist.from_cldf(str(metadata))

    # Key 0 carries the column names; every other key is a row index.
    rows = {0: list(wordlist.columns)}
    rows.update({
        idx: wordlist[idx]
        for idx in wordlist
        if wordlist[idx, 'tokens']
    })

    partial = Partial(rows, check=True)
    partial.get_partial_scorer(runs=10000)
    partial.partial_cluster(
        method='lexstat',
        threshold=0.45,
        ref="cogids",
        cluster_method='infomap',
    )

    alignments = Alignments(partial, ref='cogids', fuzzy=True)
    alignments.align()
    alignments.output('tsv', filename="chin-aligned")
Ejemplo n.º 3
0
class Tests(WithTempDir):
    """Tests for ``Partial`` and the ``_get_slices`` helper."""

    def setUp(self):
        """Load the two partial-cognate fixture wordlists."""
        WithTempDir.setUp(self)
        self.part = Partial(
            test_data('partial_cognates.tsv'),
            segments='segments',
            split_on_tones=True)
        self.part2 = Partial(
            test_data('partial_cognates-scored.tsv'),
            segments='segments')

    def test__get_slices(self):
        """The end of the first slice depends on ``split_on_tones``."""
        word = 'ba²te²'
        with_split = _get_slices(list(word), split_on_tones=True)
        without_split = _get_slices(list(word), split_on_tones=False)
        assert with_split[0][1] == 3
        assert without_split[0][1] == 6

    def test_get_partial_scorer(self):
        """Computing a scorer with a small number of runs must not fail."""
        self.part2.get_partial_scorer(runs=10)

    def test_get_partial_matrices(self):
        """Matrices hold numbers; infomap yields concept/tracer/matrix."""
        for cluster_method in ('upgma', 'single', 'complete', 'ward', 'mcl'):
            results = list(
                self.part._get_partial_matrices(
                    cluster_method=cluster_method, concept="bird"))
            first_matrix = results[0]
            assert isinstance(first_matrix[0][0], (float, int))

        # The infomap variant is only exercised when igraph is available.
        if not lingpy.algorithm.extra.igraph:
            return

        infomap_results = self.part._get_partial_matrices(
            cluster_method='infomap')
        for concept, tracer, matrix in infomap_results:
            assert isinstance(concept, text_type)
            assert [entry[0] for entry in tracer]

    def test_partial_cluster(self):
        """Run clustering with several method/cluster_method combinations."""
        # An unknown cluster method has to be rejected.
        assert_raises(
            ValueError, self.part.partial_cluster, cluster_method='upgmu')

        # Fall back to upgma when igraph (needed for infomap) is missing.
        if lingpy.algorithm.extra.igraph:
            graph_method = 'infomap'
        else:
            graph_method = 'upgma'
        self.part.partial_cluster(
            method='sca',
            threshold=0.45,
            split_on_tones=True,
            post_processing=False,
            cluster_method=graph_method,
            ref='parts1')

        self.part.partial_cluster(
            method='sca',
            threshold=0.45,
            post_processing=False,
            cluster_method='mcl',
            ref='parts2',
            split_on_tones=True)

        self.part.partial_cluster(
            method='sca',
            threshold=0.45,
            split_on_tones=True,
            post_processing=False,
            cluster_method='upgma',
            ref='parts3')

        self.part2.partial_cluster(
            method='lexstat',
            threshold=0.6,
            cluster_method='single',
            post_processing=True,
            imap_mode=False,
            split_on_tones=True,
            ref='parts4')

        # high threshold to trigger post-processing movement
        self.part.partial_cluster(
            method='sca',
            threshold=0.9,
            split_on_tones=True,
            cluster_method='single',
            post_processing=True,
            imap_mode=False,
            ref='parts5')

        # Rows 9 and 10 must share their first 'parts3' id.
        first_of_9 = self.part[9, 'parts3'][0]
        first_of_10 = self.part[10, 'parts3'][0]
        assert first_of_9 == first_of_10

        # Rows 8 and 10 of the second wordlist must share their second
        # 'parts4' id.
        second_of_8 = self.part2[8, 'parts4'][1]
        second_of_10 = self.part2[10, 'parts4'][1]
        assert second_of_8 == second_of_10

    def test_add_cognate_ids(self):
        """Derive cognate ids from partial ids; reject unknown id types."""
        self.part.partial_cluster(
            method='sca',
            threshold=0.45,
            split_on_tones=True,
            cluster_method='upgma',
            ref='parts3')

        for target, idtype in (('cogs1', 'strict'), ('cogs2', 'loose')):
            self.part.add_cognate_ids('parts3', target, idtype=idtype)

        strict_9 = self.part[9, 'cogs1']
        strict_10 = self.part[10, 'cogs1']
        assert strict_9 == strict_10

        assert_raises(
            ValueError,
            self.part.add_cognate_ids,
            'parts3',
            'cogs1',
            idtype='dummy')
Ejemplo n.º 4
0
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

# Choose the file-name prefix: 'A_Chen_' when 'all' is given on the command
# line, 'D_Chen_' otherwise.
fname = 'A_Chen_' if 'all' in argv else 'D_Chen_'

try:
    # First try to load a Partial object from '<prefix>partial.bin.tsv'.
    part = Partial(fname + 'partial.bin.tsv')
except Exception:
    # ``Exception`` rather than a bare ``except`` so that Ctrl-C or
    # ``SystemExit`` during loading aborts the script instead of silently
    # starting the expensive scorer computation below.
    part = Partial(fname + 'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output('tsv', filename=fname + 'partial.bin', ignore=[],
                prettify=False)
    print('[i] saved the scorer')

# Clustering is deliberately NOT in a ``finally`` clause: if the fallback
# above fails, the original error must propagate instead of being masked by a
# NameError on an unbound ``part`` (or by clustering a half-built object).
part.partial_cluster(
    method='lexstat',
    threshold=0.55,
    ref='cogids',
    mode='global',
    gop=-2,
    cluster_method='infomap',
)

part.output('tsv', filename=fname + 'partial', prettify=False)