Example #1
 def setUp(self):
     WithTempDir.setUp(self)
     self.part = Partial(test_data('partial_cognates.tsv'),
                         segments='segments',
                         split_on_tones=True)
     self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                          segments='segments')
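These setUp snippets assume lingpy's test scaffolding; a minimal sketch of
the imports they rely on (module paths as in the lingpy 2.x test suite,
where WithTempDir presumably comes from clldutils):

from clldutils.testing import WithTempDir
from lingpy import LexStat
from lingpy.compare.partial import Partial
from lingpy.tests.util import test_data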
Example #2
 def setUp(self):
     WithTempDir.setUp(self)
     self.lex = LexStat(test_data('KSL.qlc'))
     self.part = Partial(test_data('partial_cognates.tsv'),
                         segments='segments')
     self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
     self.part.add_entries('pid2', 'partialids2',
                           lambda x: [int(y) for y in x.split(' ')])
Example #3
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes

        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        from lingpy.evaluate.acd import partial_bcubes
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        from lingpy.evaluate.acd import pairs

        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        from lingpy.evaluate.acd import diff

        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)
        _ = diff(self.lex,
                 gold='cogid',
                 test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False)
        d2 = diff(self.lex,
                  gold='cugid',
                  test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False,
                  tofile=False)
        _ = diff(self.lex,
                 gold='cugid',
                 test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False,
                 tofile=True)
        assert d2[0] != 1
Example #4
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)

        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False, tofile=True)

        assert d2[0] != 1

    def test_random_cognates(self):
        random_cognates(self.lex, ref='randomid')
        assert 'randomid' in self.lex.header

    def test_extreme_cognates(self):
        extreme_cognates(self.lex, ref="lumperid", bias='lumper')
        assert self.lex[1, 'lumperid'] == self.lex[2, 'lumperid']
        extreme_cognates(self.lex, ref='splitterid', bias='splitter')
        assert self.lex[1, 'splitterid'] != self.lex[2, 'splitterid']
        assert_raises(ValueError, extreme_cognates, self.lex, bias='')
Example #5
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)

        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False, tofile=True)

        assert d2[0] != 1

    def test_random_cognates(self):
        random_cognates(self.lex, ref='randomid')
        assert 'randomid' in self.lex.header

    def test_extreme_cognates(self):
        extreme_cognates(self.lex, ref="lumperid", bias='lumper')
        assert self.lex[1, 'lumperid'] == self.lex[2, 'lumperid']
        extreme_cognates(self.lex, ref='splitterid', bias='splitter')
        assert self.lex[1, 'splitterid'] != self.lex[2, 'splitterid']
        assert_raises(ValueError, extreme_cognates, self.lex, bias='')
Example #6
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes

        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True,
                   per_concept=True)

    def test_partial_bcubes(self):
        from lingpy.evaluate.acd import partial_bcubes
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        from lingpy.evaluate.acd import pairs

        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        from lingpy.evaluate.acd import diff

        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)
        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False, tofile=True)
        assert d2[0] != 1
Example #7
 def setUp(self):
     WithTempDir.setUp(self)
     self.lex = LexStat(test_data('KSL.qlc'))
     self.part = Partial(test_data('partial_cognates.tsv'),
                         segments='segments')
     self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
     self.part.add_entries('pid2', 'partialids2',
                           lambda x: [int(y) for y in x.split(' ')])
Example #8
def calculate_scores(*languages):
    pair_of_languages = {
        0: ["backreference", "doculect", "concept", "ipa", "tokens"]
    }
    for l in languages:
        pair_of_languages.update({
            i + len(pair_of_languages): [
                form.id, form.language_id, form.concept_id, form.transcription,
                form.soundclasses.split()
            ]
            for i, form in enumerate(
                session.query(Form).filter_by(language=l))
        })

    lex = Partial(pair_of_languages,
                  model=lingpy.data.model.Model("asjp"),
                  check=True,
                  apply_checks=True)
    lex.get_scorer(runs=10000, ratio=(3, 1), threshold=0.7)

    # This does not yet generalize beyond two languages
    session.add(
        Scorer(language1=languages[0],
               language2=languages[1],
               scorer=scorer2str(lex.bscorer)))

    for concept, forms, matrix in lex._get_matrices(method='lexstat',
                                                    scale=0.5,
                                                    factor=0.3,
                                                    restricted_chars="_T",
                                                    mode="overlap",
                                                    gop=-2,
                                                    restriction=""):
        for (i1, f1), (i2, f2) in itertools.combinations(enumerate(forms), 2):
            f1 = lex[f1][0]  # index 0 contains the 'backreference', i.e. our ID
            f2 = lex[f2][0]  # index 0 contains the 'backreference', i.e. our ID
            session.add(
                Similarity(form1_id=f1, form2_id=f2, score=matrix[i1][i2]))

    session.commit()
Example #9
def run(args):
    ds = Dataset(args)
    wl = Wordlist.from_cldf(
        str(ds.cldf_specs().dir.joinpath('cldf-metadata.json')))
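    # row 0 of a wordlist dict holds the header; the loop below keeps only
    # rows that actually have segmented tokens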
    D = {0: [x for x in wl.columns]}
    for idx in wl:
        if wl[idx, 'tokens']:
            D[idx] = wl[idx]
    part = Partial(D, check=True)
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat',
                         threshold=0.45,
                         ref="cogids",
                         cluster_method='infomap')
    alms = Alignments(part, ref='cogids', fuzzy=True)
    alms.align()
    alms.output('tsv', filename="chin-aligned")
Example #10
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                             segments='segments')

    def test__get_slices(self):
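        # by default 'ba²te²' is split into tone-delimited morphemes (the
        # first slice ends at index 3); with split_on_tones=False the word
        # stays one slice of all 6 segments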
        a = _get_slices(list('ba²te²'))
        b = _get_slices(list('ba²te²'), split_on_tones=False)
        assert a[0][1] == 3
        assert b[0][1] == 6

    def test_get_partial_matrices(self):
        for method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            matrix = list(
                self.part._get_partial_matrices(cluster_method=method,
                                                concept="bird"))[0]
            assert isinstance(matrix[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [x[0] for x in tracer]

    def test_partial_cluster(self):
        assert_raises(ValueError,
                      self.part.partial_cluster,
                      cluster_method='upgmu')
        cluster_method = ('infomap' if lingpy.algorithm.extra.igraph
                          else 'upgma')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method=cluster_method,
                                  ref='parts1')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='mcl',
                                  ref='parts2')
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='upgma',
                                  ref='parts3')

        self.part2.partial_cluster(method='lexstat',
                                   threshold=0.6,
                                   cluster_method='single',
                                   post_processing=True,
                                   imap_mode=False,
                                   ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(method='sca',
                                  threshold=0.9,
                                  cluster_method='single',
                                  post_processing=True,
                                  imap_mode=False,
                                  ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(method='sca',
                                  threshold=0.45,
                                  cluster_method='upgma',
                                  ref='parts3')
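        # 'strict' assigns cognate IDs from the full sequence of partial IDs;
        # 'loose' links words that share any partial cognate ID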
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        assert_raises(ValueError,
                      self.part.add_cognate_ids,
                      'parts3',
                      'cogs1',
                      idtype='dummy')
Example #11
        for sampa in sampas:
            sampa = sampa.strip()
            for s, t in reps:
                sampa = sampa.replace(s, t)
            print(sampa)
            if sampa:
                ipa = sampa2uni(sampa)
                ipa = ipa.replace(' ', '_')
                tks = ipa2tokens(ipa, merge_vowels=False, semi_diacritics='')
                D[idx] = [concept, language, sampa_o, sampa, ipa, tks]
                idx += 1

wl = Wordlist(D)
wl.output('tsv', filename='an-data-wordlist', prettify=False, ignore='all')
lex = LexStat(wl)
print(lex.height, lex.width)
#lex.cluster(method=_method,cluster_method=_cluster_method, threshold=0.45)
lex.cluster(method='sca', threshold=0.45)
alm = Alignments(lex, ref='scaid')
alm.align()
alm.output('tsv', filename='an-data-aligned', prettify=False, ignore='all')
alm.output('html', filename='result')

lex = Partial('an-data-aligned.tsv')
lex.get_scorer(preprocessing=False, runs=10000)
# _method and _cluster_method are defined earlier in the full script
lex.cluster(method=_method, cluster_method=_cluster_method, threshold=0.45)
lex.calculate('tree', ref='scaid')
print(lex.tree)
print('</body>')
print('</html>')
Example #12
    7334: 31173,
    7336: 31174,
    7133: 31074,
    7131: 31073,
}
matcher.update(explicit)
blacklist = []
for idx in ob2:
    if idx not in matcher.values() and idx not in unmatched:
        blacklist += [idx]

# now that we have all relevant data, we need to compare the cognate sets
# print(max([int(stdb[idx, 'cogid']) for idx in stdb]))

# cogid range should be 7000+
part = Partial(wl)
part.add_cognate_ids('cogids', 'strictid', idtype='strict')

# compute a matcher of cognate ids
burm2stdb = {}
ncid = 8000
for idx in part:
    nidx = matcher.get(idx)
    tid = part[idx, 'strictid']
    if nidx and nidx not in burm2stdb:
        oid = stdb[nidx, 'cogid']
        burm2stdb[tid] = oid
    else:
        if tid in burm2stdb:
            pass
        else:
Example #13
 def setUp(self):
     WithTempDir.setUp(self)
     self.part = Partial(test_data('partial_cognates.tsv'),
                         segments='segments')
     self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                          segments='segments')
Example #14
class Tests(WithTempDir):

    def setUp(self):
        WithTempDir.setUp(self)
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                             segments='segments')

    def test_get_partial_matrices(self):
        for method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            matrix = list(
                self.part._get_partial_matrices(cluster_method=method,
                                                concept="bird"))[0]
            assert isinstance(matrix[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [x[0] for x in tracer]

    def test_partial_cluster(self):
        assert_raises(ValueError, self.part.partial_cluster,
                      cluster_method='upgmu')
        cluster_method = ('infomap' if lingpy.algorithm.extra.igraph
                          else 'upgma')
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method=cluster_method,
            ref='parts1')
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='mcl',
            ref='parts2')
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='upgma',
            ref='parts3')

        self.part2.partial_cluster(
            method='lexstat', threshold=0.6, cluster_method='single',
            post_processing=True, imap_mode=False, ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(
            method='sca', threshold=0.9, cluster_method='single',
            post_processing=True, imap_mode=False, ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='upgma',
            ref='parts3')
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        assert_raises(ValueError, self.part.add_cognate_ids,
                      'parts3', 'cogs1', idtype='dummy')
Example #15
from lingpy import *
from lingpy.compare.partial import Partial

# load the cached wordlist with precomputed scores if it exists; otherwise
# compute the scorer (expensive at 10000 runs) and cache it
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except IOError:
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

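# add empty 'note' and 'morphemes' columns for later manual annotation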
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
Example #16
def prepare(ds):
    # steps:
    # parse characters (numbers, zeros)
    # check for number
    # recreate partial cognate identifiers
    # create strict cognate identifiers
    # code everything as a CLDF-like file
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)
        if len(tokens) != len(benzi):
            print(doculect, glossid, benzi, tokens)
        pids = []
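        # one partial cognate ID per character: the placeholder 囗 (unknown
        # character) always gets a fresh ID, known characters reuse the ID
        # of their first occurrence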
        for char in benzi:
            if char == '囗':
                pids += [str(pidx)]
                pidx += 1
            else:
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)

        # retrieve correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id, concept.attributes['chinese'],
            concept.attributes['pageno'], concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'
    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        # bind i per iteration; a plain lambda would capture only the last i
        words.add_entries(entry, concepts, lambda x, i=i: x[i])
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])

    # determine order of entries
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))

    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')
    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')
    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'),
                     segments='segments',
                     ref='cogids',
                     alignment='alignments')
    alm.align()
    alm.output('tsv',
               filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept', 'concepticon_id',
                   'chinese', 'pinyin', 'benzi', 'benzi_in_source', 'value',
                   'form', 'segments', 'cogid', 'cogids', 'note', 'source',
                   'beida_id', 'page', 'order', 'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)
    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments', 'Order',
            'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin', 'value',
                    'form', 'segments', 'cogid', 'cogids', 'alignments',
                    'order', 'beida_id', 'page', 'benzi', 'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                if '"' in value:
                    value = value.replace('"', '""')
                if ',' in value:
                    value = '"' + value + '"'
                out += [value]
            f.write(','.join(out) + '\n')
Example #17
def run(args):
    ds = Dataset()

    try:
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_partial.bin.tsv').as_posix())
    except IOError:
        part = Partial(ds.dir.joinpath('workflow',
                                       'D_Chen_subset.tsv').as_posix(),
                       segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output('tsv',
                    filename=ds.dir.joinpath('workflow',
                                             'D_Chen_partial.bin').as_posix(),
                    ignore=[],
                    prettify=False)
        args.log.info('[i] saved the scorer')
    finally:
        part.partial_cluster(method='lexstat',
                             threshold=0.55,
                             ref='cogids',
                             mode='global',
                             gop=-2,
                             cluster_method='infomap')

    part.output('tsv',
                filename=ds.dir.joinpath('workflow',
                                         'D_Chen_partial').as_posix(),
                prettify=False)
Example #18
ccubes = []


def pprint_result(f, mode, t, p, r, fs):
    print('{0:15}   {1:30}   {2}   {3:.2f}   {4:.2f}   {5:.2f}'.format(
        f, mode, t, p, r, fs))


methods = ['sca', 'lexstat']
cluster_methods = ['infomap', 'mcl', 'upgma']
measures = ['partial', 'strict', 'loose']

for f in infiles:
    try:
        lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv'))
    except IOError:
        lex = Partial(pcd_path('data', f + '.tsv'))
        lex.get_scorer(
            preprocessing=False,
            runs=10000,
        )
        lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:]))

    # create new reference ids for cognates from partial cognates
    if 'strict_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'strict_cogid', 'strict')
    if 'loose_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    for i in range(1, 20):
Example #19
def part(test_data):
    p = Partial(str(test_data / 'partial_cognates.tsv'), segments='segments')
    p.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
    p.add_entries('pid2', 'partialids2',
                  lambda x: [int(y) for y in x.split(' ')])
    return p
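A hypothetical pytest test consuming this fixture; the expected scores are
the ones asserted in the test examples above:

def test_partial_bcubes(part):
    from lingpy.evaluate.acd import partial_bcubes

    # B-cubed precision, recall and F-score of 'pid2' against the gold 'pid1'
    res = partial_bcubes(part, 'pid1', 'pid2', pprint=False)
    assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]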
Example #20
        language, concept, concept_id, glottolog, words,
        t(ipa),
        t(ipa, 'IPA'), cognacy
    ]
wl = Wordlist(D)
# renumber the 'cog' column, adding a numeric 'cogid' column
wl.renumber('cog')
print('Saving full cognate file...')
wl.output('tsv',
          filename='{0}-{1}'.format(sys.argv[3], 'cognate'),
          prettify=False,
          ignore='all')

## get partial cognates
try:
    part = Partial('{0}-{1}-{2}.tsv'.format(sys.argv[3], 'partial', 'temp'),
                   segments='segments')
except IOError:
    part = Partial(wl, segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv',
                filename='{0}-{1}-{2}'.format(sys.argv[3], 'partial', 'temp'))

## manually correct error in data
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
print('Saving partial cognate file')
Example #21
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

if 'all' in argv:
    fname = 'A_Chen_'
else:
    fname = 'D_Chen_'

try:
    part = Partial(fname+'partial.bin.tsv')
except IOError:
    part = Partial(fname+'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output('tsv', filename=fname+'partial.bin', ignore=[], prettify=False)
    print('[i] saved the scorer')
finally:
    part.partial_cluster(method='lexstat',
                         threshold=0.55,
                         ref='cogids',
                         mode='global',
                         gop=-2,
                         cluster_method='infomap')

part.output('tsv', filename=fname+'partial', prettify=False)
Example #22
wl = Wordlist.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'))
for idx, tokens in wl.iter_rows('tokens'):
    # sanity check: report rows whose tokens contain an empty morpheme
    for segment in tokens.n:
        if not segment:
            print(idx, tokens)

from lingpy.compare.partial import Partial

columns = ('concept_name', 'language_id', 'value', 'form', 'segments',
           'language_glottocode', 'cogid_cognateset_id')
namespace = (('concept_name', 'concept'), ('language_id', 'doculect'),
             ('segments', 'tokens'), ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'), ('language_latitude',
                                                         'latitude'),
             ('language_longitude', 'longitude'), ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cogid'))

var = Dataset().cldf_dir.joinpath('cldf-metadata.json')
part = Partial.from_cldf(var)
part.get_partial_scorer(runs=100)  # use 100 or 1000 runs when debugging
part.partial_cluster(method='lexstat',
                     threshold=0.5,
                     ref='cogids',
                     cluster_method='infomap')
alms = Alignments(part, ref='cogids')
alms.align()
alms.output('tsv', filename='deepadung-wordlist', ignore='all', prettify=False)
Example #23
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree
from sys import argv
from clldutils.text import strip_brackets, split_text
from collections import defaultdict
from lingpy import basictypes

if 'all' in argv:
    fname = '../output/A_Deepadung_'
else:
    fname = '../output/D_Deepadung_'

part = Partial(fname+'crossids.tsv')
part.add_cognate_ids('crossids', 'crossid', idtype='strict')
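# build a joint label from crossid and concept, then convert it into
# numeric cognate IDs via renumber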
part.add_entries('cog', 'crossid,concept', lambda x, y: str(x[y[0]])+x[y[1]])
part.renumber('cog')

part.calculate('distance', ref='cogid')
part.calculate('tree', tree_calc='neighbor')

part.output('dst', filename=fname+'distance')
part.output('tre', filename=fname+'tree')

if 'plot' in argv:
    plot_tree(str(part.tree), degree=350, filename=fname+'tree')


Example #24
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except IOError:
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# we test several thresholds
for i in range(2, 8):
    lex.partial_cluster(method='lexstat',
                        cluster_method='infomap',
                        threshold=i * 0.1,
                        ref='t' + str(i))

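    # partial_bcubes returns B-cubed precision, recall and F-score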
    a, b, c = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, a, b, c))
Example #25
def part2(test_data):
    return Partial(str(test_data / 'partial_cognates-scored.tsv'),
                   segments='segments')
Example #26
from lexibank_deepadungpalaung import Dataset
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import bcubes

columns = ('concept_name', 'language_id', 'value', 'form', 'segments',
           'language_glottocode', 'cogid_cognateset_id')
namespace = (('concept_name', 'concept'), ('language_id', 'doculect'),
             ('segments', 'tokens'), ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'),
             ('language_latitude', 'latitude'),
             ('language_longitude', 'longitude'), ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cog'))

part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'),
                         columns=columns, namespace=namespace)

part.renumber('cog')


method = input('method: ')

# type 'cogid' or 'cog' for method to see a tree based on Deepadung et al.'s
# cognate judgements

if method == 'lexstatcogids':
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', ref="lexstatcogids", threshold=0.55)
elif method == 'lexstatcogid':
    part.get_scorer(runs=10000)
    part.cluster(method='lexstat', ref="lexstatcogid", threshold=0.55)
Example #27
def part(test_data):
    return Partial(str(test_data / 'partial_cognates.tsv'),
                   segments='segments',
                   split_on_tones=True)
Example #28
from lingpy.convert.strings import write_nexus
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree

# Load the necessary data
part = Partial.from_cldf('cldf/cldf-metadata.json')

# Compute cognate sets according to SCA and calculate the distance matrix
part.partial_cluster(method='sca',
                     threshold=0.45,
                     ref="cogids",
                     cluster_method="upgma")
part.add_cognate_ids('cogids', 'cogid', idtype='strict')
part.calculate('tree', ref='cogid', tree_calc='upgma')
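# export the distances as a SplitsTree-readable nexus file and as a plain
# distance matrix ('dst')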
out = write_nexus(part, mode='splitstree', filename='distance_matrix.nex')
part.output('dst', filename='distance_matrix')
plot_tree(str(part.tree))
print(part.tree.asciiArt())

# Compute cognate sets according to LexStat and calculate the distance matrix
# part.get_partial_scorer(runs=1000)
# part.partial_cluster(method='lexstat', threshold=0.55, cluster_method='upgma', ref="lexstatids")
# part.add_cognate_ids('lexstatids', 'lexstatid', idtype='strict')
# part.calculate('tree', ref='lexstatid', tree_calc='upgma', force=True)
# part.output('dst', filename='distance_matrix')
# plot_tree(str(part.tree))
# print(part.tree.asciiArt())
Example #29
from lingpy.compare.partial import Partial
from lingpy.align.sca import Alignments
from lexibank_chingelong import Dataset

# Load data
#part = Partial.from_cldf('cldf/cldf-metadata.json')
part = Partial.from_cldf(Dataset().cldf_dir.joinpath('cldf-metadata.json'))

# Compute cognate sets according to SCA (appended to the column cogids)
part.partial_cluster(threshold=0.45, ref="cogids", cluster_method="upgma")

# Compute cognate sets according to LexStat (appended to the column lexstatids)
part.get_partial_scorer(runs=1000)
part.partial_cluster(method='lexstat',
                     threshold=0.55,
                     cluster_method='upgma',
                     ref="lexstatids")

# Align the partial cognates
alms = Alignments(part, ref='cogids')
alms.align()

# Write the data to a file
alms.output('tsv', filename='alignments', ignore='all', prettify=False)
Example #30
           'segments', 'cogid_cognateset_id')
namespace = (('concept_name', 'concept'), ('language_id', 'doculect'),
             ('segments', 'tokens'), ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'), ('language_latitude',
                                                         'latitude'),
             ('language_longitude', 'longitude'), ('cognacy', 'cognacy'),
             ('cogid_cognateset_id', 'cognacy'))

wl = Wordlist.from_cldf('../cldf/cldf-metadata.json',
                        columns=columns,
                        namespace=namespace)
D = {0: wl.columns}
for idx in wl:
    if wl[idx, 'tokens']:
        D[idx] = wl[idx]
part = Partial(D)
part.partial_cluster(method='sca', threshold=0.45, ref='cogids')
alms = Alignments(part, ref='cogids')
alms.align()
alms.add_entries('note', 'form', lambda x: '')

#part.add_entries('cog', 'cognacy', lambda x: x)
#for idx in wl:
#    if wl[idx, 'cog'].strip():
#        wl[idx, 'cog'] += '-'+wl[idx, 'concept']
#    else:
#        wl[idx, 'cog'] += str(idx)
#
#wl.renumber('cog')

cogids2cogid(alms)