Example no. 1
from lingpy.compare.partial import Partial


def run(args):

    # `Dataset` is the dataset wrapper of the surrounding project (not shown
    # in this excerpt); `args` is the command-line namespace it passes in.
    ds = Dataset()

    # load the cached partial scorer if it exists, otherwise compute and save it
    try:
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_partial.bin.tsv').as_posix())
    except:
        part = Partial(ds.dir.joinpath('workflow',
                                       'D_Chen_subset.tsv').as_posix(),
                       segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output('tsv',
                    filename=ds.dir.joinpath('workflow',
                                             'D_Chen_partial.bin').as_posix(),
                    ignore=[],
                    prettify=False)
        args.log.info('[i] saved the scorer')
    finally:
        # partial cognate detection with the computed scorer
        part.partial_cluster(method='lexstat',
                             threshold=0.55,
                             ref='cogids',
                             mode='global',
                             gop=-2,
                             cluster_method='infomap')

    part.output('tsv',
                filename=ds.dir.joinpath('workflow',
                                         'D_Chen_partial').as_posix(),
                prettify=False)
Example no. 2
from lingpy import *
from lingpy.compare.partial import Partial

# load the cached scorer if it is already available, otherwise compute and save it
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except:
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

# add empty columns for later manual annotation
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
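
# A quick sanity check of the clustering above (a sketch, not part of the
# original example): print a few rows with their partial cognate ids, using
# only the row indexing that lingpy wordlists provide.
for idx in sorted(part)[:5]:
    print(part[idx, 'doculect'], part[idx, 'concept'], part[idx, 'cogids'])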
Example no. 3
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

# reuse the previously computed scorer if the scored file exists, otherwise
# compute it from scratch and save it
try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except:
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# we test several thresholds (from 0.2 to 0.7 in steps of 0.1)
for i in range(2, 8):
    lex.partial_cluster(method='lexstat',
                        cluster_method='infomap',
                        threshold=i * 0.1,
                        ref='t' + str(i))

    # B-cubed precision, recall, and F-score against the gold standard in 'cogids'
    p, r, fs = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, p, r, fs))
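
# A sketch for picking a single threshold from the sweep above (not part of
# the original script): compare the B-cubed F-scores of the columns t2-t7
# that the loop has just created.
scores = {}
for i in range(2, 8):
    p, r, fs = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    scores[i * 0.1] = fs
best = max(scores, key=scores.get)
print('best threshold: {0:.1f} (F-score {1:.2f})'.format(best, scores[best]))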
Example no. 5
def prepare(ds):
    # `parse_chars` and the `ds` dataset object come from the surrounding
    # project; Partial, Alignments, Wordlist, iter_rows, tokens2morphemes
    # (lingpy) and Concepticon (pyconcepticon) are imported elsewhere in the
    # original script.

    # steps:
    # parse characters (numbers, zeros)
    # check for numbers
    # recreate partial cognate identifiers
    # create strict cognate identifiers
    # encode everything as a CLDF-like file
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)
        if len(tokens) != len(benzi):
            print(doculect, glossid, benzi, tokens)
        pids = []
        for char in benzi:
            if char == '囗':
                pids += [str(pidx)]
                pidx += 1
            else:
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)

        # retrieve correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id, concept.attributes['chinese'],
            concept.attributes['pageno'], concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'
    # bind `i` at definition time so every column picks its own list index
    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        words.add_entries(entry, concepts, lambda x, i=i: x[i])
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])

    # determine order of entries
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))

    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')
    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')
    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'),
                     segments='segments',
                     ref='cogids',
                     alignment='alignments')
    alm.align()
    alm.output('tsv',
               filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept', 'concepticon_id',
                   'chinese', 'pinyin', 'benzi', 'benzi_in_source', 'value',
                   'form', 'segments', 'cogid', 'cogids', 'note', 'source',
                   'beida_id', 'page', 'order', 'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)
    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments', 'Order',
            'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin', 'value',
                    'form', 'segments', 'cogid', 'cogids', 'alignments',
                    'order', 'beida_id', 'page', 'benzi', 'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                if '"' in value:
                    value = value.replace('"', '""')
                if ',' in value:
                    value = '"' + value + '"'
                out += [value]
            f.write(','.join(out) + '\n')
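
# An alternative sketch of the CSV-writing step above (not part of the
# original code): Python's csv module handles the quoting of commas and
# double quotes automatically. It assumes the `words` wordlist built in
# prepare(); only the first few column names are spelled out here.
import csv

headers = ['ID', 'Language_name', 'Language_ID', 'Language_iso']
entries = ['doculect', 'glottolog', 'iso']

with open('cldf/beijingdaxue1964.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    for idx in words:
        row = [str(idx)]
        for entry in entries:
            value = words[idx, entry]
            row.append(' '.join(str(x) for x in value)
                       if isinstance(value, list) else str(value))
        writer.writerow(row)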
Example no. 6
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree
from sys import argv
from clldutils.text import strip_brackets, split_text
from collections import defaultdict
from lingpy import basictypes

# pick the input/output file prefix depending on whether 'all' was passed on
# the command line
if 'all' in argv:
    fname = '../output/A_Deepadung_'
else:
    fname = '../output/D_Deepadung_'

part = Partial(fname+'crossids.tsv')
part.add_cognate_ids('crossids', 'crossid', idtype='strict')
# combine the strict cross-id with the concept into a single label per word,
# then renumber the labels to obtain numeric cognate ids ('cogid')
part.add_entries('cog', 'crossid,concept', lambda x, y: str(x[y[0]]) + x[y[1]])
part.renumber('cog')

part.calculate('distance', ref='cogid')
part.calculate('tree', tree_calc='neighbor')

part.output('dst', filename=fname+'distance')
part.output('tre', filename=fname+'tree')

if 'plot' in argv:
    plot_tree(str(part.tree), degree=350, filename=fname+'tree')
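
# A quick look at the computed tree without plotting (a sketch, not part of
# the original script); asciiArt() is provided by the cogent-based tree class
# bundled with lingpy.
print(part.tree)             # the tree in Newick format
print(part.tree.asciiArt())  # a rough ASCII rendering of the same tree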


Example no. 7
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

# pick the input/output file prefix depending on whether 'all' was passed on
# the command line
if 'all' in argv:
    fname = 'A_Chen_'
else:
    fname = 'D_Chen_'

# load the cached partial scorer if it exists, otherwise compute and save it
try:
    part = Partial(fname+'partial.bin.tsv')
except:
    part = Partial(fname+'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output('tsv', filename=fname+'partial.bin', ignore=[], prettify=False)
    print('[i] saved the scorer')
finally:
    part.partial_cluster(
            method='lexstat',
            threshold=0.55,
            ref='cogids',
            mode='global',
            gop=-2,
            cluster_method='infomap'
            )

part.output('tsv', filename=fname+'partial', prettify=False)
Example no. 8

from lingpy.compare.partial import Partial

# `infiles` and `pcd_path` are defined in the surrounding study code and are
# not part of this excerpt
methods = ['sca', 'lexstat']
cluster_methods = ['infomap', 'mcl', 'upgma']
measures = ['partial', 'strict', 'loose']

for f in infiles:
    try:
        lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv'))
    except IOError:
        lex = Partial(pcd_path('data', f + '.tsv'))
        lex.get_scorer(
            preprocessing=False,
            runs=10000,
        )
        lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:]))

    # create new reference ids for cognates from the partial cognates
    if 'strict_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'strict_cogid', 'strict')
    if 'loose_cogid' not in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    for i in range(1, 20):
        t = 0.05 * i
        print("Analyzing {0} with t={1:.2f}...".format(f, t))
        ts = '{0:.2f}'.format(t).replace('0.', '')

        for m in methods:
            msf = 'f_' + m
            for cm in cluster_methods:
Example no. 9
import sys

from lingpy.compare.partial import Partial

# `wl` is the wordlist built earlier in the original script; sys.argv[3]
# provides the prefix for the output file names.

# create a numeric cognate-id column ('cogid') by renumbering the 'cog' column
wl.renumber('cog')
print('Saving full cognate file...')
wl.output('tsv',
          filename='{0}-{1}'.format(sys.argv[3], 'cognate'),
          prettify=False,
          ignore='all')

## get partial cognates
try:
    part = Partial('{0}-{1}-{2}.tsv'.format(sys.argv[3], 'partial', 'temp'),
                   segments='segments')
except:
    part = Partial(wl, segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv',
                filename='{0}-{1}-{2}'.format(sys.argv[3], 'partial', 'temp'))

## manually correct error in data
part.partial_cluster(method='lexstat',
                     cluster_method='infomap',
                     threshold=0.6,
                     ref='cogids')

part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
print('Saving partial cognate file')
part.output('tsv',
            filename='{0}-{1}-{2}'.format(sys.argv[3], 'partial', 'final'),
            ignore='all',
            prettify=False)
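
# After any manual corrections, strict cognate ids can be derived from the
# partial ones in the same way as in the examples above (a sketch, not part
# of the original script; it assumes the corrected file keeps the 'cogids'
# column, and the output name below is only illustrative).
part = Partial('{0}-{1}-{2}.tsv'.format(sys.argv[3], 'partial', 'final'),
               segments='segments')
part.add_cognate_ids('cogids', 'cogid', idtype='strict')
part.output('tsv',
            filename='{0}-{1}'.format(sys.argv[3], 'strict'),
            prettify=False)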