def run(args):
    """Compute partial cognates for the D_Chen workflow subset.

    Reuses a cached, already-scored wordlist when present; otherwise builds
    the partial scorer from scratch (expensive, runs=10000) and caches it.
    Either way the partial clustering step runs and the result is written.
    """
    ds = Dataset()
    try:
        # fast path: reuse the cached scorer file
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_partial.bin.tsv').as_posix())
    except Exception:
        # Was a bare `except:` (which even swallowed KeyboardInterrupt).
        # A missing cache file is the expected trigger for this fallback.
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_subset.tsv').as_posix(),
            segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output(
            'tsv',
            filename=ds.dir.joinpath('workflow', 'D_Chen_partial.bin').as_posix(),
            ignore=[], prettify=False)
        args.log.info('[i] saved the scorer')
    # Originally a `finally:` block, which also ran when the rebuild above
    # failed and then masked the real error with a NameError on `part`.
    # Running this after the try/except is identical on all success paths.
    part.partial_cluster(method='lexstat', threshold=0.55, ref='cogids',
                         mode='global', gop=-2, cluster_method='infomap')
    part.output(
        'tsv',
        filename=ds.dir.joinpath('workflow', 'D_Chen_partial').as_posix(),
        prettify=False)
def setUp(self):
    """Create the two Partial fixtures shared by the tests."""
    WithTempDir.setUp(self)
    # wordlist with raw partial cognates, splitting morphemes on tone marks
    self.part = Partial(test_data('partial_cognates.tsv'),
                        segments='segments',
                        split_on_tones=True)
    # the same data, but with a precomputed scorer attached
    self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                         segments='segments')
def setUp(self):
    """Build LexStat and Partial fixtures plus two derived id columns."""
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.part = Partial(test_data('partial_cognates.tsv'),
                        segments='segments')
    # 'pid1' mirrors the raw partial cognate sets verbatim
    self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
    # 'pid2' parses the space-separated id string into a list of ints
    self.part.add_entries('pid2', 'partialids2',
                          lambda x: list(map(int, x.split(' '))))
def run(args):
    """Cluster partial cognates for the CLDF dataset and write alignments."""
    ds = Dataset(args)
    wl = Wordlist.from_cldf(
        str(ds.cldf_specs().dir.joinpath('cldf-metadata.json')))
    # row 0 is the header; keep only entries that actually carry tokens
    wordlist_data = {0: list(wl.columns)}
    wordlist_data.update(
        (idx, wl[idx]) for idx in wl if wl[idx, 'tokens'])
    part = Partial(wordlist_data, check=True)
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(method='lexstat', threshold=0.45, ref="cogids",
                         cluster_method='infomap')
    # align the detected partial cognate sets and export them
    alms = Alignments(part, ref='cogids', fuzzy=True)
    alms.align()
    alms.output('tsv', filename="chin-aligned")
def calculate_scores(*languages):
    """Score all form pairs across *languages* and persist the results.

    Builds a LingPy wordlist dict from the ORM ``Form`` rows of the given
    languages, trains a LexStat scorer on it, stores the scorer (as a
    ``Scorer`` row) and every pairwise similarity (as ``Similarity`` rows)
    in the module-level ``session``.

    NOTE(review): relies on module-level names (``session``, ``Form``,
    ``Scorer``, ``Similarity``, ``scorer2str``) defined outside this block.
    """
    # Row 0 declares the wordlist header; "backreference" carries our DB id.
    pair_of_languages = {
        0: ["backreference", "doculect", "concept", "ipa", "tokens"]
    }
    for l in languages:
        # len(pair_of_languages) is constant during each comprehension
        # (update() is applied only afterwards), so ids never collide.
        pair_of_languages.update({
            i + len(pair_of_languages): [
                form.id, form.language_id, form.concept_id,
                form.transcription, form.soundclasses.split()
            ]
            for i, form in enumerate(
                session.query(Form).filter_by(language=l))
        })
    lex = Partial(pair_of_languages,
                  model=lingpy.data.model.Model("asjp"),
                  check=True, apply_checks=True)
    lex.get_scorer(runs=10000, ratio=(3, 1), threshold=0.7)
    # This does not generalize to non-two languages yet
    session.add(
        Scorer(language1=languages[0], language2=languages[1],
               scorer=scorer2str(lex.bscorer)))
    # NOTE(review): _get_matrices is a private lingpy API — confirm it is
    # stable across the pinned lingpy version.
    for concept, forms, matrix in lex._get_matrices(method='lexstat',
                                                    scale=0.5,
                                                    factor=0.3,
                                                    restricted_chars="_T",
                                                    mode="overlap",
                                                    gop=-2,
                                                    restriction=""):
        for (i1, f1), (i2, f2) in itertools.combinations(
                enumerate(forms), 2):
            f1 = lex[f1][0]  # Index 0 contains the 'backref', ie. our ID
            f2 = lex[f2][0]  # Index 0 contains the 'backref', ie. our ID
            session.add(
                Similarity(form1_id=f1, form2_id=f2,
                           score=matrix[i1][i2]))
    session.commit()
# NOTE(review): fragment — the first line below closes a `columns = (...`
# tuple whose opening lies outside the visible chunk.
    'segments', 'cogid_cognateset_id')
# Map CLDF column names onto classical LingPy wordlist headers.
namespace = (('concept_name', 'concept'),
             ('language_id', 'doculect'),
             ('segments', 'tokens'),
             ('language_glottocode', 'glottolog'),
             ('concept_concepticon_id', 'concepticon'),
             ('language_latitude', 'latitude'),
             ('language_longitude', 'longitude'),
             ('cognacy', 'cognacy'),
             # NOTE(review): this second mapping also targets 'cognacy',
             # shadowing the previous pair — confirm this is intended.
             ('cogid_cognateset_id', 'cognacy'))
wl = Wordlist.from_cldf('../cldf/cldf-metadata.json',
                        columns=columns,
                        namespace=namespace)
# Header row plus every entry that actually has tokens.
D = {0: wl.columns}
for idx in wl:
    if wl[idx, 'tokens']:
        D[idx] = wl[idx]
part = Partial(D)
# Partial cognate detection with SCA distances.
part.partial_cluster(method='sca', threshold=0.45, ref='cogids')
alms = Alignments(part, ref='cogids')
alms.align()
# Add an empty 'note' column alongside each form.
alms.add_entries('note', 'form', lambda x: '')
#part.add_entries('cog', 'cognacy', lambda x: x)
#for idx in wl:
#    if wl[idx, 'cog'].strip():
#        wl[idx, 'cog'] += '-'+wl[idx, 'concept']
#    else:
#        wl[idx, 'cog'] += str(idx)
#
#wl.renumber('cog')
cogids2cogid(alms)
def part(test_data):
    """Fixture: Partial wordlist with two derived partial-id columns."""
    partial = Partial(str(test_data / 'partial_cognates.tsv'),
                      segments='segments')
    # 'pid1' mirrors the raw partial cognate sets verbatim
    partial.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
    # 'pid2' holds the space-separated ids parsed as integers
    partial.add_entries('pid2', 'partialids2',
                        lambda x: [int(token) for token in x.split(' ')])
    return partial
# NOTE(review): fragment — the mapping below is the tail of an
# `explicit = {...}` dict whose opening lies outside the visible chunk.
    7334: 31173,
    7336: 31174,
    7133: 31074,
    7131: 31073,
}
matcher.update(explicit)
blacklist = []
for idx in ob2:
    # anything neither matched nor explicitly unmatched gets blacklisted
    if idx not in matcher.values() and idx not in unmatched:
        blacklist += [idx]
# now that we have all relevant data, we need to compare the cognate sets
# print(max([int(stdb[idx, 'cogid']) for idx in stdb]))
# cogid range should be 7000+
part = Partial(wl)
part.add_cognate_ids('cogids', 'strictid', idtype='strict')
# compute a matcher of cognate ids
burm2stdb = {}
ncid = 8000
for idx in part:
    nidx = matcher.get(idx)
    tid = part[idx, 'strictid']
    # NOTE(review): `nidx not in burm2stdb` tests a matcher id against a
    # dict keyed by strict ids (`tid`) — possibly intended
    # `tid not in burm2stdb`; confirm against the full script.
    if nidx and nidx not in burm2stdb:
        oid = stdb[nidx, 'cogid']
        burm2stdb[tid] = oid
    else:
        if tid in burm2stdb:
            pass
        else:
# NOTE(review): source truncates here (dangling `else:`).
from lingpy import *
from lingpy.compare.partial import Partial

# Load the cached, already-scored wordlist if it exists; otherwise compute
# the scorer from scratch (runs=10000 is slow) and save it for next time.
try:
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except IOError:
    # Was a bare `except:` (which even swallowed KeyboardInterrupt); only a
    # missing/unreadable cache file should trigger the rebuild.
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data
part.partial_cluster(method='lexstat', cluster_method='infomap',
                     threshold=0.6, ref='cogids')
# empty annotation columns for later manual editing
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

# Reuse the cached scorer if present; otherwise compute and save it.
try:
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except IOError:
    # Was a bare `except:` (which even swallowed KeyboardInterrupt); only a
    # missing cache file should trigger the expensive rebuild.
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# we test several thresholds (0.2 .. 0.7) and print B-cubed P/R/F scores
for i in range(2, 8):
    lex.partial_cluster(method='lexstat', cluster_method='infomap',
                        threshold=i * 0.1, ref='t' + str(i))
    a, b, c = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, a, b, c))
# NOTE(review): fragment — `sampas`, `reps`, `D`, `idx`, `concept`,
# `language`, `sampa_o`, `_method` and `_cluster_method` are defined
# outside the visible chunk.
for sampa in sampas:
    sampa = sampa.strip()
    # apply the ad-hoc replacement table before conversion
    for s, t in reps:
        sampa = sampa.replace(s, t)
    print(sampa)
    if sampa:
        # X-SAMPA -> IPA, spaces joined by underscores, then tokenized
        ipa = sampa2uni(sampa)
        ipa = ipa.replace(' ', '_')
        tks = ipa2tokens(ipa, merge_vowels=False, semi_diacritics='')
        D[idx] = [concept, language, sampa_o, sampa, ipa, tks]
        idx += 1
wl = Wordlist(D)
wl.output('tsv', filename='an-data-wordlist', prettify=False, ignore='all')
lex = LexStat(wl)
print(lex.height, lex.width)
#lex.cluster(method=_method,cluster_method=_cluster_method, threshold=0.45)
lex.cluster(method='sca', threshold=0.45)
alm = Alignments(lex, ref='scaid')
alm.align()
alm.output('tsv', filename='an-data-aligned', prettify=False, ignore='all')
alm.output('html',filename='result')
lex = Partial('an-data-aligned.tsv');
lex.get_scorer(preprocessing=False,runs=10000)
lex.cluster(method=_method,cluster_method=_cluster_method,threshold=0.45)
lex.calculate('tree',ref='scaid')
print(lex.tree)
# close the HTML report opened earlier (outside this chunk)
print('</body>')
print('</html>')
def part(test_data):
    """Fixture: Partial wordlist that splits morphemes on tone marks."""
    path = str(test_data / 'partial_cognates.tsv')
    return Partial(path, segments='segments', split_on_tones=True)
def part2(test_data):
    """Fixture: Partial wordlist backed by a pre-scored data file."""
    path = str(test_data / 'partial_cognates-scored.tsv')
    return Partial(path, segments='segments')
language, concept, concept_id, glottolog, words, t(ipa), t(ipa, 'IPA'), cognacy ] wl = Wordlist(D) # create a new column by cog column. The new column assign wl.renumber('cog') print('Saving full cognate file...') wl.output('tsv', filename='{0}-{1}'.format(sys.argv[3], 'cognate'), prettify=False, ignore='all') ## get partical cognate try: part = Partial('{0}-{1}-{2}.tsv'.format(sys.argv[3], 'partial', 'temp'), segments='segments') except: part = Partial(wl, segments='segments') part.get_scorer(runs=10000) part.output('tsv', filename='{0}-{1}-{2}'.format(sys.argv[3], 'partial', 'temp')) ## manually correct error in data part.partial_cluster(method='lexstat', cluster_method='infomap', threshold=0.6, ref='cogids') part.add_entries('note', 'cogid', lambda x: '') part.add_entries('morphemes', 'cogid', lambda x: '') print('Saving partial cognate file')
ccubes = [] def pprint_result(f, mode, t, p, r, fs): print('{0:15} {1:30} {2} {3:.2f} {4:.2f} {5:.2f}'.format( f, mode, ts, p, r, fs)) methods = ['sca', 'lexstat'] cluster_methods = ['infomap', 'mcl', 'upgma'] measures = ['partial', 'strict', 'loose'] for f in infiles: try: lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv')) except IOError: lex = Partial(pcd_path('data', f + '.tsv')) lex.get_scorer( preprocessing=False, runs=10000, ) lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:])) # create new reference ids for cogantes from partial cognates if not 'strict_cogid' in lex.header: lex.add_cognate_ids('partialids', 'strict_cogid', 'strict') if not 'loose_cogid' in lex.header: lex.add_cognate_ids('partialids', 'loose_cogid', 'loose') for i in range(1, 20):
def prepare(ds):
    """Rebuild the BeijingDaxue-1964 dataset as a CLDF-like CSV file.

    Pipeline: parse benzi characters, assign partial cognate ids, enrich
    entries from the Concepticon conceptlist, compute strict cognate ids
    and alignments via intermediate TSV dumps, then write the final CSV.

    NOTE(review): relies on helpers defined outside this chunk
    (``Concepticon``, ``iter_rows``, ``tokens2morphemes``, ``parse_chars``).
    """
    # steps:
    # parse characters (numbers, zeros)
    # check for number
    # recreate partial cognate identifiers
    # create strict cognate identifieres
    # code everything as CLDF-like file
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1  # running partial-cognate id counter
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)
        # report rows where characters and morphemes do not line up
        if len(tokens) != len(benzi):
            print(doculect, glossid, benzi, tokens)
        pids = []
        for char in benzi:
            if char == '囗':
                # unknown character: always gets a fresh id
                pids += [str(pidx)]
                pidx += 1
            else:
                # known character: share one id across all occurrences
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)
        # retrieve correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id, concept.attributes['chinese'],
            concept.attributes['pageno'], concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'
    # NOTE(review): the lambda closes over the loop variable `i`, but
    # add_entries applies it during this very call, so each column still
    # picks up its own index — confirm against lingpy's add_entries.
    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        words.add_entries(entry, concepts, lambda x: x[i])
    # derived / constant columns
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])
    # determine order of entries: per doculect, number repeated concepts 1..n
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))
    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')
    # strict cognate ids derived from the partial ones
    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')
    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'), segments='segments',
                     ref='cogids', alignment='alignments')
    alm.align()
    alm.output('tsv', filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept',
                   'concepticon_id', 'chinese', 'pinyin', 'benzi',
                   'benzi_in_source', 'value', 'form', 'segments', 'cogid',
                   'cogids', 'note', 'source', 'beida_id', 'page', 'order',
                   'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)
    # hand-rolled CSV writer with minimal quoting (doubles quotes, wraps
    # comma-containing values)
    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments',
            'Order', 'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin',
                    'value', 'form', 'segments', 'cogid', 'cogids',
                    'alignments', 'order', 'beida_id', 'page', 'benzi',
                    'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                if '"' in value:
                    value = value.replace('"', '""')
                if ',' in value:
                    value = '"' + value + '"'
                out += [value]
            f.write(','.join(out) + '\n')
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree
from sys import argv
from clldutils.text import strip_brackets, split_text
from collections import defaultdict
from lingpy import basictypes

# 'all' on the command line selects the full dataset, otherwise the subset
fname = '../output/A_Deepadung_' if 'all' in argv else '../output/D_Deepadung_'

part = Partial(fname+'crossids.tsv')
# derive strict cognate ids from the cross-semantic partial ids
part.add_cognate_ids('crossids', 'crossid', idtype='strict')
# build a cognate label from strict id + concept, then renumber it
part.add_entries('cog', 'crossid,concept',
                 lambda x, y: str(x[y[0]])+x[y[1]])
part.renumber('cog')
# distance matrix and neighbor-joining tree, exported to .dst/.tre
part.calculate('distance', ref='cogid')
part.calculate('tree', tree_calc='neighbor')
part.output('dst', filename=fname+'distance')
part.output('tre', filename=fname+'tree')
if 'plot' in argv:
    plot_tree(str(part.tree), degree=350, filename=fname+'tree')
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

# 'all' on the command line selects the full dataset, otherwise the subset
if 'all' in argv:
    fname = 'A_Chen_'
else:
    fname = 'D_Chen_'

# Reuse the cached, scored wordlist when present; otherwise build the
# partial scorer from scratch (expensive) and cache it.
try:
    part = Partial(fname + 'partial.bin.tsv')
except IOError:
    # Was a bare `except:` (which even swallowed KeyboardInterrupt); only a
    # missing cache file should trigger the rebuild.
    part = Partial(fname + 'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output('tsv', filename=fname + 'partial.bin',
                ignore=[], prettify=False)
    print('[i] saved the scorer')

# Originally inside a `finally:` block, which also ran when the rebuild
# failed and masked the real error with a NameError on `part`. Running it
# after the try/except is identical on all success paths.
part.partial_cluster(
    method='lexstat',
    threshold=0.55,
    ref='cogids',
    mode='global',
    gop=-2,
    cluster_method='infomap'
)
part.output('tsv', filename=fname + 'partial', prettify=False)