def make_alignments(verbose=False):
    """Renumber partial cognate IDs per concept and compute alignments.

    Loads the Burmish wordlist, blacklists entries that cannot be aligned
    (missing cogids, unrecognizable sound classes, or a morpheme/cogid
    count mismatch), renumbers each ``cogid:concept`` pair to a dense
    integer ID stored as ``pcogids``, aligns the cognate sets, and writes
    the result to ``dumps/alignments.tsv``.  Blacklisted entries are
    reported on stdout at the end.

    :param verbose: if True, print diagnostic information while running.
    """
    wl = load_burmish(sqlite=False, remote=False)
    blacklist = []
    renumber = {0: 0}  # 0 is the sentinel pcogid for blacklisted rows
    for k in wl:
        cogids = wl[k, 'cogids'].strip()
        concept = wl[k, 'concept']
        wl[k][wl.header['tokens']] = clean_tokens(wl[k, 'tokens'])
        # NOTE: the original also tested `cogids == 0`, but `cogids` is a
        # str after .strip(), so that comparison could never be true.
        if not cogids:
            blacklist += [(k, '?')]  # no cognate assignment at all
        else:
            tokens = clean_tokens(wl[k, 'tokens'])
            morphemes = split_tokens(tokens)
            if '0' in tokens2class(tokens, 'sca'):
                blacklist += [(k, '0')]  # unrecognizable sound class
            elif len(morphemes) != len(cogids.split(' ')):
                blacklist += [(k, 'M')]  # morpheme/cogid count mismatch
            else:
                for c in cogids.split(' '):
                    cogid = c + ':' + concept
                    if cogid not in renumber:
                        renumber[cogid] = max(renumber.values()) + 1
    # Set of blacklisted keys for O(1) membership tests below.
    blist = {k for k, _ in blacklist}
    C = {}
    for k in wl:
        if k not in blist:
            C[k] = [renumber[c + ':' + wl[k, 'concept']]
                    for c in wl[k, 'cogids'].split()]
        else:
            C[k] = [0]
    wl.add_entries('pcogids', C, lambda x: x)
    D = {}
    D[0] = [h for h in sorted(wl.header, key=lambda x: wl.header[x])
            if h not in ['alignment']]
    for k in wl:
        # BUG FIX: the original tested `k not in blacklist`, but blacklist
        # holds (key, reason) tuples, so the condition was always true and
        # blacklisted rows leaked into D despite the filtering intent.
        if k not in blist:
            D[k] = [wl[k, h] for h in D[0]]
    if verbose:
        print(D[0])
    alm = Alignments(D, ref='pcogids',
                     conf=burmish_path('conf', 'wordlist.rc'))
    if verbose:
        print(alm._mode)
        # Diagnostic pass: progressively align each MSA.  NOTE(review):
        # this runs only under verbose in the original — confirm whether
        # it was meant as a debugging aid only.
        for cogid, msa in alm.msa['pcogids'].items():
            sca = SCA(msa)
            sca.prog_align()
    alm.align(method='library', iterate=True)
    alm.output('tsv', filename=burmish_path('dumps', 'alignments'),
               ignore='all', prettify=False)
    # Report why each entry was excluded from the alignment.
    for i, (k, r) in enumerate(blacklist):
        if wl[k, 'cogids']:
            print(i + 1, r, k, wl[k, 'concept'], wl[k, 'doculect'],
                  wl[k, 'tokens'], repr(wl[k, 'cogids']))
def compare_partial_colexifications():
    """Build and save a bipartite concept/morpheme colexification network.

    Creates a bipartite graph linking each concept (``bipartite=0``) to
    every morpheme/doculect string (``bipartite=1``) that expresses it,
    saves it as ``bipartite.gml``, then saves the collaboration-weighted
    projection onto the concept nodes as ``projected.gml``.

    :returns: the bipartite graph.
    """
    alms = get_alignments()
    G = nx.Graph()
    for k in alms:
        concept = alms[k, "concept"]
        morphemes = split_tokens(alms[k, "tokens"])
        doculect = alms[k, "doculect"]
        # COMPAT FIX: the original used `G.node`, which was removed in
        # networkx 2.x; plain `n in G` membership works in 1.x and 2.x.
        if concept not in G:
            G.add_node(concept, bipartite=0)
        for m in morphemes:
            # Identify a morpheme by its surface form plus its doculect.
            idf = "".join(m) + "/" + doculect
            if idf not in G:
                G.add_node(idf, bipartite=1)
            G.add_edge(concept, idf)
    print("printing now")
    save_network("bipartite.gml", G)
    # Project onto the concept partition with collaboration weighting.
    nodes = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
    G2 = nx.bipartite.collaboration_weighted_projected_graph(G, nodes)
    save_network("projected.gml", G2)
    return G
def make_alignments(verbose=False):
    """Renumber partial cognate IDs per concept and compute alignments.

    NOTE(review): this is a duplicate definition of ``make_alignments``
    appearing earlier in the file; at import time this later definition
    wins. Consider deleting one copy.

    Loads the Burmish wordlist, blacklists entries that cannot be aligned
    (missing cogids, unrecognizable sound classes, or a morpheme/cogid
    count mismatch), renumbers each ``cogid:concept`` pair to a dense
    integer ID stored as ``pcogids``, aligns the cognate sets, and writes
    the result to ``dumps/alignments.tsv``.  Blacklisted entries are
    reported on stdout at the end.

    :param verbose: if True, print diagnostic information while running.
    """
    wl = load_burmish(sqlite=False, remote=False)
    blacklist = []
    renumber = {0: 0}  # 0 is the sentinel pcogid for blacklisted rows
    for k in wl:
        cogids = wl[k, 'cogids'].strip()
        concept = wl[k, 'concept']
        wl[k][wl.header['tokens']] = clean_tokens(wl[k, 'tokens'])
        # NOTE: the original also tested `cogids == 0`, but `cogids` is a
        # str after .strip(), so that comparison could never be true.
        if not cogids:
            blacklist += [(k, '?')]  # no cognate assignment at all
        else:
            tokens = clean_tokens(wl[k, 'tokens'])
            morphemes = split_tokens(tokens)
            if '0' in tokens2class(tokens, 'sca'):
                blacklist += [(k, '0')]  # unrecognizable sound class
            elif len(morphemes) != len(cogids.split(' ')):
                blacklist += [(k, 'M')]  # morpheme/cogid count mismatch
            else:
                for c in cogids.split(' '):
                    cogid = c + ':' + concept
                    if cogid not in renumber:
                        renumber[cogid] = max(renumber.values()) + 1
    # Set of blacklisted keys for O(1) membership tests below.
    blist = {k for k, _ in blacklist}
    C = {}
    for k in wl:
        if k not in blist:
            C[k] = [renumber[c + ':' + wl[k, 'concept']]
                    for c in wl[k, 'cogids'].split()]
        else:
            C[k] = [0]
    wl.add_entries('pcogids', C, lambda x: x)
    D = {}
    D[0] = [h for h in sorted(wl.header, key=lambda x: wl.header[x])
            if h not in ['alignment']]
    for k in wl:
        # BUG FIX: the original tested `k not in blacklist`, but blacklist
        # holds (key, reason) tuples, so the condition was always true and
        # blacklisted rows leaked into D despite the filtering intent.
        if k not in blist:
            D[k] = [wl[k, h] for h in D[0]]
    if verbose:
        print(D[0])
    alm = Alignments(D, ref='pcogids',
                     conf=burmish_path('conf', 'wordlist.rc'))
    if verbose:
        print(alm._mode)
        # Diagnostic pass: progressively align each MSA.  NOTE(review):
        # this runs only under verbose in the original — confirm whether
        # it was meant as a debugging aid only.
        for cogid, msa in alm.msa['pcogids'].items():
            sca = SCA(msa)
            sca.prog_align()
    alm.align(method='library', iterate=True)
    alm.output('tsv', filename=burmish_path('dumps', 'alignments'),
               ignore='all', prettify=False)
    # Report why each entry was excluded from the alignment.
    for i, (k, r) in enumerate(blacklist):
        if wl[k, 'cogids']:
            print(i + 1, r, k, wl[k, 'concept'], wl[k, 'doculect'],
                  wl[k, 'tokens'], repr(wl[k, 'cogids']))