# Code example #1 (score: 0)
# File: patterns.py — Project: digling/burmish
def make_alignments(verbose=False):
    """Build partial-cognate alignments for the Burmish wordlist and dump them.

    Loads the wordlist, blacklists rows with unusable cognate information,
    renumbers partial cognate ids per concept, aligns the remaining rows,
    and writes the result to ``dumps/alignments``. Blacklisted rows are
    printed at the end with the reason code ('?' missing cogids, '0' bad
    sound class, 'M' morpheme/cogid count mismatch).

    :param verbose: if True, print diagnostic information while aligning.
    """
    wl = load_burmish(sqlite=False, remote=False)
    blacklist = []          # list of (row_id, reason) pairs
    renumber = {0: 0}       # maps 'cogid:concept' -> new integer id
    for k in wl:
        cogids = wl[k, 'cogids'].strip()
        concept = wl[k, 'concept']
        # Normalize the tokens column in place.
        wl[k][wl.header['tokens']] = clean_tokens(wl[k, 'tokens'])
        # BUG FIX: original compared the stripped *string* to the int 0
        # (`cogids == 0`), which is never true; compare to '0' instead.
        if not cogids or cogids == '0':
            blacklist += [(k, '?')]
        else:
            tokens = clean_tokens(wl[k, 'tokens'])
            morphemes = split_tokens(tokens)
            if '0' in tokens2class(tokens, 'sca'):
                # Unrecognized sound class in the SCA model.
                blacklist += [(k, '0')]
            elif len(morphemes) != len(cogids.split(' ')):
                # Morpheme count must match the number of partial cogids.
                blacklist += [(k, 'M')]
            else:
                for c in cogids.split(' '):
                    cogid = c + ':' + concept
                    if cogid not in renumber:
                        renumber[cogid] = max(renumber.values()) + 1
    C = {}
    # Set of blacklisted row ids for O(1) membership tests.
    blist = {pair[0] for pair in blacklist}
    for k in wl:
        if k not in blist:
            C[k] = [renumber[c + ':' + wl[k, 'concept']]
                    for c in wl[k, 'cogids'].split()]
        else:
            C[k] = [0]
    wl.add_entries('pcogids', C, lambda x: x)
    D = {}
    # Header row: all columns except the (stale) alignment column.
    D[0] = [h for h in sorted(wl.header, key=lambda x: wl.header[x])
            if h not in ['alignment']]
    for k in wl:
        # BUG FIX: original tested `k not in blacklist`, comparing a row id
        # against (id, reason) tuples — always true, so blacklisted rows
        # leaked into D. Test against the id set instead.
        if k not in blist:
            D[k] = [wl[k, h] for h in D[0]]
    if verbose: print(D[0])
    alm = Alignments(D, ref='pcogids', conf=burmish_path('conf',
            'wordlist.rc'))
    if verbose: print(alm._mode)
    if verbose:
        # Dry-run progressive alignment per cognate set for inspection.
        for cogid, msa in alm.msa['pcogids'].items():
            sca = SCA(msa)
            sca.prog_align()
    alm.align(method='library', iterate=True)
    alm.output('tsv', filename=burmish_path('dumps', 'alignments'),
        ignore='all', prettify=False)
    # Report every blacklisted row that still carries a cogids value.
    for i, (k, r) in enumerate(blacklist):
        if wl[k, 'cogids']:
            print(i + 1, r, k, wl[k, 'concept'], wl[k, 'doculect'],
                  wl[k, 'tokens'], repr(wl[k, 'cogids']))
# Code example #2 (score: 0)
# File: colexification.py — Project: digling/burmish
def compare_partial_colexifications():
    """Build a bipartite concept/morpheme graph and its weighted projection.

    Concepts form one partition (``bipartite=0``) and doculect-specific
    morphemes ("<form>/<doculect>") the other (``bipartite=1``). Saves the
    bipartite graph to ``bipartite.gml`` and the collaboration-weighted
    projection onto concepts to ``projected.gml``.

    :returns: the bipartite graph ``G``.
    """
    alms = get_alignments()
    G = nx.Graph()
    for k in alms:
        concept = alms[k, "concept"]
        morphemes = split_tokens(alms[k, "tokens"])
        doculect = alms[k, "doculect"]
        # FIX: `G.node` was removed in networkx 2.4 — use the `G.nodes` view.
        # (The membership check only guards the attribute assignment;
        # `add_node` itself is idempotent.)
        if concept not in G.nodes:
            G.add_node(concept, bipartite=0)

        for m in morphemes:
            idf = "".join(m) + "/" + doculect
            if idf not in G.nodes:
                G.add_node(idf, bipartite=1)
            G.add_edge(concept, idf)
    print("printing now")
    save_network("bipartite.gml", G)
    # Project onto the concept partition with collaboration weighting.
    nodes = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
    G2 = nx.bipartite.collaboration_weighted_projected_graph(G, nodes)
    save_network("projected.gml", G2)
    return G
# Code example #3 (score: 0)
# File: patterns.py — Project: LinguList/burmish
def make_alignments(verbose=False):
    """Build partial-cognate alignments for the Burmish wordlist and dump them.

    Loads the wordlist, blacklists rows whose cognate information is
    unusable, renumbers partial cognate ids per concept, aligns the
    remaining rows, and writes the result to ``dumps/alignments``.
    Blacklisted rows are printed at the end with a reason code
    ('?' missing cogids, '0' bad sound class, 'M' count mismatch).

    :param verbose: if True, print diagnostic information while aligning.
    """
    wl = load_burmish(sqlite=False, remote=False)
    blacklist = []          # (row_id, reason) pairs
    renumber = {0: 0}       # 'cogid:concept' -> new integer id
    for k in wl:
        cogids = wl[k, 'cogids'].strip()
        concept = wl[k, 'concept']
        # Normalize the tokens column in place.
        wl[k][wl.header['tokens']] = clean_tokens(wl[k, 'tokens'])
        # BUG FIX: `cogids == 0` compared a string to an int and was never
        # true; the intended sentinel is the string '0'.
        if not cogids or cogids == '0':
            blacklist += [(k, '?')]
        else:
            tokens = clean_tokens(wl[k, 'tokens'])
            morphemes = split_tokens(tokens)
            if '0' in tokens2class(tokens, 'sca'):
                blacklist += [(k, '0')]
            elif len(morphemes) != len(cogids.split(' ')):
                blacklist += [(k, 'M')]
            else:
                for c in cogids.split(' '):
                    cogid = c + ':' + concept
                    if cogid not in renumber:
                        renumber[cogid] = max(renumber.values()) + 1
    C = {}
    # Blacklisted ids as a set for O(1) lookups below.
    blist = {pair[0] for pair in blacklist}
    for k in wl:
        if k not in blist:
            C[k] = [
                renumber[c + ':' + wl[k, 'concept']]
                for c in wl[k, 'cogids'].split()
            ]
        else:
            C[k] = [0]
    wl.add_entries('pcogids', C, lambda x: x)
    D = {}
    # Header row: every column except the stale alignment column.
    D[0] = [
        h for h in sorted(wl.header, key=lambda x: wl.header[x])
        if h not in ['alignment']
    ]
    for k in wl:
        # BUG FIX: the original tested `k not in blacklist`, matching a row
        # id against (id, reason) tuples — always true, so blacklisted rows
        # leaked into D. Use the id set.
        if k not in blist:
            D[k] = [wl[k, h] for h in D[0]]
    if verbose: print(D[0])
    alm = Alignments(D,
                     ref='pcogids',
                     conf=burmish_path('conf', 'wordlist.rc'))
    if verbose: print(alm._mode)
    if verbose:
        # Dry-run progressive alignment per cognate set for inspection.
        for cogid, msa in alm.msa['pcogids'].items():
            sca = SCA(msa)
            sca.prog_align()
    alm.align(method='library', iterate=True)
    alm.output('tsv',
               filename=burmish_path('dumps', 'alignments'),
               ignore='all',
               prettify=False)
    # Report every blacklisted row that still carries a cogids value.
    for i, (k, r) in enumerate(blacklist):
        if wl[k, 'cogids']:
            print(i + 1, r, k, wl[k, 'concept'], wl[k, 'doculect'],
                  wl[k, 'tokens'], repr(wl[k, 'cogids']))