コード例 #1
0
ファイル: brat2conllu.py プロジェクト: coltekin/gk-treebank
'nmod_comp': 'nmod:comp',
'nmod_pass': '******',
'dobj_cau': 'dobj:cau',
'ccomp_cau': 'ccomp:cau',
'aux_q': 'aux:q',
'xcomp_sc': 'xcomp',
'Person_psor': 'Person[psor]',
'Number_psor': 'Number[psor]',
}


# Process every file named on the command line, one at a time.
for f in sys.argv[1:]:
    # Drop the last four characters of the path
    # (presumably the ".ann" extension -- TODO confirm against callers).
    basename = f[:-4]

    # Parse the annotation file; the handle is closed as soon as parsing ends.
    with open(f, "r") as fp:
        ann = brat.read_ann(fp)

    conllu = [{}] # dummy first token ensures that the indexes match
    # Visit the tokens in ascending key order.
    for i in sorted(ann.tokens.keys()):
        tok = ann.tokens[i]
        # One record per token. 'cpos' and 'pos' are both taken from tok.pos;
        # 'head' and 'deprel' start out as None placeholders and 'deps' as "_".
        token = {'id' : tok.id,
                 'form' : tok.token,
                 'cpos' : tok.pos,
                 'pos' : tok.pos,
                 'head' : None,
                 'deprel' : None,
                 'deps': "_"}

        # Feature string built up from the token's (label, value) attributes.
        feats = ""
        for (label, val) in tok.attrs:
            # Use "|" as the delimiter once feats already holds something.
            if len(feats): delim = "|"
コード例 #2
0
    provided with the one in morph/ directory. It also copies 
    the .txt file over from the same directory.
"""

import sys, shutil
from collections import namedtuple

from brat import read_ann, write_ann, Rel, Ann

# The single command-line argument names the annotation file, with or
# without its '.ann' extension. A file of the same name is read from 'morph/'.
basename = sys.argv[1]
if basename.endswith('.ann'):
    # Strip only the trailing extension: str.replace() would also delete
    # '.ann' occurring elsewhere in the path (e.g. inside a directory name).
    basename = basename[:-len('.ann')]
oldfname = basename + '.ann'

# Open both files up front, then parse; the context manager closes them
# even if read_ann() raises.
with open(oldfname, 'r') as oldfp, open('morph/' + oldfname, 'r') as newfp:
    old = read_ann(oldfp)
    new = read_ann(newfp)

# Output tables, filled in by the code that follows this section.
tok_out = {}
rel_out = {}
tok_map = {}

from difflib import SequenceMatcher

# Align the token sequences of the two annotations.
# NOTE(review): if tokens are strings, `x in "_.?!"` is a substring test, so
# the empty string and runs such as "?!" are also treated as junk -- confirm
# that this is intended.
s = SequenceMatcher(isjunk=lambda x: x in "_.?!",
                    a=[v.token for v in old.tokens.values()],
                    b=[v.token for v in new.tokens.values()],
                    autojunk=True)
コード例 #3
0
    provided with the one in morph/ directory. It also copies 
    the .txt file over from the same directory.
"""

import sys, shutil
from collections import namedtuple

from brat import read_ann,write_ann,Rel,Ann

# The single command-line argument names the annotation file, with or
# without its '.ann' extension. A file of the same name is read from 'morph/'.
basename = sys.argv[1]
if basename.endswith('.ann'):
    # Strip only the trailing extension: str.replace() would also delete
    # '.ann' occurring elsewhere in the path (e.g. inside a directory name).
    basename = basename[:-len('.ann')]
oldfname = basename + '.ann'

# Open both files up front, then parse; the context manager closes them
# even if read_ann() raises.
with open(oldfname, 'r') as oldfp, open('morph/' + oldfname, 'r') as newfp:
    old = read_ann(oldfp)
    new = read_ann(newfp)

# Output tables, filled in by the code that follows this section.
tok_out = {}
rel_out = {}
tok_map = {}

from difflib import SequenceMatcher

# Align the token sequences of the two annotations.
# NOTE(review): if tokens are strings, `x in "_.?!"` is a substring test, so
# the empty string and runs such as "?!" are also treated as junk -- confirm
# that this is intended.
s = SequenceMatcher(isjunk=lambda x: x in "_.?!",
                    a=[v.token for v in old.tokens.values()],
                    b=[v.token for v in new.tokens.values()],
                    autojunk=True)
コード例 #4
0
    'nmod_cau': 'nmod:cau',
    'nmod_comp': 'nmod:comp',
    'nmod_pass': '******',
    'dobj_cau': 'dobj:cau',
    'ccomp_cau': 'ccomp:cau',
    'aux_q': 'aux:q',
    'xcomp_sc': 'xcomp',
    'Person_psor': 'Person[psor]',
    'Number_psor': 'Number[psor]',
}

# Process every file named on the command line, one at a time.
for f in sys.argv[1:]:
    # Drop the last four characters of the path
    # (presumably the ".ann" extension -- TODO confirm against callers).
    basename = f[:-4]

    # Parse the annotation file; the handle is closed as soon as parsing ends.
    with open(f, "r") as fp:
        ann = brat.read_ann(fp)

    conllu = [{}]  # dummy first token ensures that the indexes match
    # Visit the tokens in ascending key order.
    for i in sorted(ann.tokens.keys()):
        tok = ann.tokens[i]
        # One record per token. 'cpos' and 'pos' are both taken from tok.pos;
        # 'head' and 'deprel' start out as None placeholders and 'deps' as "_".
        token = {
            'id': tok.id,
            'form': tok.token,
            'cpos': tok.pos,
            'pos': tok.pos,
            'head': None,
            'deprel': None,
            'deps': "_"
        }

        # Feature string; starts empty for each token.
        feats = ""