Ejemplo n.º 1
0
def process_conllu(inp):
    """Map word forms of a parsed CoNLL-U input to their dependency relations.

    ``inp`` is handed to ``parse_tree``; only the first element of the result
    is used and it is walked with ``depth_first``.  For every visited item the
    ``'form'`` and ``'deprel'`` entries of its first element are recorded.
    When the same form is visited more than once, the relation seen last wins.

    Returns:
        A dict mapping each form to its deprel.
    """
    first_tree = parse_tree(inp)[0]
    form_to_deprel = {}
    for node in depth_first(first_tree):
        token = node[0]
        form = token['form']
        form_to_deprel[form] = token['deprel']
    return form_to_deprel
Ejemplo n.º 2
0
def parse_sent(inf, outf, return_tree=True):
    """Run the parsing pipeline on a file and return the parsed CoNLL output.

    Settings are read from ``config.ini`` (resolved against the current
    working directory).  ``process`` is invoked with the input file, the
    output file and a set of temporary file names; afterwards the temporary
    files are removed, the output file is read back and deleted, and its
    text is passed to ``parse_tree`` or ``parse``.

    Args:
        inf: Path of the input file; checked with ``check_infile``.
        outf: Name of the output file inside the configured output folder,
            or ``None`` to use ``<input base name>.conll``.
        return_tree: When true (the default) the result of ``parse_tree`` is
            returned, otherwise the result of ``parse``.

    Returns:
        Whatever ``parse_tree`` / ``parse`` returns for the CoNLL text.
    """
    # read configs and command line options
    config = configparser.ConfigParser()
    config.read('config.ini')
    in_fname, out_fname = inf, outf
    check_infile(in_fname)

    fname_clean = os.path.basename(in_fname).rsplit('.', 1)[0]

    # temporary files and folder
    tmp_path = get_path_from_config(config, 'TMP_PATH', 'tmp')
    tmp_fsuffixes = [
        '_mystem_in.txt', '_mystem_out.txt', '_treetagger_in.txt',
        '_treetagger_out.txt', '_raw.conll'
    ]
    # Join onto the configured path as a whole so that it may have any
    # number of components and an absolute path stays absolute.
    tmp_dir = PurePosixPath(tmp_path)
    tmp_fnames = [
        str(tmp_dir / (fname_clean + fsuffix))
        for fsuffix in tmp_fsuffixes
    ]

    # output file and folder
    out_path = get_path_from_config(config, 'OUT_PATH', 'out')
    out_dir = PurePosixPath(out_path)
    if out_fname is None:
        out_fname = str(out_dir / (fname_clean + '.conll'))
    else:
        out_fname = str(out_dir / out_fname)

    # create output and temp folder if needed (exist_ok avoids the
    # check-then-create race)
    for path in (tmp_path, out_path):
        os.makedirs(path, exist_ok=True)

    # rock'n'roll
    try:
        process(in_fname, out_fname, config['DEFAULT']['APP_ROOT'],
                config['mystem']['MYSTEM_PATH'], config['malt']['MALT_ROOT'],
                config['malt']['MALT_NAME'], config['malt']['MODEL_NAME'],
                config['dicts']['COMP_DICT_PATH'],
                config['treetagger']['TREETAGGER_BIN'],
                config['treetagger']['TREETAGGER_PAR'], *tmp_fnames)
    finally:
        # Always clean up temporary files, even when processing fails.  A
        # file that was never created must not mask the original error.
        for fname in tmp_fnames:
            try:
                os.remove(fname)
            except FileNotFoundError:
                pass

    with open(out_fname, 'r', encoding='utf-8') as conll_file:
        conll_data = conll_file.read()
    # Remove the output only after the handle has been closed by ``with``.
    os.remove(out_fname)

    if return_tree:
        return parse_tree(conll_data)
    return parse(conll_data)
Ejemplo n.º 3
0
def load_sentence():
    """Yield one ``parse_tree`` result per sentence of the gzipped fixture.

    The fixture ``fixtures/UD2.conllu.gz`` is looked up relative to this
    module's directory.  Lines are decoded as UTF-8; lines starting with
    ``#`` are dropped, and a whitespace-only line ends the current sentence.

    Runs of blank lines (or blocks containing only comments) do not produce
    empty sentences, and a final sentence that is not followed by a blank
    line is still yielded.
    """
    filename = os.path.join(os.path.dirname(__file__),
                            'fixtures/UD2.conllu.gz')
    pending = []
    with gzip.open(filename) as fp:
        # Stream line by line instead of loading the whole archive.
        for raw in fp:
            line = raw.decode('utf-8').rstrip('\n')
            if line.strip() == '':
                if pending:
                    yield parse_tree(''.join(pending))
                    pending = []
            elif not line.startswith('#'):
                pending.append(line + '\n')
    # Flush a trailing sentence that had no terminating blank line.
    if pending:
        yield parse_tree(''.join(pending))
Ejemplo n.º 4
0
 def test_parse_tree(self):
     """``parse_tree(data1)`` must equal the expected ``data1_tree``."""
     actual = parse_tree(data1)
     self.assertEqual(actual, data1_tree)
Ejemplo n.º 5
0
 def test_parse_data8(self):
     """Smoke test: building a tree from ``data8`` must not raise."""
     sample = data8
     # The result is deliberately ignored; only successful completion matters.
     parse_tree(sample)
Ejemplo n.º 6
0
 def test_parse_tree(self):
     """Each input fixture must parse to its matching expected tree."""
     cases = (
         (data1, data1_tree),
         (data5, data5_tree),
         (data6, data6_tree),
     )
     for raw, expected in cases:
         self.assertEqual(parse_tree(raw), expected)
Ejemplo n.º 7
0
 def test_exception_on_missing_head(self):
     """``parse_tree`` must raise ``ParseException`` for this six-field line."""
     # The token line carries only six tab-separated fields.
     line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art"
     self.assertRaises(ParseException, parse_tree, line)
Ejemplo n.º 8
0
 def test_parse_tree(self):
     """``parse_tree(data1)`` must equal the ``data1_expected`` fixture."""
     # The expected tree is imported locally from its own fixture module.
     from tests.fixtures.data1_tree import data1_expected
     actual = parse_tree(data1)
     self.assertEqual(actual, data1_expected)