Example #1
0
def main():
    """Read a CoNLL-style file (sys.argv[1]) and print its dependencies.

    Iterates sentences via sentence_iterator, extracts dependencies with
    get_dependencies, and prints a progress count every 1000 sentences.
    Exits with status -1 (after dumping the sentence) on a parse failure.
    """
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")

    # Column indices into each token row; lemma and msd are unused here.
    id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (0, 1,
                                                                          None,
                                                                          None,
                                                                          -4,
                                                                          -3)

    with open(sys.argv[1]) as stream:
        c = 0
        for sentence in sentence_iterator(stream, comment_tag='#'):
            try:
                deps = get_dependencies(sentence, id_field, word_field,
                                        lemma_field, msd_field, gov_field,
                                        dep_field)
            except Exception:
                # Narrowed from a bare `except:`: still dumps the offending
                # sentence and aborts, but no longer swallows SystemExit or
                # KeyboardInterrupt.
                print(sentence)
                sys.exit(-1)
            print(deps)
            # NOTE(review): this exit aborts after the FIRST sentence, making
            # the progress counter below unreachable -- it looks like leftover
            # debugging; confirm intent before removing it.
            sys.exit(-1)
            c += 1
            if c % 1000 == 0:
                print(c)
        print(c)
Example #2
0
def main():
    """Print one TEMPLATE-formatted line per unique (word, pos) pair.

    Reads a CoNLL-style file (sys.argv[1]); the word is sanitized via
    sanitize_word before de-duplication, and pairs are emitted in first-seen
    order.
    """
    emitted = set()
    with open(sys.argv[1]) as conll_file:
        for sen in sentence_iterator(conll_file, comment_tag='#'):
            for token in sen:
                pair = (sanitize_word(token[1]), token[3])
                if pair in emitted:
                    continue
                emitted.add(pair)
                # TEMPLATE takes pos first, then the sanitized word.
                print(TEMPLATE.format(pair[1], pair[0]))
Example #3
0
def main():
    """Parse the configured input file into a .deps JSON file and process it.

    Reads the config path from sys.argv[1], extracts per-sentence
    dependencies, optionally converts them to the legacy tuple format for
    English (see issue #51), writes the result as JSON, and hands the file
    to TextTo4lang.process_deps.
    """
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(text_to_4lang.deps_dir, "{0}.deps".format(base_fn))

    # Field indices depend on the language-specific CoNLL layout.
    if text_to_4lang.lang == 'hu':
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, 3, 4, -4, -2)
    else:
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, None, None, -4, -3)

    # Materialize into a list inside a with-block: the original left the
    # input file open (leak) and, on Python 3, assigned a lazy map object to
    # c_deps in the non-'en' branch, which json.dumps cannot serialize.
    with open(fn) as sen_stream:
        deps = [
            get_dependencies(s, id_field, word_field, lemma_field, msd_field,
                             gov_field, dep_field)
            for s in sentence_iterator(sen_stream)]

    if text_to_4lang.lang == 'en':
        # convert to old deps (for now, see issue #51)
        c_deps = []
        for sen in deps:
            c_deps.append([])
            for d in sen:
                c_deps[-1].append((
                    d['type'],
                    (d['gov']['word'], d['gov']['id']),
                    (d['dep']['word'], d['dep']['id'])))
    else:
        c_deps = deps
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(json.dumps({
            "deps": c_deps,
            "corefs": []})))

    text_to_4lang.process_deps(deps_fn)
Example #4
0
def main():
    """Build a .deps JSON file from the configured input and process it.

    Config path comes from sys.argv[1]; field indices are chosen per
    language, dependencies are extracted sentence-by-sentence, converted to
    the legacy tuple format for English (see issue #51), dumped as JSON and
    fed to TextTo4lang.process_deps.
    """
    logging.basicConfig(
        level="INFO",
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
    cfg_file = sys.argv[1]
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    fn = cfg.get('text', 'input_sens')
    base_fn = os.path.basename(fn)
    deps_fn = os.path.join(text_to_4lang.deps_dir, "{0}.deps".format(base_fn))

    # Per-language CoNLL column layout.
    if text_to_4lang.lang == 'hu':
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, 3, 4, -4, -2)
    else:
        id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (
            0, 1, None, None, -4, -3)

    # Fixes two defects in the original: the input file was opened without
    # ever being closed, and on Python 3 the lazy map object reached
    # json.dumps unserialized in the non-'en' branch.
    with open(fn) as in_stream:
        deps = [
            get_dependencies(s, id_field, word_field, lemma_field,
                             msd_field, gov_field, dep_field)
            for s in sentence_iterator(in_stream)]

    if text_to_4lang.lang == 'en':
        # convert to old deps (for now, see issue #51)
        c_deps = []
        for sen in deps:
            c_deps.append([])
            for d in sen:
                c_deps[-1].append(
                    (d['type'], (d['gov']['word'], d['gov']['id']),
                     (d['dep']['word'], d['dep']['id'])))
    else:
        c_deps = deps
    with open(deps_fn, 'w') as out_f:
        out_f.write("{0}\n".format(json.dumps({"deps": c_deps, "corefs": []})))

    text_to_4lang.process_deps(deps_fn)
Example #5
0
def main():
    """Print a graph (via the GRAPH_STRING global) for each input sentence.

    Reads a CoNLL-style file from sys.argv[1]; for each sentence the SEEN
    and GRAPH_STRING globals are reset, the dependency dict is walked by
    dict_to_graph starting from the root token, and the result is printed
    after the HEADER.
    """
    print(HEADER)
    id_field, word_field, lemma_field, msd_field, gov_field, dep_field = (0, 1,
                                                                          None,
                                                                          None,
                                                                          -4,
                                                                          -3)
    global SEEN
    global GRAPH_STRING
    with open(sys.argv[1]) as stream:
        for sentence in sentence_iterator(stream, comment_tag='#'):
            deps = get_dependencies(sentence, id_field, word_field,
                                    lemma_field, msd_field, gov_field,
                                    dep_field)

            sentence_dict, root_token = deps_to_sen_dict(deps)
            # root token will be the first token if ROOT doesn't exist.
            # next(iter(...)) works on both Python 2 and 3, whereas the
            # original .keys()[0] raises TypeError on Python 3 (dict views
            # are not subscriptable).
            if root_token is None:
                root_token = next(iter(sentence_dict))
            SEEN = {}
            GRAPH_STRING = ''
            dict_to_graph(sentence_dict, root_token)
            print(GRAPH_STRING)
Example #6
0
 def lines_to_deps(lines):
     """Parse CoNLL-style line groups into per-sentence dependencies.

     Joins each sentence's lines into one text blob, feeds it through a
     StringIO stream to sentence_iterator, and maps get_dependencies over
     the resulting sentences.
     """
     # The genexp can consume `lines` directly; the original's list(lines)
     # materialized the whole input for no benefit.
     text_str = u"\n".join(u"".join(sen) for sen in lines)
     tsv_stream = StringIO(text_str)
     return map(get_dependencies, sentence_iterator(tsv_stream))
Example #7
0
def conll_to_deps(stream):
    """Generate dependency structures, one per sentence in *stream*."""
    for raw_sen in sentence_iterator(stream):
        sen_deps = get_dependencies(raw_sen)
        yield sen_deps
Example #8
0
 def lines_to_deps(lines):
     """Turn grouped CoNLL lines into per-sentence dependency structures.

     Each group in `lines` is concatenated, the groups are newline-joined
     into a single TSV text, and the text is re-tokenized into sentences
     whose dependencies are extracted with get_dependencies.
     """
     # Dropped the redundant list(lines): the join consumes any iterable,
     # so copying it into a list only wasted memory.
     tsv_stream = StringIO(u"\n".join(u"".join(sen) for sen in lines))
     return map(get_dependencies, sentence_iterator(tsv_stream))
Example #9
0
File: utils.py Project: Eszti/4lang
def conll_to_deps(stream):
    """Lazily yield the dependencies of each sentence read from *stream*."""
    for parsed_sen in sentence_iterator(stream):
        yield get_dependencies(parsed_sen)