Code example #1
def readFeaturesInput(filepaths):
    """
    load all features from the MRP input files
    """
    input_dict = {}
    for filepath in filepaths:
        with open(filepath, 'r') as fp:
            for graph, _ in mrp_read(fp):
                # here graph.framework is conllu by default
                data = {genKey(graph): graph}
                # add unique example_id
                data["example_id"] = graph.id
                data["input_snt"] = graph.input
                # label is token
                data["tok"] = [node.label for node in graph.nodes]
                # extract each optional per-node property when present;
                # "lemma" is stored under the short key "lem"
                for prop in ("lemma", "xpos", "upos", "pos", "ner", "mwe"):
                    if prop in graph.nodes[0].properties:
                        idx = graph.nodes[0].properties.index(prop)
                        key = "lem" if prop == "lemma" else prop
                        data[key] = [node.values[idx] for node in graph.nodes]

                # if no anchors, it will be None.
                data["anchors"] = [node.anchors for node in graph.nodes]

                input_dict[graph.id] = data
    return input_dict
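
Example #1 relies on two helpers that are not shown: `mrp_read` and `genKey`. A minimal sketch of what they could look like, assuming the MRP files are JSON Lines and that companion graphs are keyed by their framework name (both assumptions; the lapa-mrp project ships its own implementations):

import json
from types import SimpleNamespace

def mrp_read(fp):
    # Simplified stand-in: yield (graph, raw_dict) pairs from a
    # JSON Lines MRP file, exposing top-level fields as attributes.
    for line in fp:
        line = line.strip()
        if line:
            obj = json.loads(line)
            yield SimpleNamespace(**obj), obj

def genKey(graph):
    # Assumed behavior: key a companion graph by its framework,
    # e.g. "conllu" for the morpho-syntactic companion data.
    return graph.framework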
Code example #2
def mergeWithAnnotatedGraphs(input_dict, filepaths):
    """
    read annotated graphs and merge each into input_dict as the target;
    returns the number of merged graphs
    """
    n = 0
    for filepath in filepaths:
        with open(filepath, 'r') as fp:
            for graph, _ in mrp_read(fp):
                # here graph.framework is conllu by default
                key = genKey(graph)
                if graph.id in input_dict:
                    input_dict[graph.id][key] = graph
                    n = n + 1
    return n
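
A hedged usage sketch combining the two functions above; the file paths are hypothetical:

# Hypothetical paths; adjust to the actual dataset layout.
companion_paths = ["data/training.mrp_conllu"]
annotated_paths = ["data/training.mrp"]

input_dict = readFeaturesInput(companion_paths)
matched = mergeWithAnnotatedGraphs(input_dict, annotated_paths)
print("merged {} annotated graphs into {} inputs".format(matched, len(input_dict)))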
Code example #3
def write_features_mrp(filepath):
    """
    write preprocessed features such as tok, lem, pos, and ner to a
    .mrp_conllu_pre_processed file
    """
    out = filepath.split(opt.companion_suffix)[0] + ".mrp_conllu_pre_processed"
    logger.info("processing " + filepath)
    with open(out, 'w') as out_f:
        with open(filepath, 'r') as in_file:
            n = 0
            for graph, _ in mrp_read(in_file):
                n = n + 1
                if n % 500 == 0:
                    logger.info(n)
                # debugging filter kept from the original; uncomment to
                # restrict processing to specific graph ids
                #if graph.id not in ['bolt-eng-DF-170-181118-8875443_0097.13','bolt-eng-DF-170-181103-8882248_0335.5']:
                #    continue
                tokenized_text = ' '.join([node.label for node in graph.nodes])
                text = graph.input
                text = text.replace(u"\u0085",
                                    u"\00A0").replace("%20", u"\00A0")
                tokenized_text = tokenized_text.replace(u"\u0085",
                                                        u"\00A0").replace(
                                                            "%20", u"\00A0")
                if opt.frame == 'amr' or opt.frame == 'ucca':
                    data = input_preprocessor.preprocess(
                        text,
                        whiteSpace=False,
                        token_combine=opt.token_combine
                    )  # phrase combining comes from a fixed joints.txt file
                    # constructing a new graph
                    new_graph = Graph(graph.id, graph.flavor, graph.framework)
                    new_graph.add_input(text)
                    for i in range(len(data['tok'])):
                        if "mwe" in data:
                            new_graph.add_node(
                                i,
                                label=data['tok'][i],
                                properties=["lemma", "pos", "ner", "mwe"],
                                values=[
                                    data['lem'][i], data['pos'][i],
                                    data['ner'][i], data['mwe'][i]
                                ],
                                anchors=data['anchors'][i])
                        else:
                            new_graph.add_node(
                                i,
                                label=data['tok'][i],
                                properties=["lemma", "pos", "ner"],
                                values=[
                                    data['lem'][i], data['pos'][i],
                                    data['ner'][i]
                                ],
                                anchors=data['anchors'][i])
                    out_f.write(
                        json.dumps(new_graph.encode(),
                                   indent=None,
                                   ensure_ascii=False))
                    out_f.write("\n")
                else:
                    # use white space and only use ner for extra
                    data = input_preprocessor.preprocess(
                        tokenized_text,
                        whiteSpace=True,
                        token_combine=opt.token_combine)
                    assert len(data['ner']) == len(
                        graph.nodes
                    ), "preprocess data length is not equal to the input in {}, {}".format(
                        graph.encode(), data)
                    assert len(data['mwe']) == len(
                        graph.nodes
                    ), "preprocess data length is not equal to the input in {}, {}".format(
                        graph.encode(), data)
                    for node in graph.nodes:
                        i = node.properties.index('xpos')
                        node.set_property('pos', node.values[i])
                        node.set_property('ner', data['ner'][node.id])
                        node.set_property('mwe', data['mwe'][node.id])
                    # write back ner
                    out_f.write(
                        json.dumps(graph.encode(),
                                   indent=None,
                                   ensure_ascii=False))
                    out_f.write("\n")

    logger.info("done processing " + filepath)
    logger.info(out + " is generated")
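
`write_features_mrp` reads its configuration from the module-level `opt` and `input_preprocessor`. A sketch of driving it over a folder of companion files, reusing the `folder_to_files_path` helper that appears in example #4 (the folder and suffix values are assumptions):

# Assumed flags: opt.input_folder and opt.companion_suffix are set by
# the project's argument parser, e.g. ".mrp_conllu".
companion_files = folder_to_files_path(opt.input_folder, opt.companion_suffix)
for path in companion_files:
    write_features_mrp(path)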
Code example #4
File: mrp_conllu_utils.py  Project: utahnlp/lapa-mrp
def mrp_utils_parser():
    parser = argparse.ArgumentParser(description='mrp_utils for selecting ids')

    parser.add_argument('--suffix', default=".mrp", type=str,
                        help="""suffix of files to combine""")
    parser.add_argument('--input_folder', default="", type=str,
                        help="""folder containing the input files""")
    return parser

parser = mrp_utils_parser()
opt = parser.parse_args()

input_files = folder_to_files_path(opt.input_folder, opt.suffix)
id_files = folder_to_files_path(opt.input_folder, ".ids")

for input_file in input_files:
    with open(input_file, 'r') as fp:
        graph_dict = {}
        for graph, _ in mrp_read(fp):
            graph_dict[graph.id] = graph
        for id_file in id_files:
            x = 0
            with open(id_file, "r") as idfp, open(id_file+".conllu","w+") as cfp:
                for line in idfp:
                    id = line.rstrip("\n")
                    g = json.dumps(graph_dict[id].encode(), indent=None, ensure_ascii = False)
                    cfp.write(g)
                    cfp.write("\n")
                    x = x +1
            logger.info("{} is written into {}".format(x, id_file+".conllu"))
Code example #5
File: mrp_utils.py  Project: utahnlp/lapa-mrp
if opt.dev_ids:
    with open(opt.dev_ids, "r") as fp:
        dev_ids = [line.rstrip('\n') for line in fp]
else:
    dev_ids = []

if opt.test_ids:
    with open(opt.test_ids, "r") as fp:
        test_ids = [line.rstrip('\n') for line in fp]
else:
    test_ids = []

all_ids = []
for input_file in input_files:
    train_set = []
    dev_set = []
    test_set = []
    remaining_set = []
    with open(input_file, 'r') as fp:
        graphs = list(mrp_read(fp))

    # within a framework, the same graph id may appear more than once;
    # keep only graphs whose ids have not been seen yet
    deduplicated_graphs = []
    for graph, _ in graphs:
        if graph.id not in all_ids:
            deduplicated_graphs.append(graph)
            all_ids.append(graph.id)

    total = len(deduplicated_graphs)
    if opt.follow_ids_only:
        train_total = total
        dev_total = total