def readFeaturesInput(filepaths):
    """ load all features from the mrp input file """
    input_dict = {}
    for filepath in filepaths:
        with open(filepath, 'r') as fp:
            for graph, _ in mrp_read(fp):
                # here graph.framework is conllu by default
                data = {genKey(graph): graph}
                # add unique example_id
                data["example_id"] = graph.id
                data["input_snt"] = graph.input
                # the node label is the token
                data["tok"] = [node.label for node in graph.nodes]
                # per-node features are stored column-wise: the property name on
                # the first node gives the column index into node.values
                if "lemma" in graph.nodes[0].properties:
                    lemma_index = graph.nodes[0].properties.index("lemma")
                    data["lem"] = [node.values[lemma_index] for node in graph.nodes]
                if "xpos" in graph.nodes[0].properties:
                    xpos_index = graph.nodes[0].properties.index("xpos")
                    data["xpos"] = [node.values[xpos_index] for node in graph.nodes]
                if "upos" in graph.nodes[0].properties:
                    upos_index = graph.nodes[0].properties.index("upos")
                    data["upos"] = [node.values[upos_index] for node in graph.nodes]
                if "pos" in graph.nodes[0].properties:
                    pos_index = graph.nodes[0].properties.index("pos")
                    data["pos"] = [node.values[pos_index] for node in graph.nodes]
                if "ner" in graph.nodes[0].properties:
                    ner_index = graph.nodes[0].properties.index("ner")
                    data["ner"] = [node.values[ner_index] for node in graph.nodes]
                if "mwe" in graph.nodes[0].properties:
                    mwe_index = graph.nodes[0].properties.index("mwe")
                    data["mwe"] = [node.values[mwe_index] for node in graph.nodes]
                # if a node has no anchors, its entry will be None
                data["anchors"] = [node.anchors for node in graph.nodes]
                input_dict[graph.id] = data
    return input_dict
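# Sketch of what readFeaturesInput returns for a single graph id (hypothetical
# values; which feature keys are present depends on the properties provided by
# the conllu companion data):
#
#   {
#       "<graph.id>": {
#           <genKey(graph)>: <Graph>,          # the companion graph itself
#           "example_id": "<graph.id>",
#           "input_snt": "The raw input sentence .",
#           "tok": ["The", "raw", "input", "sentence", "."],
#           "lem": ["the", "raw", "input", "sentence", "."],
#           "pos": ["DT", "JJ", "NN", "NN", "."],
#           ...
#           "anchors": [[{"from": 0, "to": 3}], ...],   # None when absent
#       }
#   }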
def mergeWithAnnotatedGraphs(input_dict, filepaths):
    """ read annotated graphs as the target and merge them into input_dict; return the number merged """
    n = 0
    for filepath in filepaths:
        with open(filepath, 'r') as fp:
            for graph, _ in mrp_read(fp):
                # here graph.framework is conllu by default
                key = genKey(graph)
                if graph.id in input_dict:
                    input_dict[graph.id][key] = graph
                    n = n + 1
    return n
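# Minimal usage sketch combining the two readers above (the file lists
# `companion_files` and `target_files` are hypothetical; they would normally be
# collected with the same folder helpers used elsewhere in this module):
#
#   input_dict = readFeaturesInput(companion_files)
#   merged = mergeWithAnnotatedGraphs(input_dict, target_files)
#   logger.info("merged {} annotated graphs into {} inputs".format(merged, len(input_dict)))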
def write_features_mrp(filepath):
    """ write preprocessed features like tok, lem, pos, ner into a .mrp_conllu_pre_processed file """
    out = filepath.split(opt.companion_suffix)[0] + ".mrp_conllu_pre_processed"
    logger.info("processing " + filepath)
    with open(out, 'w') as out_f:
        with open(filepath, 'r') as in_file:
            n = 0
            for graph, _ in mrp_read(in_file):
                n = n + 1
                if n % 500 == 0:
                    logger.info(n)
                # only add a ner feature from that
                #if graph.id not in ['bolt-eng-DF-170-181118-8875443_0097.13','bolt-eng-DF-170-181103-8882248_0335.5']:
                #    continue
                tokenized_text = ' '.join([node.label for node in graph.nodes])
                text = graph.input
                # normalize NEL (U+0085) and encoded spaces to a non-breaking space (U+00A0)
                text = text.replace(u"\u0085", u"\u00A0").replace("%20", u"\u00A0")
                tokenized_text = tokenized_text.replace(u"\u0085", u"\u00A0").replace("%20", u"\u00A0")
                if opt.frame == 'amr' or opt.frame == 'ucca':
                    # retokenize from the raw input; phrases come from the fixed joints.txt file
                    data = input_preprocessor.preprocess(
                        text, whiteSpace=False, token_combine=opt.token_combine)
                    # construct a new graph carrying the preprocessed features
                    new_graph = Graph(graph.id, graph.flavor, graph.framework)
                    new_graph.add_input(text)
                    for i in range(len(data['tok'])):
                        if "mwe" in data:
                            new_graph.add_node(
                                i,
                                label=data['tok'][i],
                                properties=["lemma", "pos", "ner", "mwe"],
                                values=[data['lem'][i], data['pos'][i], data['ner'][i], data['mwe'][i]],
                                anchors=data['anchors'][i])
                        else:
                            new_graph.add_node(
                                i,
                                label=data['tok'][i],
                                properties=["lemma", "pos", "ner"],
                                values=[data['lem'][i], data['pos'][i], data['ner'][i]],
                                anchors=data['anchors'][i])
                    out_f.write(json.dumps(new_graph.encode(), indent=None, ensure_ascii=False))
                    out_f.write("\n")
                else:
                    # keep the existing whitespace tokenization and only add ner/mwe as extra features
                    data = input_preprocessor.preprocess(
                        tokenized_text, whiteSpace=True, token_combine=opt.token_combine)
                    assert len(data['ner']) == len(graph.nodes), \
                        "preprocessed data length is not equal to the input in {}, {}".format(graph.encode(), data)
                    assert len(data['mwe']) == len(graph.nodes), \
                        "preprocessed data length is not equal to the input in {}, {}".format(graph.encode(), data)
                    for node in graph.nodes:
                        i = node.properties.index('xpos')
                        node.set_property('pos', node.values[i])
                        node.set_property('ner', data['ner'][node.id])
                        node.set_property('mwe', data['mwe'][node.id])
                    # write back the graph with the added pos/ner/mwe properties
                    out_f.write(json.dumps(graph.encode(), indent=None, ensure_ascii=False))
                    out_f.write("\n")
    logger.info("done processing " + filepath)
    logger.info(out + " is generated")
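# Sketch of how write_features_mrp would typically be driven (assumes the
# module-level `opt` carries input_folder, companion_suffix, frame and
# token_combine, and that `input_preprocessor` is already initialized;
# `companion_files` below is hypothetical):
#
#   companion_files = folder_to_files_path(opt.input_folder, opt.companion_suffix)
#   for companion_file in companion_files:
#       write_features_mrp(companion_file)   # emits <name>.mrp_conllu_pre_processed next to the input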
def mrp_utils_parser():
    parser = argparse.ArgumentParser(description='mrp_utils for selecting ids')
    parser.add_argument('--suffix', default=".mrp", type=str,
                        help="""suffix of files to combine""")
    parser.add_argument('--input_folder', default="", type=str,
                        help="""the build folder for dict and rules, data""")
    return parser


parser = mrp_utils_parser()
opt = parser.parse_args()

input_files = folder_to_files_path(opt.input_folder, opt.suffix)
id_files = folder_to_files_path(opt.input_folder, ".ids")

# index every graph by id across all input files
graph_dict = {}
for input_file in input_files:
    with open(input_file, 'r') as fp:
        for graph, _ in mrp_read(fp):
            graph_dict[graph.id] = graph

# for each .ids file, write the selected graphs into <id_file>.conllu
for id_file in id_files:
    x = 0
    with open(id_file, "r") as idfp, open(id_file + ".conllu", "w+") as cfp:
        for line in idfp:
            id = line.rstrip("\n")
            g = json.dumps(graph_dict[id].encode(), indent=None, ensure_ascii=False)
            cfp.write(g)
            cfp.write("\n")
            x = x + 1
    logger.info("{} is written into {}".format(x, id_file + ".conllu"))
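# Example invocation (the script name is hypothetical; --input_folder should
# contain both the .mrp graph files and the .ids files listing the graph ids
# to extract into per-list .conllu outputs):
#
#   python mrp_utils.py --input_folder data/conllu/ --suffix .mrp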
dev_ids = []
if opt.test_ids:
    with open(opt.test_ids, "r") as fp:
        test_ids = [line.rstrip('\n') for line in fp]
else:
    test_ids = []

all_ids = []
for input_file in input_files:
    train_set = []
    dev_set = []
    test_set = []
    remaining_set = []
    with open(input_file, 'r') as fp:
        graphs = list(mrp_read(fp))
        # within a framework, the same graph id may appear more than once;
        # only keep graphs whose id has not been seen yet
        deduplicated_graphs = []
        for graph, _ in graphs:
            if graph.id not in all_ids:
                deduplicated_graphs.append(graph)
                all_ids.append(graph.id)
            else:
                continue
        total = len(deduplicated_graphs)
        if opt.follow_ids_only:
            train_total = total
            dev_total = total