def _read(self, file_path):
    # If `file_path` is a URL, redirect to the cache.
    file_path = cached_path(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)
    for amr in AMRIO.read(file_path):
        yield self.text_to_instance(amr)
    self.report_coverage()
def recategorize_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path), 1):
        self.recategorize_graph(amr)
        yield amr
        if i % 1000 == 0:
            logger.info('Processed {} examples.'.format(i))
    logger.info('Done.\n')
def _update_counter_from_train_files(self, amr_train_files, base_freq=1):
    logger.info('Updating (lemma, frame) counter from AMR train files.')
    for file_path in amr_train_files:
        for amr in AMRIO.read(file_path):
            for node in amr.graph.get_nodes():
                for _, frame in node.get_frame_attributes():
                    # Strip the word-sense suffix to recover the frame lemma.
                    frame_lemma = re.sub(WORDSENSE_RE, '', frame)
                    self._update_counter(self.lemma_frame_counter, frame_lemma, frame, base_freq)
                    self._update_counter(self.frame_lemma_counter, frame, frame_lemma, base_freq)
def _get_senseless_node_counter(amr_train_files):
    logger.info('Building the senseless node counter.')
    sense_less_nodes = []
    for amr_file in amr_train_files:
        for amr in AMRIO.read(amr_file):
            for node in amr.graph.get_nodes():
                for attr, value in node.get_senseless_attributes():
                    sense_less_nodes.append(value)
    return Counter(sense_less_nodes)
def dump_spotlight_wiki(self, file_path):
    sent_map = {}
    for i, amr in enumerate(AMRIO.read(file_path), 1):
        if i % 20 == 0:
            print('+', end='')
        sent = amr.sentence
        wiki = self.spotlight_wiki(sent)
        sent_map[sent] = wiki
        sleep(0.1)
    with open(os.path.join(self.util_dir, 'spotlight_wiki.json'), 'w', encoding='utf-8') as f:
        json.dump(sent_map, f)
def dump_spotlight_wiki(self, amr_files):
    # NOTE: This function has been changed by Deng Cai to accept multiple AMR files.
    sent_map = {}
    for file_path in amr_files:
        for i, amr in enumerate(AMRIO.read(file_path), 1):
            if i % 20 == 0:
                print('+', end='')
            sent = amr.sentence
            wiki = self.spotlight_wiki(sent)
            sent_map[sent] = wiki
            sleep(0.1)
    with open(os.path.join(self.util_dir, 'spotlight_wiki.json'), 'w', encoding='utf-8') as f:
        json.dump(sent_map, f)
def create_dependency_parser_feature_from_file(annotator, filepath):
    dependency_feature_data = []
    amrs = []
    sentence_ids = []
    with open(filepath + '.features', 'w', encoding='utf-8') as f:
        for i, amr in enumerate(AMRIO.read(filepath), 1):
            if i % 100 == 0:
                print('{} processed.'.format(i))
            annotation = annotator.annotate(amr.sentence)
            dump_amr_features(amr, annotation, f)
            sentence_data = create_dependency_parser_feature(annotation, amr.sentence, i)
            dependency_feature_data.append(sentence_data)
            sentence_ids.append(i)
            amrs.append(amr)

    def _flatten(field):
        # Concatenate the per-sentence lists for `field` into a single column.
        return sum([sentence_data[field] for sentence_data in dependency_feature_data], [])

    dataset_dict = {
        'sentence_id': _flatten('sentence_id'),
        'sequence': _flatten('sequence'),
        'parent': _flatten('parent'),
        'parent_position': _flatten('parent_position'),
        'child': _flatten('child'),
        'child_position': _flatten('child_position'),
        'is_root': _flatten('is_root'),
        'parent_ner': _flatten('parent_ner'),
        'child_ner': _flatten('child_ner'),
        'parent_pos': _flatten('parent_pos'),
        'dependency_role': _flatten('dependency_role'),
        'child_pos': _flatten('child_pos'),
    }
    amr_dict = {
        'sentence_id': sentence_ids,
        'amr': [str(amr.graph) for amr in amrs],
    }
    dependency_feature_df = pd.DataFrame(dataset_dict)
    amr_df = pd.DataFrame(amr_dict)
    return dependency_feature_df, amr_df
def load_data(self):
    logger.info("Parsing and linearizing the AMR dataset")

    train_amr = AMRIO.read(self.train_file)
    for i, amr in tqdm(enumerate(train_amr), desc='Train AMR'):
        if self.small and i > 50:
            break
        # Raw version
        raw_amr = []
        for amr_line in str(amr.graph).splitlines():
            stripped_amr = amr_line.strip()
            raw_amr.append(stripped_amr)
        self.X_train_raw.append(" ".join(raw_amr))
        linearized_amr = self.get_list(amr)
        self.X_train.append(linearized_amr[1:])
        self.Y_train.append(amr.sentence)
        self.Y_train_tok.append(amr.tokens)
        # Vocabulary: create dictionaries and build the simplified list.
        simpl = list()
        simpl_only_nodes = list()
        for step in linearized_amr:
            if step not in self.lin_to_int:
                self.lin_to_int[step] = len(self.lin_to_int)
                self.int_to_lin[len(self.int_to_lin)] = step
            # Simplified AMR version
            step, edge = self.simplify(step)
            simpl.append(step)
            if not step.startswith(":"):
                simpl_only_nodes.append(step)
            # Identify edges and save them.
            if edge and step not in self.edges:
                self.edges.append(step)
        self.X_train_simple.append(simpl)
        self.X_train_simple_only_nodes.append(simpl_only_nodes)
        sent = amr.sentence.split()
        for word in sent:
            if word not in self.word_to_int:
                self.word_to_int[word] = len(self.word_to_int)
                self.int_to_word[len(self.int_to_word)] = word

    if self.use_silver_data:
        print("Processing silver data from", self.silver_train_file)
        ii = 0
        silver_train_amr = AMRIO.read(self.silver_train_file)
        for i, amr in enumerate(silver_train_amr):
            if self.small and i > 50:
                break
            # Raw version
            raw_amr = []
            ii += 1
            linearized_amr = self.get_list(amr)
            if linearized_amr is None:
                continue
            for amr_line in str(amr.graph).splitlines():
                stripped_amr = amr_line.strip()
                raw_amr.append(stripped_amr)
            self.X_silver_train_raw.append(" ".join(raw_amr))
            self.X_silver_train.append(linearized_amr[1:])
            self.Y_silver_train.append(amr.sentence)
            self.Y_silver_train_tok.append(amr.tokens)
            # Vocabulary: create dictionaries and build the simplified list.
            simpl = list()
            simpl_only_nodes = list()
            for step in linearized_amr:
                if step not in self.lin_to_int:
                    self.lin_to_int[step] = len(self.lin_to_int)
                    self.int_to_lin[len(self.int_to_lin)] = step
                # Simplified AMR version
                step, edge = self.simplify(step)
                simpl.append(step)
                if not step.startswith(":"):
                    simpl_only_nodes.append(step)
                # Identify edges and save them.
                if edge and step not in self.edges:
                    self.edges.append(step)
            self.X_silver_train_simple.append(simpl)
            self.X_silver_train_simple_only_nodes.append(simpl_only_nodes)
            sent = amr.sentence.split()
            for word in sent:
                if word not in self.word_to_int:
                    self.word_to_int[word] = len(self.word_to_int)
                    self.int_to_word[len(self.int_to_word)] = word
        print("Silver data with size:", len(self.X_silver_train_raw))
    else:
        print("No silver data processed.")

    dev_amr = AMRIO.read(self.dev_file)
    for i, amr in tqdm(enumerate(dev_amr), desc='Dev AMR'):
        if self.small and i > 50:
            break
        # Raw version
        raw_amr = []
        for amr_line in str(amr.graph).splitlines():
            stripped_amr = amr_line.strip()
            raw_amr.append(stripped_amr)
        self.X_dev_raw.append(" ".join(raw_amr))
        linearized_amr = self.get_list(amr)
        self.X_dev.append(linearized_amr[1:])
        self.Y_dev.append(amr.sentence)
        self.Y_dev_tok.append(amr.tokens)
        # Simplified AMR version
        simpl = list()
        simpl_only_nodes = list()
        for step in linearized_amr:
            step, edge = self.simplify(step)
            simpl.append(step)
            if not step.startswith(":"):
                simpl_only_nodes.append(step)
            if edge and step not in self.edges:
                self.edges.append(step)
        self.X_dev_simple.append(simpl)
        self.X_dev_simple_only_nodes.append(simpl_only_nodes)

    test_amr = AMRIO.read(self.test_file)
    self.amr_test = test_amr
    for i, amr in tqdm(enumerate(test_amr), desc='Test AMR'):
        if self.small and i > 50:
            break
        # Raw version
        raw_amr = []
        for amr_line in str(amr.graph).splitlines():
            stripped_amr = amr_line.strip()
            raw_amr.append(stripped_amr)
        self.X_test_raw.append(" ".join(raw_amr))
        linearized_amr = self.get_list(amr)
        self.X_test.append(linearized_amr[1:])
        self.Y_test.append(amr.sentence)
        self.Y_test_tok.append(amr.tokens)
        # Simplified AMR version
        simpl = list()
        simpl_only_nodes = list()
        for step in linearized_amr:
            step, edge = self.simplify(step)
            simpl.append(step)
            if not step.startswith(":"):
                simpl_only_nodes.append(step)
            if edge and step not in self.edges:
                self.edges.append(step)
        self.X_test_simple.append(simpl)
        self.X_test_simple_only_nodes.append(simpl_only_nodes)
def expand_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path)):
        self.expand_graph(amr)
        yield amr
    self.print_stats()
if __name__ == '__main__':
    import argparse

    from stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('feature_annotator.py')
    parser.add_argument('files', nargs='+', help='files to annotate.')
    parser.add_argument('--compound_file', default='')
    args = parser.parse_args()

    annotator = FeatureAnnotator('http://localhost:9000', args.compound_file)

    for file_path in args.files:
        logger.info('Processing {}'.format(file_path))
        for i, amr in enumerate(AMRIO.read(file_path), 1):
            if i % 1000 == 0:
                logger.info('{} processed.'.format(i))
            annotation = annotator(amr.sentence)
            amr.tokens = annotation['tokens']
            amr.lemmas = annotation['lemmas']
            amr.pos_tags = annotation['pos_tags']
            amr.ner_tags = annotation['ner_tags']
            amr.original = annotation['original']
            with open(file_path + '.features', 'a', encoding='utf-8') as f:
                AMRIO.dump([amr], f)
    logger.info('Done!')
parser = argparse.ArgumentParser("text_anonymizor.py")
parser.add_argument('--amr_file', nargs="+", required=True, help="File to anonymize.")
parser.add_argument('--util_dir')
parser.add_argument('--lang')
parser.add_argument('--exclude_ners', action="store_true",
                    help="consider NER tags for entities not found in training.")
args = parser.parse_args()

if args.lang == "en":
    text_anonymizor = TextAnonymizor.from_json(
        os.path.join(args.util_dir, "text_anonymization_rules.json"))
    lang_stopwords = None
    lang2en_span = None
    lang2en_bn = None
else:
    text_anonymizor = TextAnonymizor.from_json(
        os.path.join(args.util_dir, "text_anonymization_en-{}.json".format(args.lang)))
    lang_stopwords = set(
        x.rstrip() for x in
        open("data/cross-lingual-babelnet_mappings/stopwords_{}.txt".format(args.lang)))
    lang2en_span = load_name_span_map(
        "data/cross-lingual-babelnet_mappings/name_span_en_{}_map_amr_bn.json".format(args.lang),
        args.lang)
    lang2en_bn = load_name_bn_wiki_map(
        "data/cross-lingual-babelnet_mappings/namedEntity_wiki_synsets.{}.tsv".format(args.lang.upper()))

for amr_file in args.amr_file:
    with open(amr_file + ".recategorize{}".format("_noner" if args.exclude_ners else ""),
              "w", encoding="utf-8") as f:
        for amr in tqdm(AMRIO.read(amr_file, lang=args.lang)):
            amr.abstract_map = text_anonymizor(amr)
            f.write(str(amr) + "\n\n")
def wikify_file(self, file_path):
    for i, amr in enumerate(AMRIO.read(file_path)):
        self.wikify_graph(amr)
        yield amr
        return cls(
            text_maps=d["text_maps"],
            priority_lists=d["priority_lists"],
            _VNE=d["VNE"],
            _LOCEN1=d["LOCEN1"],
            _LOCEN2=d["LOCEN2"],
            _N=d["N"],
            _M=d["M"],
            _R=d["R"],
            _INVP=d["INVP"],
            _INVS=d["INVS"],
        )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser("text_anonymizor.py")
    parser.add_argument('--amr_file', required=True, help="File to anonymize.")
    parser.add_argument('--util_dir')
    args = parser.parse_args()

    text_anonymizor = TextAnonymizor.from_json(
        os.path.join(args.util_dir, "text_anonymization_rules.json"))

    with open(args.amr_file + ".recategorize", "w", encoding="utf-8") as f:
        for amr in AMRIO.read(args.amr_file):
            amr.abstract_map = text_anonymizor(amr)
            f.write(str(amr) + "\n\n")
            if token == '911':
                index = i
                break
        else:
            break
        amr.replace_span([index], ['09', '11'], ['CD', 'CD'], ['DATE', 'DATE'])


def replace_NT_dollar_abbr(amr):
    # Replace 'NT' in front of '$' with 'Taiwan'.
    for i, token in enumerate(amr.tokens):
        if token == 'NT' and len(amr.tokens) > i + 1 and amr.tokens[i + 1] in ('$', 'dollars', 'dollar'):
            amr.replace_span([i], ['Taiwan'], ['NNP'], ['COUNTRY'])


if __name__ == '__main__':
    import argparse

    from stog.data.dataset_readers.amr_parsing.io import AMRIO

    parser = argparse.ArgumentParser('input_cleaner.py')
    parser.add_argument('--amr_files', nargs='+', default=[])
    args = parser.parse_args()

    for file_path in args.amr_files:
        with open(file_path + '.input_clean', 'w', encoding='utf-8') as f:
            for amr in AMRIO.read(file_path):
                clean(amr)
                f.write(str(amr) + '\n\n')
def read(self, file_path):
    for amr in AMRIO.read(file_path):
        yield self(amr)
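# For context, the generator-style transforms above (`recategorize_file`, `expand_file`,
# `wikify_file`, and `read`) are all driven the same way as the anonymization and cleaning
# entry points earlier in this section: iterate over the generator and write each transformed
# AMR back out, separated by a blank line. The sketch below is a minimal, hypothetical driver;
# the `Preprocessor` name and the `.preproc` output suffix are assumptions, not part of the
# original code, while the write-out pattern mirrors text_anonymizor.py and input_cleaner.py.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser('preprocess.py')
    parser.add_argument('--amr_files', nargs='+', default=[])
    args = parser.parse_args()

    processor = Preprocessor()  # hypothetical: any object exposing read(file_path) as above
    for file_path in args.amr_files:
        with open(file_path + '.preproc', 'w', encoding='utf-8') as f:
            for amr in processor.read(file_path):
                f.write(str(amr) + '\n\n')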