def recategorize_file(self, file_path):
    """Read AMRs from ``file_path``, recategorize each graph in place,
    and yield the AMR. Progress is logged every 1000 examples.
    """
    count = 0
    for amr in AMRIO.read(file_path):
        count += 1
        self.recategorize_graph(amr)
        yield amr
        if count % 1000 == 0:
            logger.info('Processed {} examples.'.format(count))
    logger.info('Done.\n')
def _get_senseless_node_counter(amr_train_files):
    """Build a frequency Counter over the senseless attribute values
    found on every graph node in the given AMR training files.
    """
    logger.info('Building the senseless node counter.')
    counter = Counter()
    for train_file in amr_train_files:
        for amr in AMRIO.read(train_file):
            for node in amr.graph.get_nodes():
                # Only the attribute value matters for the count.
                counter.update(
                    value for _, value in node.get_senseless_attributes())
    return counter
def _update_counter_from_train_files(self, amr_train_files, base_freq=1):
    """Populate the bidirectional (lemma, frame) co-occurrence counters
    from the frame attributes observed in the AMR training files.

    :param amr_train_files: iterable of AMR file paths to scan.
    :param base_freq: increment added per observed (lemma, frame) pair.
    """
    logger.info('Updating (lemma, frame) counter from AMR train files.')
    for train_file in amr_train_files:
        for amr in AMRIO.read(train_file):
            for node in amr.graph.get_nodes():
                for _, frame in node.get_frame_attributes():
                    # The lemma is the frame with its word-sense suffix removed.
                    lemma = re.sub(WORDSENSE_RE, '', frame)
                    self._update_counter(
                        self.lemma_frame_counter, lemma, frame, base_freq)
                    self._update_counter(
                        self.frame_lemma_counter, frame, lemma, base_freq)
def from_json(cls, file_path: str) -> 'TextAnonymizor':
    """Construct a TextAnonymizor from a JSON rules file.

    :param file_path: path to the anonymization-rules JSON file.
    """
    with open(file_path, encoding="utf-8") as f:
        rules = json.load(f)
    # Map the JSON keys onto the constructor's (partly underscore-prefixed)
    # keyword arguments, then build the instance in one call.
    kwargs = dict(
        text_maps=rules["text_maps"],
        priority_lists=rules["priority_lists"],
        _VNE=rules["VNE"],
        _LOCEN1=rules["LOCEN1"],
        _LOCEN2=rules["LOCEN2"],
        _N=rules["N"],
        _M=rules["M"],
        _R=rules["R"],
        _INVP=rules["INVP"],
        _INVS=rules["INVS"],
    )
    return cls(**kwargs)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser("text_anonymizor.py")
    parser.add_argument('--amr_file', required=True, help="File to anonymize.")
    parser.add_argument('--util_dir')
    args = parser.parse_args()

    # Load the anonymization rules that live alongside the other util files.
    text_anonymizor = TextAnonymizor.from_json(
        os.path.join(args.util_dir, "text_anonymization_rules.json"))

    with open(args.amr_file + ".recategorize", "w", encoding="utf-8") as out:
        for amr in AMRIO.read(args.amr_file):
            amr.abstract_map = text_anonymizor(amr)
            out.write(str(amr) + "\n\n")
for key, value in type_counter.items(): max_ner = None max_ner_count = None for ner, count in value.items(): if max_ner is None: max_ner = ner max_ner_count = count continue if count > max_ner_count: max_ner = ner max_ner_count = count type_ner_mapper[key] = max_ner for file_path in args.amr_files: for amr in AMRIO.read(file_path): for key, value in amr.abstract_map.items(): value_type = value["type"] value_span = value["span"] if len(value_span) <= 1: continue if (value_type == 'named-entity' or value_type == 'ordinal-entity') \ and distance(value_span, value["ops"]) > 3: continue value["ner"] = type_ner_mapper.get( value_span.lower(), { "named-entity": "PERSON", "url-entity": "URL",
def restore_file(self, file_path):
    """Yield every AMR from ``file_path`` after restoring it in place."""
    amrs = AMRIO.read(file_path)
    for instance in amrs:
        self.restore_instance(instance)
        yield instance