def main(config):
    token_vocab = Vocab(
        config.experiment_folder,
        "tokens",
        embedding_path=config.word_embedding,
        emb_dim=config.word_embedding_dim,
    )
    tag_vocab = Vocab(
        config.experiment_folder, "tag", embedding_path=config.tag_list
    )

    train_reader = ConllUReader(
        config.train_files, config, token_vocab, tag_vocab, config.language
    )
    dev_reader = ConllUReader(
        config.dev_files, config, token_vocab, train_reader.tag_vocab,
        config.language
    )

    # Train on the training split, validating against the dev split.
    detector = DetectionTrainer(config, token_vocab, tag_vocab)
    detector.train(train_reader, dev_reader)

    # CSR signature: __init__(self, component_name, run_id, out_path).
    res_collector = CSR("Event_hector_frames", 1, config.output, "belcat")

    # The test reader shares the training tag vocabulary and language setting.
    test_reader = ConllUReader(
        config.test_files, config, token_vocab, train_reader.tag_vocab,
        config.language
    )
    detector.predict(test_reader, res_collector)
    res_collector.run()
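# Invocation sketch (assumption): `main` only needs a `config` object carrying
# the attributes read above, so a `types.SimpleNamespace` stand-in is enough
# for illustration. All paths and values below are hypothetical; the project's
# real config loader may differ.
if __name__ == '__main__':
    from types import SimpleNamespace

    config = SimpleNamespace(
        experiment_folder='experiments/run1',        # hypothetical path
        word_embedding='resources/embeddings.vec',   # hypothetical path
        word_embedding_dim=300,
        tag_list='resources/tags.txt',               # hypothetical path
        train_files=['data/train.conllu'],
        dev_files=['data/dev.conllu'],
        test_files=['data/test.conllu'],
        language='en',
        output='output/',
    )
    main(config)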
def main(config):
    assert config.conllu_folder is not None
    assert config.csr_output is not None

    if not os.path.exists(config.csr_output):
        os.makedirs(config.csr_output)

    # The JSON-LD version of the ontology.
    ontology = JsonOntologyLoader(config.ontology_path)

    child2root = {}
    if config.parent_children_tab and os.path.exists(
            config.parent_children_tab):
        logging.info(
            "Reading parent child tab file: " + config.parent_children_tab)
        child2root = read_parent_child_info(config.parent_children_tab)
    else:
        logging.warning("Will not read parent child tab file")

    if config.add_rule_detector:
        # The rule detector does not need an existing vocabulary.
        token_vocab = Vocab(config.resource_folder, 'tokens',
                            embedding_path=config.word_embedding,
                            emb_dim=config.word_embedding_dim,
                            ignore_existing=True)
        tag_vocab = Vocab(config.resource_folder, 'tag',
                          embedding_path=config.tag_list,
                          ignore_existing=True)
        detector = DetectionRunner(config, token_vocab, tag_vocab, ontology)

    ignore_edl = False
    if config.edl_json:
        if os.path.exists(config.edl_json):
            logging.info("Loading from EDL: {}".format(config.edl_json))
        else:
            logging.warning(
                "EDL output not found: {}, will be ignored.".format(
                    config.edl_json))
            ignore_edl = True

    for csr, docid in read_source(config.source_folder, config.language,
                                  ontology, child2root,
                                  'Frames_hector_combined',
                                  config.use_ltf_span_style,
                                  config.evt_merge_level):
        logging.info('Working with docid: {}'.format(docid))

        if config.edl_json and not ignore_edl:
            if not os.path.exists(config.edl_json):
                logging.info("No EDL output")
            else:
                edl_file = find_by_id(config.edl_json, docid)
                if edl_file:
                    logging.info("Adding EDL results: {}".format(edl_file))
                    edl_entities = add_edl_entities(edl_file, csr)

                    if config.relation_json:
                        if os.path.exists(config.relation_json):
                            relation_file = find_by_id(
                                config.relation_json, docid)
                            if relation_file:
                                logging.info(
                                    "Adding relations between entities.")
                                add_entity_relations(
                                    relation_file, edl_entities, csr)
                            else:
                                logging.error(
                                    "Cannot find the relation file "
                                    "for {}".format(docid))

        # `conll_file` stays None when the CoNLL folder is absent, so the
        # rule detector step below can be skipped safely.
        conll_file = None
        conll_tokens = None
        if config.conllu_folder:
            if os.path.exists(config.conllu_folder):
                conll_file = find_by_id(config.conllu_folder, docid)
                if not conll_file:
                    logging.warning(
                        "CoNLL file for doc {} is missing, please "
                        "check your paths.".format(docid))
                    continue
                if config.rich_event_token:
                    conll_tokens = token_to_span(conll_file)
            else:
                logging.warning(
                    f"CoNLL folder not found at {config.conllu_folder}")

        if config.zie_event:
            if os.path.exists(config.zie_event):
                zie_event_file = find_by_id(config.zie_event, docid)
                if zie_event_file:
                    logging.info("Adding events with zie output: {}".format(
                        zie_event_file))
                    zie_integration_utils.add_zie_event(zie_event_file, csr)
                else:
                    logging.error(f"Cannot find zie output for {docid}")
            else:
                logging.info("No zie event output.")

        if config.rich_event:
            if os.path.exists(config.rich_event):
                rich_event_file = find_by_id(config.rich_event, docid)
                if rich_event_file:
                    logging.info("Adding events with rich output: {}".format(
                        rich_event_file))
                    add_rich_events(
                        csr, rich_event_file, provided_tokens=conll_tokens,
                        extent_based_provenance=config.extent_based_provenance)
                else:
                    logging.error(f"Cannot find rich output for {docid}")
            else:
                logging.info("No rich event output.")

        if config.dbpedia_wiki_json:
            if os.path.exists(config.dbpedia_wiki_json):
                wiki_file = find_by_id(config.dbpedia_wiki_json, docid)
                if wiki_file:
                    logging.info(
                        "Adding wiki linking from dbpedia spotlight: "
                        "{}".format(wiki_file))
                    add_entity_linking(csr, wiki_file, config.language)

        if config.salience_data:
            if not os.path.exists(config.salience_data):
                logging.info("No salience added.")
            else:
                scored_entities, scored_events = load_salience(
                    config.salience_data)
                if docid in scored_entities:
                    add_entity_salience(csr, scored_entities[docid])
                if docid in scored_events:
                    add_event_salience(csr, scored_events[docid])

        # Integrate COMEX output here.
        if config.ignore_comex is False and config.comex:
            if not os.path.exists(config.comex):
                logging.info("No COMEX output")
            else:
                comex_file = find_by_id(config.comex, docid)
                if comex_file:
                    logging.info(
                        "Adding COMEX results: {}".format(comex_file))
                    comex_integration_utils.add_comex(comex_file, csr)

        # TODO: we could possibly remove all conll related stuff.
        if config.add_rule_detector and conll_file:
            logging.info("Reading on CoNLLU: {}".format(conll_file))
            # The conll files may contain information from another language.
            test_reader = ConllUReader([conll_file], config, token_vocab,
                                       tag_vocab, config.language)
            logging.info("Adding from ontology based rule detector.")
            detector.predict(test_reader, csr, 'maria_multilingual')

        csr.write(os.path.join(config.csr_output, docid + '.csr.json'))
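# Sketch (assumption): `find_by_id` above resolves a docid to a single file
# inside a folder of per-document outputs. A minimal helper consistent with
# that usage could look like the hypothetical function below; the project's
# real implementation may match filenames differently.
def find_by_id_sketch(folder, docid):
    """Return the path of the first file in `folder` whose name starts with
    `docid`, or None when no such file exists."""
    for filename in sorted(os.listdir(folder)):
        if filename.startswith(docid):
            return os.path.join(folder, filename)
    return None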
def main(config):
    assert config.conllu_folder is not None
    assert config.csr_output is not None

    if config.salience_data:
        scored_entities, scored_events = load_salience(config.salience_data)

    if not os.path.exists(config.csr_output):
        os.makedirs(config.csr_output)

    aida_ontology = OntologyLoader(config.ontology_path)
    onto_mapper = MappingLoader()
    onto_mapper.load_seedling_arg_mapping(config.seedling_argument_mapping)
    onto_mapper.load_seedling_event_mapping(config.seedling_event_mapping)

    if config.add_rule_detector:
        # The rule detector does not need an existing vocabulary.
        token_vocab = Vocab(config.resource_folder, 'tokens',
                            embedding_path=config.word_embedding,
                            emb_dim=config.word_embedding_dim,
                            ignore_existing=True)
        tag_vocab = Vocab(config.resource_folder, 'tag',
                          embedding_path=config.tag_list,
                          ignore_existing=True)
        detector = DetectionRunner(config, token_vocab, tag_vocab)

    ignore_edl = False
    if config.edl_json:
        if os.path.exists(config.edl_json):
            logging.info("Loading from EDL: {}".format(config.edl_json))
        else:
            logging.warning(
                "EDL output not found: {}, will be ignored.".format(
                    config.edl_json))
            ignore_edl = True

    for csr, docid in read_source(config.source_folder, config.csr_output,
                                  config.language, aida_ontology,
                                  onto_mapper):
        logging.info('Working with docid: {}'.format(docid))

        if config.edl_json and not ignore_edl:
            edl_file = find_by_id(config.edl_json, docid)
            if edl_file:
                logging.info("Predicting with EDL: {}".format(edl_file))
                add_edl_entities(edl_file, csr)

        conll_file = find_by_id(config.conllu_folder, docid)
        if not conll_file:
            logging.warning("CoNLL file for doc {} is missing, please "
                            "check your paths.".format(docid))
            continue

        tokens = None
        if config.rich_event_token:
            tokens = token_to_span(conll_file)

        if config.rich_event:
            rich_event_file = find_by_id(config.rich_event, docid)
            if rich_event_file:
                logging.info("Adding events with rich output: {}".format(
                    rich_event_file))
                add_rich_events(rich_event_file, csr, tokens)

        if config.salience_data:
            if docid in scored_entities:
                add_entity_salience(csr, scored_entities[docid])
            if docid in scored_events:
                add_event_salience(csr, scored_events[docid])

        if config.add_rule_detector:
            # Build the reader only when the detector (and its vocabularies)
            # exists. The conll files may contain information from another
            # language.
            logging.info("Reading on CoNLLU: {}".format(conll_file))
            test_reader = ConllUReader([conll_file], config, token_vocab,
                                       tag_vocab, config.language)
            logging.info("Adding from ontology based rule detector.")
            # This is the last detector to run, since it uses other
            # information already in the CSR, including entities and events.
            detector.predict(test_reader, csr)
            # align_ontology(csr, aida_ontology)

        csr.write()
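# Entry-point sketch (assumption): one way to build `config` from the command
# line for the variant above. The flag names simply mirror the attributes
# read in `main`; the project's actual CLI wiring may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='CSR pipeline (sketch)')
    parser.add_argument('--conllu_folder', required=True)
    parser.add_argument('--csr_output', required=True)
    parser.add_argument('--source_folder', required=True)
    parser.add_argument('--ontology_path', required=True)
    parser.add_argument('--seedling_argument_mapping', required=True)
    parser.add_argument('--seedling_event_mapping', required=True)
    parser.add_argument('--language', default='en')
    parser.add_argument('--add_rule_detector', action='store_true')
    parser.add_argument('--rich_event_token', action='store_true')
    # Optional inputs; leaving one unset disables the corresponding step.
    for flag in ('--edl_json', '--rich_event', '--salience_data',
                 '--resource_folder', '--word_embedding', '--tag_list'):
        parser.add_argument(flag, default=None)
    parser.add_argument('--word_embedding_dim', type=int, default=300)
    main(parser.parse_args())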