def main(config):
    token_vocab = Vocab(
        config.experiment_folder,
        "tokens",
        embedding_path=config.word_embedding,
        emb_dim=config.word_embedding_dim,
    )

    tag_vocab = Vocab(config.experiment_folder,
                      "tag",
                      embedding_path=config.tag_list)

    train_reader = ConllUReader(config.train_files, config, token_vocab,
                                tag_vocab, config.language)
    dev_reader = ConllUReader(config.dev_files, config, token_vocab,
                              train_reader.tag_vocab, config.language)
    detector = DetectionTrainer(config, token_vocab, tag_vocab)
    detector.train(train_reader, dev_reader)

    # CSR constructor: CSR(component_name, run_id, out_path, ...).
    res_collector = CSR("Event_hector_frames", 1, config.output, "belcat")

    test_reader = ConllUReader(config.test_files, config, token_vocab,
                               train_reader.tag_vocab, config.language)

    detector.predict(test_reader, res_collector)

    res_collector.run()
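
The config object is supplied by the caller; a minimal sketch of driving this main with a plain namespace follows (the attribute names are taken from the usages above, and every value is a hypothetical placeholder, not a real path):

# Hypothetical caller sketch; SimpleNamespace stands in for whatever
# config object the real pipeline builds from its command line.
from types import SimpleNamespace

if __name__ == "__main__":
    config = SimpleNamespace(
        experiment_folder="experiments/run1",     # placeholder path
        word_embedding="embeddings/vectors.txt",  # placeholder path
        word_embedding_dim=300,
        tag_list="resources/tags.txt",            # placeholder path
        train_files="data/train.conllu",
        dev_files="data/dev.conllu",
        test_files="data/test.conllu",
        language="en",
        output="output/",
    )
    main(config)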
Example 2
def main(config):
    assert config.conllu_folder is not None
    assert config.csr_output is not None

    if not os.path.exists(config.csr_output):
        os.makedirs(config.csr_output)

    # The JSON-LD version of the ontology.
    ontology = JsonOntologyLoader(config.ontology_path)

    child2root = {}
    if config.parent_children_tab and os.path.exists(
            config.parent_children_tab):
        logging.info("Reading parent child tab file: "
                     "" + config.parent_children_tab)
        child2root = read_parent_child_info(config.parent_children_tab)
    else:
        logging.warning("Will not read parent child tab file")

    if config.add_rule_detector:
        # The rule detector does not need an existing vocabulary.
        token_vocab = Vocab(config.resource_folder,
                            'tokens',
                            embedding_path=config.word_embedding,
                            emb_dim=config.word_embedding_dim,
                            ignore_existing=True)
        tag_vocab = Vocab(config.resource_folder,
                          'tag',
                          embedding_path=config.tag_list,
                          ignore_existing=True)
        detector = DetectionRunner(config, token_vocab, tag_vocab, ontology)

    ignore_edl = False
    if config.edl_json:
        if os.path.exists(config.edl_json):
            logging.info("Loading from EDL: {}".format(config.edl_json))
        else:
            logging.warning(
                "EDL output not found: {}, will be ignored.".format(
                    config.edl_json))
            ignore_edl = True

    for csr, docid in read_source(config.source_folder, config.language,
                                  ontology, child2root,
                                  'Frames_hector_combined',
                                  config.use_ltf_span_style,
                                  config.evt_merge_level):
        logging.info('Working with docid: {}'.format(docid))

        if config.edl_json and not ignore_edl:
            # Existence of the EDL path was already checked above.
            edl_file = find_by_id(config.edl_json, docid)
            if edl_file:
                logging.info("Adding EDL results: {}".format(edl_file))
                edl_entities = add_edl_entities(edl_file, csr)

                if config.relation_json:
                    if os.path.exists(config.relation_json):
                        relation_file = find_by_id(config.relation_json,
                                                   docid)
                        if relation_file:
                            logging.info("Adding relations between "
                                         "entities.")
                            add_entity_relations(relation_file,
                                                 edl_entities, csr)
                        else:
                            logging.error("Cannot find the relation file "
                                          "for {}".format(docid))

        conll_tokens = None
        # Initialized so the rule detector check below can test it safely.
        conll_file = None

        if config.conllu_folder:
            if os.path.exists(config.conllu_folder):
                conll_file = find_by_id(config.conllu_folder, docid)
                if not conll_file:
                    logging.warning("CoNLL file for doc {} is missing, please "
                                    "check your paths.".format(docid))
                    continue
                if config.rich_event_token:
                    conll_tokens = token_to_span(conll_file)
            else:
                logging.warning(
                    f"CoNLL folder not found at {config.conllu_folder}")

        if config.zie_event:
            if os.path.exists(config.zie_event):
                zie_event_file = find_by_id(config.zie_event, docid)
                if zie_event_file:
                    logging.info("Adding events with zie output: {}".format(
                        zie_event_file))
                    zie_integration_utils.add_zie_event(zie_event_file, csr)
                else:
                    logging.error(f"Cannot find zie output for {docid}")
            else:
                logging.info("No zie event output.")

        if config.rich_event:
            if os.path.exists(config.rich_event):
                rich_event_file = find_by_id(config.rich_event, docid)
                if rich_event_file:
                    logging.info("Adding events with rich output: {}".format(
                        rich_event_file))
                    add_rich_events(
                        csr,
                        rich_event_file,
                        provided_tokens=conll_tokens,
                        extent_based_provenance=config.extent_based_provenance)
                else:
                    logging.error(f"Cannot find rich output for {docid}")
            else:
                logging.info("No rich event output.")

        if config.dbpedia_wiki_json:
            if os.path.exists(config.dbpedia_wiki_json):
                wiki_file = find_by_id(config.dbpedia_wiki_json, docid)
                if wiki_file:
                    logging.info(
                        "Adding wiki linking from dbpedia spotlight: {}".
                        format(wiki_file))
                    add_entity_linking(csr, wiki_file, config.language)

        if config.salience_data:
            if not os.path.exists(config.salience_data):
                logging.info("No salience added.")
            else:
                scored_entities, scored_events = load_salience(
                    config.salience_data)
                if docid in scored_entities:
                    add_entity_salience(csr, scored_entities[docid])
                if docid in scored_events:
                    add_event_salience(csr, scored_events[docid])

        # Integrate COMEX here
        if not config.ignore_comex and config.comex:
            if not os.path.exists(config.comex):
                logging.info("No COMEX output")
            else:
                comex_file = find_by_id(config.comex, docid)
                if comex_file:
                    logging.info("Adding COMEX results: {}".format(comex_file))
                    comex_integration_utils.add_comex(comex_file, csr)

        # TODO: we could possibly remove all conll related stuff.
        if config.add_rule_detector and conll_file:
            logging.info("Reading on CoNLLU: {}".format(conll_file))
            # The conll files may contain information from another language.
            test_reader = ConllUReader([conll_file], config, token_vocab,
                                       tag_vocab, config.language)
            logging.info("Adding from ontology based rule detector.")
            detector.predict(test_reader, csr, 'maria_multilingual')

        csr.write(os.path.join(config.csr_output, docid + '.csr.json'))
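
find_by_id is used throughout to locate the per-document file for a docid inside a folder; a plausible sketch follows, assuming output files are named with the docid as a prefix (an assumption for illustration, not the actual helper):

# Hypothetical sketch of find_by_id: returns the first file in `folder`
# whose name starts with `docid`, or None if nothing matches.
import os

def find_by_id(folder, docid):
    for filename in sorted(os.listdir(folder)):
        if filename.startswith(docid):
            return os.path.join(folder, filename)
    return None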
Example 3
def main(config):
    assert config.conllu_folder is not None
    assert config.csr_output is not None

    if config.salience_data:
        scored_entities, scored_events = load_salience(config.salience_data)

    if not os.path.exists(config.csr_output):
        os.makedirs(config.csr_output)

    aida_ontology = OntologyLoader(config.ontology_path)
    onto_mapper = MappingLoader()
    onto_mapper.load_seedling_arg_mapping(config.seedling_argument_mapping)
    onto_mapper.load_seedling_event_mapping(config.seedling_event_mapping)

    if config.add_rule_detector:
        # The rule detector does not need an existing vocabulary.
        token_vocab = Vocab(config.resource_folder, 'tokens',
                            embedding_path=config.word_embedding,
                            emb_dim=config.word_embedding_dim,
                            ignore_existing=True)
        tag_vocab = Vocab(config.resource_folder, 'tag',
                          embedding_path=config.tag_list,
                          ignore_existing=True)
        detector = DetectionRunner(config, token_vocab, tag_vocab)

    ignore_edl = False
    if config.edl_json:
        if os.path.exists(config.edl_json):
            logging.info("Loading from EDL: {}".format(config.edl_json))
        else:
            logging.warning("EDL output not found: {}, will be ignored.".format(
                config.edl_json))
            ignore_edl = True

    for csr, docid in read_source(config.source_folder, config.csr_output,
                                  config.language, aida_ontology, onto_mapper):
        logging.info('Working with docid: {}'.format(docid))
        if config.edl_json and not ignore_edl:
            edl_file = find_by_id(config.edl_json, docid)
            if edl_file:
                logging.info("Predicting with EDL: {}".format(edl_file))
                add_edl_entities(edl_file, csr)

        conll_file = find_by_id(config.conllu_folder, docid)
        if not conll_file:
            logging.warning("CoNLL file for doc {} is missing, please "
                            "check your paths.".format(docid))
            continue

        tokens = None

        if config.rich_event_token:
            tokens = token_to_span(conll_file)

        if config.rich_event:
            rich_event_file = find_by_id(config.rich_event, docid)
            if rich_event_file:
                logging.info(
                    "Adding events with rich output: {}".format(
                        rich_event_file))
                add_rich_events(rich_event_file, csr, tokens)

        if config.salience_data:
            if docid in scored_entities:
                add_entity_salience(csr, scored_entities[docid])
            if docid in scored_events:
                add_event_salience(csr, scored_events[docid])

        logging.info("Reading on CoNLLU: {}".format(conll_file))
        # The conll files may contain information from another language.
        test_reader = ConllUReader([conll_file], config, token_vocab,
                                   tag_vocab, config.language)

        if config.add_rule_detector:
            logging.info("Adding from ontology based rule detector.")
            # Adding rule detector. This is the last detector that use other
            # information from the CSR, including entity and events.
            detector.predict(test_reader, csr)

            # align_ontology(csr, aida_ontology)

        csr.write()
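
token_to_span presumably maps each CoNLL token to a character span so rich-event output can be aligned back to the text; a hedged sketch follows, assuming offsets are stored in the MISC column as "start,end" (plain CoNLL-U carries no offsets, so the real format may differ):

# Hypothetical sketch of token_to_span for a CoNLL-U file whose MISC
# column stores character offsets as "start,end".
def token_to_span(conll_file):
    spans = []
    with open(conll_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip sentence breaks and comment lines
            fields = line.split("\t")
            token, misc = fields[1], fields[-1]
            start, end = (int(x) for x in misc.split(",")[:2])
            spans.append((token, (start, end)))
    return spans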