def test_wiki_online():
    """Exercise a default-constructed WikipediaRelationExtraction on sample mentions.

    The first mention paired with itself, and with the second mention, must
    yield at least one of the Wikipedia relation types listed below.  Paired
    with the third mention, the element popped from the result must be
    ``RelationType.NO_RELATION_FOUND``.
    """
    mentions = get_wiki_mentions()
    extractor = WikipediaRelationExtraction()

    # Any one of these relation types is enough to satisfy the first two checks.
    accepted_types = {
        RelationType.WIKIPEDIA_CATEGORY,
        RelationType.WIKIPEDIA_REDIRECT_LINK,
        RelationType.WIKIPEDIA_BE_COMP,
    }

    self_relations = extractor.extract_all_relations(mentions[0], mentions[0])
    assert not self_relations.isdisjoint(accepted_types)

    pair_relations = extractor.extract_all_relations(mentions[0], mentions[1])
    assert not pair_relations.isdisjoint(accepted_types)

    other_relations = extractor.extract_all_relations(mentions[0], mentions[2])
    assert other_relations.pop() == RelationType.NO_RELATION_FOUND
def run_example():
    """Run each relation extractor on one hard-coded mention pair and log the outcome.

    Six extractors are built (computed, referent-dict, VerbOcean, Wikipedia,
    Wordnet and ELMO word-embedding), each is asked for all relations between
    the mentions "IBM" and "International Business Machines", and one log line
    per extractor reports either the relations found or that none was found.
    """
    logger.info("Running relation extraction example......")
    computed = ComputedRelationExtraction()
    ref_dict = ReferentDictRelationExtraction(
        ref_dict=str(LIBRARY_ROOT / "datasets" / "coref.dict1.tsv")
    )
    vo = VerboceanRelationExtraction(
        vo_file=str(LIBRARY_ROOT / "datasets" / "verbocean.unrefined.2004-05-20.txt")
    )
    wiki = WikipediaRelationExtraction()
    wn = WordnetRelationExtraction()
    embed = WordEmbeddingRelationExtraction(method=EmbeddingMethod.ELMO)

    mention_x1 = MentionDataLight(
        "IBM",
        mention_context="IBM manufactures and markets computer hardware, middleware and software",
    )
    mention_y1 = MentionDataLight(
        "International Business Machines",
        mention_context="International Business Machines Corporation is an "
        "American multinational information technology company",
    )

    # Extraction order is kept exactly as before; all extraction happens
    # before any result is reported.
    computed_relations = computed.extract_all_relations(mention_x1, mention_y1)
    ref_dict_relations = ref_dict.extract_all_relations(mention_x1, mention_y1)
    vo_relations = vo.extract_all_relations(mention_x1, mention_y1)
    wiki_relations = wiki.extract_all_relations(mention_x1, mention_y1)
    embed_relations = embed.extract_all_relations(mention_x1, mention_y1)
    wn_relations = wn.extract_all_relations(mention_x1, mention_y1)

    # (relations, message when nothing was found, message template when found)
    reports = (
        (computed_relations, "No Computed relation found", "Found Computed relations-%s"),
        (
            ref_dict_relations,
            "No Referent-Dict relation found",
            "Found Referent-Dict relations-%s",
        ),
        (vo_relations, "No Verb-Ocean relation found", "Found Verb-Ocean relations-%s"),
        (wiki_relations, "No Wikipedia relation found", "Found Wikipedia relations-%s"),
        (embed_relations, "No Embedded relation found", "Found Embedded relations-%s"),
        (wn_relations, "No Wordnet relation found", "Found Wordnet relations-%s"),
    )
    for relations, not_found_msg, found_msg in reports:
        if RelationType.NO_RELATION_FOUND in relations:
            logger.info(not_found_msg)
        else:
            # Every extractor's result is rendered as a list so all six
            # report lines share one format.
            logger.info(found_msg, str(list(relations)))
# Example #3
# 0
def run_example():
    """Run the relation extractors on one hard-coded mention pair and log the outcome.

    The computed, referent-dict, VerbOcean and word-embedding extractors are
    asked for all relations; the Wikipedia and Wordnet extractors are asked
    for a single sub-relation each (redirect link and derivationally,
    respectively).  One log line per extractor reports the result.
    """
    logger.info('Running relation extraction example......')

    computed_extractor = ComputedRelationExtraction()
    ref_dict_extractor = ReferentDictRelationExtraction(
        OnlineOROfflineMethod.ONLINE, LIBRARY_ROOT + '/datasets/ref.dict1.tsv'
    )
    verbocean_extractor = VerboceanRelationExtraction(
        OnlineOROfflineMethod.ONLINE,
        LIBRARY_ROOT + '/datasets/verbocean.unrefined.2004-05-20.txt',
    )
    wiki_extractor = WikipediaRelationExtraction(WikipediaSearchMethod.ONLINE)
    wordnet_extractor = WordnetRelationExtraction(OnlineOROfflineMethod.ONLINE)

    first_mention = MentionDataLight(
        'IBM',
        mention_context='IBM manufactures and markets computer hardware, middleware and software',
    )
    second_mention = MentionDataLight(
        'International Business Machines',
        mention_context='International Business Machines Corporation is an '
                        'American multinational information technology company',
    )

    computed_found = computed_extractor.extract_all_relations(first_mention, second_mention)
    ref_dict_found = ref_dict_extractor.extract_all_relations(first_mention, second_mention)
    verbocean_found = verbocean_extractor.extract_all_relations(first_mention, second_mention)
    wiki_found = wiki_extractor.extract_sub_relations(
        first_mention, second_mention, RelationType.WIKIPEDIA_REDIRECT_LINK
    )
    # The embedding extractor is only created here, after the Wikipedia
    # lookup, matching the original sequence of operations.
    embedding_extractor = WordEmbeddingRelationExtraction(EmbeddingMethod.ELMO)
    embedding_found = embedding_extractor.extract_all_relations(first_mention, second_mention)
    wordnet_found = wordnet_extractor.extract_sub_relations(
        first_mention, second_mention, RelationType.WORDNET_DERIVATIONALLY
    )

    # (relations, message when nothing was found, message template when found)
    outcome_table = (
        (computed_found, 'No Computed relation found', 'Found Computed relations-%s'),
        (ref_dict_found, 'No Referent-Dict relation found', 'Found Referent-Dict relations-%s'),
        (verbocean_found, 'No Verb-Ocean relation found', 'Found Verb-Ocean relations-%s'),
        (wiki_found, 'No Wikipedia relation found', 'Found Wikipedia relations-%s'),
        (embedding_found, 'No Embedded relation found', 'Found Embedded relations-%s'),
        (wordnet_found, 'No Wordnet relation found', 'Found Wordnet relations-%s'),
    )
    for found, missing_message, found_template in outcome_table:
        if RelationType.NO_RELATION_FOUND in found:
            logger.info(missing_message)
            continue
        logger.info(found_template, str(list(found)))
def load_modules(cdc_resources):
    """Build the relation-extraction models configured by ``cdc_resources``.

    Args:
        cdc_resources: object supplying the search methods, file locations
            and elastic connection settings read below.

    Returns:
        list: the computed, Wikipedia, word-embedding and referent-dict
        extractors, in that order.
    """
    # Elements of a list display are evaluated left to right, so the
    # extractors are still constructed in the listed order.
    return [
        ComputedRelationExtraction(),
        WikipediaRelationExtraction(
            cdc_resources.wiki_search_method,
            wiki_file=cdc_resources.wiki_folder,
            host=cdc_resources.elastic_host,
            port=cdc_resources.elastic_port,
            index=cdc_resources.elastic_index,
        ),
        WordEmbeddingRelationExtraction(
            cdc_resources.embed_search_method,
            glove_file=cdc_resources.glove_file,
            elmo_file=cdc_resources.elmo_file,
            cos_accepted_dist=0.75,
        ),
        ReferentDictRelationExtraction(
            cdc_resources.referent_dict_method,
            cdc_resources.referent_dict_file,
        ),
    ]
    def load_modules(self):
        """Instantiate only the extractors needed by the configured sieves.

        The first element of every entry in the event and entity
        ``sieves_order`` sequences is collected into a set of relations.  An
        extractor is then created and stored on ``self`` only when that set
        contains a relation it serves; attributes for unused extractors are
        left untouched.
        """
        required = {sieve[0] for sieve in self.event_config.sieves_order}
        required.update(sieve[0] for sieve in self.entity_config.sieves_order)

        def family_requested(keyword):
            # True when at least one required relation's name contains keyword.
            return any(keyword in relation.name for relation in required)

        if family_requested('WIKIPEDIA'):
            resources = self.cdc_resources
            self.wiki = WikipediaRelationExtraction(
                resources.wiki_search_method,
                wiki_file=resources.wiki_folder,
                host=resources.elastic_host,
                port=resources.elastic_port,
                index=resources.elastic_index,
            )

        if RelationType.WORD_EMBEDDING_MATCH in required:
            resources = self.cdc_resources
            self.embeds = WordEmbeddingRelationExtraction(
                resources.embed_search_method,
                glove_file=resources.glove_file,
                elmo_file=resources.elmo_file,
            )

        if RelationType.VERBOCEAN_MATCH in required:
            resources = self.cdc_resources
            self.vo = VerboceanRelationExtraction(
                resources.vo_search_method, resources.vo_dict_file
            )

        if RelationType.REFERENT_DICT in required:
            resources = self.cdc_resources
            self.ref_dict = ReferentDictRelationExtraction(
                resources.referent_dict_method, resources.referent_dict_file
            )

        if RelationType.WITHIN_DOC_COREF in required:
            self.within_doc = WithinDocCoref(self.cdc_resources.wd_file)

        if family_requested('WORDNET'):
            resources = self.cdc_resources
            self.wordnet = WordnetRelationExtraction(
                resources.wn_search_method, resources.wn_folder
            )
def wiki_dump_from_gs():
    """Collect Wikipedia pages for every mention phrase and dump them as JSON.

    Reads the module-level ``args`` (``mentions``, ``output`` and the optional
    ``host``/``port``/``index`` elastic settings).  Each vocabulary phrase is
    looked up through the elastic-backed extractor when one is configured,
    falling back to the online extractor when that yields no pages.  Every
    returned page is passed to ``add_page`` and, once all phrases are
    processed, ``result_dump`` is serialized to the output file.

    NOTE(review): ``result_dump`` is presumably filled by ``add_page``;
    neither is defined in this function - confirm against the module.
    """
    logger.info('Starting, process will connect with ElasticSearch and online wikipedia site...')
    mentions_files = [args.mentions]
    dump_file = args.output
    vocab = load_mentions_vocab_from_files(mentions_files)

    if args.host and args.port and args.index:
        wiki_elastic = WikipediaRelationExtraction(WikipediaSearchMethod.ELASTIC,
                                                   host=args.host,
                                                   port=args.port,
                                                   index=args.index)
    else:
        logger.info(
            'Running without Wikipedia elastic search, Note that this will '
            'take much longer to process only using online service')
        wiki_elastic = None

    wiki_online = WikipediaRelationExtraction(WikipediaSearchMethod.ONLINE)

    # Built once: deletes single quotes, double quotes and backslashes in one
    # pass instead of three chained replace() calls per phrase.
    strip_quotes = str.maketrans('', '', '\'"\\')

    for phrase in vocab:
        phrase = phrase.translate(strip_quotes).strip()
        logger.info('Try to retrieve \'%s\' from elastic search', phrase)

        pages = wiki_elastic.get_phrase_related_pages(phrase) if wiki_elastic else None
        # get_pages() is called once per lookup; an empty or missing result
        # is falsy, so no separate length check is needed.
        found_pages = pages.get_pages() if pages else None
        if not found_pages:
            logger.info('Not on elastic, retrieve \'%s\' from wiki online site', phrase)
            found_pages = wiki_online.get_phrase_related_pages(phrase).get_pages()

        for search_page in found_pages:
            add_page(search_page, phrase)

    with open(dump_file, 'w') as myfile:
        json.dump(result_dump, myfile, default=json_dumper)

    logger.info('Saving dump to file-%s', dump_file)