Example #1
def _define_entities(nlp, kb, entity_def_path, entity_descr_path,
                     min_entity_freq, entity_freq_path, entity_vector_length):
    # read the mappings from file
    title_to_id = io.read_title_to_id(entity_def_path)
    id_to_descr = io.read_id_to_descr(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    logger.info("Filtering entities with fewer than {} mentions".format(
        min_entity_freq))
    entity_frequencies = io.read_entity_to_count(entity_freq_path)
    # filter the entities in the KB by frequency, because there is too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq)
    logger.info("Kept {} entities from the set of {}".format(
        len(description_list), len(title_to_id.keys())))

    logger.info("Training entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Getting entity embeddings")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)
    return entity_list, filtered_title_to_id
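Note: `get_filtered_entities` is not shown in this example. A minimal sketch of what it might look like, reconstructed from the inline filtering loop in Example #6 (the actual helper may differ):

def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
                          min_entity_freq):
    # keep only entities that have a description and more than min_entity_freq mentions
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    return filtered_title_to_id, entity_list, description_list, frequency_list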
Example #2
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create blank Language class with specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only storing the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
Example #3
    def train(self, entities, list_aliases):
        """
        Args:
            entities: a dict mapping each entity to its description and its corpus frequency
            list_aliases: a list of dicts for each entity e.g.::

                    [{
                        'alias':'Farrar',
                        'entities': ['Q1', 'Q2'],
                        'probabilities': [0.4, 0.6]
                    }]

                probabilities are 'prior probabilities' and their sum should not exceed 1
        """
        try:
            nlp = spacy.load(self.kb_model)
        except IOError:
            subprocess.run(
                ["python", "-m", "spacy", "download", self.kb_model])
            # pkg_resources needs to be reloaded to pick up the newly installed model
            import pkg_resources
            import importlib

            importlib.reload(pkg_resources)
            nlp = spacy.load(self.kb_model)

        print("Loaded model '%s'" % self.kb_model)
        kb = KnowledgeBase(vocab=nlp.vocab,
                           entity_vector_length=self.desc_width)

        # set up the data
        entity_ids = []
        descriptions = []
        freqs = []
        for key, value in entities.items():
            desc, freq = value
            entity_ids.append(key)
            descriptions.append(desc)
            freqs.append(freq)

        # training entity description encodings
        # this part can easily be replaced with a custom entity encoder
        encoder = EntityEncoder(
            nlp=nlp,
            input_dim=self.input_dim,
            desc_width=self.desc_width,
            epochs=self.num_epochs,
        )

        encoder.train(description_list=descriptions, to_print=True)

        # get the pretrained entity vectors
        embeddings = encoder.apply_encoder(descriptions)

        # set the entities, can also be done by calling `kb.add_entity` for each entity
        kb.set_entities(entity_list=entity_ids,
                        freq_list=freqs,
                        vector_list=embeddings)

        # adding aliases, the entities need to be defined in the KB beforehand
        for alias in list_aliases:
            kb.add_alias(
                alias=alias["alias"],
                entities=alias["entities"],
                probabilities=alias["probabilities"],
            )
        self.kb = kb
        return self.kb
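A hedged usage sketch for this `train` method, following the docstring above; `kb_creator` stands in for an instance of the surrounding (unshown) class, and the entity IDs, descriptions and frequencies are illustrative only:

# hypothetical usage; entity IDs, descriptions and frequencies are made up
entities = {
    "Q1": ("A publisher of classic comic reprints", 12),
    "Q2": ("An American golfer", 5),
}
list_aliases = [{
    "alias": "Farrar",
    "entities": ["Q1", "Q2"],
    "probabilities": [0.4, 0.6],  # prior probabilities, summing to at most 1
}]
kb = kb_creator.train(entities=entities, list_aliases=list_aliases)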
Example #4
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_input,
    entity_descr_path,
    count_input,
    prior_prob_input,
    entity_vector_length,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    # read the mappings from file
    title_to_id = get_entity_to_id(entity_def_input)
    id_to_descr = get_id_to_description(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    logger.info("Get entity frequencies")
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    logger.info("Filtering entities with fewer than {} mentions".format(
        min_entity_freq))
    # filter the entities in the KB by frequency, because there is too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq)
    logger.info("Left with {} entities".format(len(description_list)))

    logger.info("Train entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Get entity embeddings:")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)

    logger.info("Adding aliases")
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    logger.info("KB size: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))

    logger.info("Done with kb")
    return kb
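A sketch of how this `create_kb` variant might be called; the model name, file paths and numeric settings are placeholders, not values from the original script:

# illustrative invocation; all paths and thresholds are assumptions
nlp = spacy.load("en_core_web_lg")  # a model with pretrained word vectors
kb = create_kb(
    nlp,
    max_entities_per_alias=10,
    min_entity_freq=20,
    min_occ=5,
    entity_def_input="entity_defs.csv",
    entity_descr_path="entity_descriptions.csv",
    count_input="entity_counts.csv",
    prior_prob_input="prior_probs.csv",
    entity_vector_length=64,
)
kb.dump("kb")
nlp.vocab.to_disk("vocab")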
Example #5
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids,
                    freq_list=freqs,
                    vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
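The `_print_kb` helper used in Examples #2 and #5 is not shown. A minimal sketch, assuming it only reports the stored contents via the KnowledgeBase inspection methods:

def _print_kb(kb):
    # report what ended up in the knowledge base
    print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
    print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())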
Example #6
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "cf. https://spacy.io/usage/models#languages.")

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input, limit=limit)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output,
                            title_to_id, id_to_descr)

    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " *  get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there is too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb
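Once a KB has been built, it is typically attached to an `entity_linker` pipeline component for training. A minimal sketch using the spaCy v2 API (treat the exact wiring as an assumption; check your spaCy version):

# sketch: wire the KB into an entity_linker component (spaCy v2)
entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)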