Example #1
def create_kb():
    """ Step 1: create the Knowledge Base in spaCy and write it to file """
    nlp = spacy.load("en_core_web_lg")
    name_dict, desc_dict = load_entities()

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)  # 342 is an arbitrary value here

    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])  # 100% prior probability P(entity|alias)

    qids = name_dict.keys()
    probs = [0.3 for qid in qids]
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)  # sum([probs]) should be <= 1 !

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    kb.dump(output_dir / "my_kb")
    nlp.to_disk(output_dir / "my_nlp")
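
Example #1 calls a load_entities() helper that is not shown; it is expected to return two dictionaries keyed by QID, one with entity names and one with entity descriptions. A minimal sketch, assuming the entity data sits in a CSV file with QID, name and description columns (the file name and layout are assumptions, not part of the original example):

import csv
from pathlib import Path

def load_entities():
    """Return (name_dict, desc_dict), both keyed by QID."""
    entities_loc = Path.cwd() / "input" / "entities.csv"  # assumed location
    name_dict = {}
    desc_dict = {}
    with entities_loc.open("r", encoding="utf8") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            qid, name, desc = row[0], row[1], row[2]
            name_dict[qid] = name
            desc_dict[qid] = desc
    return name_dict, desc_dict
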
Example #2
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"],
                    freq_list=[32, 111],
                    vector_list=[vector1, vector2])

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
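Example #3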
def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.dump(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.load_bulk(path)
        except Exception as e:
            pytest.fail(str(e))
Example #4
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create blank Language class with specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only storing the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
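
Example #4 assumes module-level constants INPUT_DIM, DESC_WIDTH and ENTITIES that are not shown. A plausible setup with purely illustrative values (the two QIDs match the "Russ Cochran" aliases added above; descriptions and frequencies are assumptions):

INPUT_DIM = 300  # dimension of the pretrained word vectors
DESC_WIDTH = 64  # dimension of the entity vectors produced by the encoder

# QID -> (description, frequency)
ENTITIES = {
    "Q2146908": ("American golfer", 342),
    "Q7381115": ("publisher", 17),
}
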
Example #5
def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages.")

    # You could reduce the dimensionality of the KB vectors by training a separate encoder.
    # For simplicity, we reuse the original word vector dimension here.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids,
                    freq_list=freqs,
                    vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)
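
A minimal way to run Example #5, assuming a model that ships with pretrained word vectors (such as en_core_web_lg) is installed; the model name and output path below are illustrative:

if __name__ == "__main__":
    main(model="en_core_web_lg", output_dir="output")
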
Example #6
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages.")

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids,
                    freq_list=freqs,
                    vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
    def settingup_knowledgebase(self, names, train_data_2):
        """Create a KnowledgeBase from the `names` DataFrame, add its aliases,
        and train an entity_linker pipe on the mentions in `train_data_2`."""

        QID = names['QID'].values.tolist()
        Names = names['Names'].values.tolist()
        Frequency = names['Frequency'].values.tolist()
        descript = []
        for desc in names['Description']:
            descript.append(self.custom_ner_model(desc).vector)

        print("Setting up entities \n")

        kb = KnowledgeBase(vocab=self.custom_ner_model.vocab,
                           entity_vector_length=96)
        kb.set_entities(entity_list=QID,
                        freq_list=Frequency,
                        vector_list=descript)

        print("Setting up Alias \n")

        print("\n")
        print("Spacy Pipeline \n")

        print(self.custom_ner_model.pipe_names)

        #kb_dump_file = str(input("Enter the KB Dump name: "))
        #kb_vocab_folder = str(input("Enter the KB Vocab name: "))
        folder.nel_kb_vocab()

        alias_prep = list(zip(Names, QID))
        for name, qid in alias_prep:
            kb.add_alias(alias=str(name),
                         entities=[qid],
                         probabilities=[1.0])  # 100% prior probability P(entity|alias)

        # dump the KB and its vocab once, after all aliases have been added
        kb.dump("KB_Dump")
        kb.vocab.to_disk("KB_Vocab")

        print("\n")
        print("Knowbase dump and Vocab are stored in a local disk")

        train_data_dict_2 = train_data_2.to_dict('records')

        dataset_2 = []
        for data in train_data_dict_2:
            Text = data['Text']
            Name = data['Name']
            QID = data['QID']
            offset = (data["Start"], data["End"])
            links_dict = {QID: 1.0}
            dataset_2.append((Text, {"links": {offset: links_dict}}))

        self.custom_ner_model.vocab.from_disk("KB_Vocab")
        self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
        kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
        kb.load_bulk("KB_Dump")

        TRAIN_DOCS = []
        for text, annotation in dataset_2:
            # to make this more efficient, you can use nlp.pipe() just once for all the texts
            doc = self.custom_ner_model(text)
            TRAIN_DOCS.append((doc, annotation))

        print("\n")
        print("Training started for Named Entity Linking \n")

        entity_linker = self.custom_ner_model.create_pipe(
            "entity_linker", config={"incl_prior": False})
        entity_linker.set_kb(kb)
        self.custom_ner_model.add_pipe(entity_linker, last=True)

        other_pipes = [
            pipe for pipe in self.custom_ner_model.pipe_names
            if pipe != "entity_linker"
        ]
        with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
            optimizer = self.custom_ner_model.begin_training()
            for itn in range(500):  # 500 iterations takes about a minute to train
                random.shuffle(TRAIN_DOCS)
                batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self.custom_ner_model.update(
                        texts,
                        annotations,
                        drop=0.2,  # prevent overfitting
                        losses=losses,
                        sgd=optimizer,
                    )
                if itn % 50 == 0:
                    print(itn, "Losses", losses)  # print the training loss
        print(itn, "Losses", losses)
        print("\n")
        print("Spacy Pipeline \n")
        print(self.custom_ner_model.pipe_names)
        ner_dump_name = str(input("Enter the Model name: "))

        self.custom_ner_model.to_disk(ner_dump_name)

        return self.custom_ner_model
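
settingup_knowledgebase expects two pandas DataFrames: `names` with columns QID, Names, Frequency and Description, and `train_data_2` with columns Text, Name, QID, Start and End. A hypothetical example of the expected shape (all values are made up):

import pandas as pd

# entity inventory used to populate the KB
names = pd.DataFrame({
    "QID": ["Q101", "Q102"],
    "Names": ["Acme Corp", "Acme Inc"],
    "Frequency": [120, 45],
    "Description": ["A fictional manufacturer", "A fictional holding company"],
})

# annotated mentions used to train the entity linker
train_data_2 = pd.DataFrame({
    "Text": ["Acme Corp announced a new product."],
    "Name": ["Acme Corp"],
    "QID": ["Q101"],
    "Start": [0],
    "End": [9],
})
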
entity_labels = list(set(entity_labels))

entities = name_dict.values()
ent2id = {v: k for k, v in name_dict.items()}

print("Testing candidate generation")
print(entity_labels[0], cand_gen(entity_labels[0], entities, ent2id))

## adding (fuzzy matching) candidates into KB
aliases = {}
words = []
for flabel in entity_labels:
    name = flabel
    qids, probs = cand_gen(flabel, entities, ent2id)
    if len(probs) == 1 and probs[0] == 1.0:
        continue
    kb.add_alias(alias=flabel, entities=qids,
                 probabilities=probs)  # sum(probs) should be <= 1 !

print(
    f"Candidates for 'hyperlipidemia': {[c.entity_ for c in kb.get_candidates('hyperlipidemia')]}"
)

# change the directory and file names to whatever you like
output_dir = Path.cwd() / "output"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
kb.dump(output_dir / "my_kb")

nlp.to_disk(output_dir / "my_nlp")
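
The fuzzy-matching snippet above relies on a cand_gen() helper that is not shown: given a label, the set of entity names and a name-to-QID mapping, it returns candidate QIDs with prior probabilities summing to at most 1. A minimal sketch using difflib (illustrative only; the original may use a different similarity measure):

import difflib

def cand_gen(label, entities, ent2id, n=3, cutoff=0.6):
    """Return (qids, probs) for entity names that fuzzily match `label`."""
    matches = difflib.get_close_matches(label, entities, n=n, cutoff=cutoff)
    qids = [ent2id[m] for m in matches]
    if not qids:
        return [], []
    probs = [1.0 / len(qids)] * len(qids)  # uniform prior; sum(probs) <= 1
    return qids, probs
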