import pytest
import spacy
from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
from spacy.vocab import Vocab


def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the number of candidates for the alias
    assert len(mykb.get_alias_candidates("douglas")) == 2

    # append an alias
    mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

    # test that the number of candidates has been incremented
    assert len(mykb.get_alias_candidates("douglas")) == 3

    # appending the same alias-entity pair again should not work (it throws a warning)
    with pytest.warns(UserWarning):
        mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

    # test that the number of candidates remains unchanged
    assert len(mykb.get_alias_candidates("douglas")) == 3
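# A minimal follow-on sketch (not part of the original test) showing that the
# priors handed to add_alias/append_alias are retained on the candidates:
# spaCy's Candidate objects expose the stored P(entity|alias) as prior_prob.
# The name inspect_priors is ours; the KB stores priors as 32-bit floats,
# hence the approximate comparisons.
def inspect_priors(nlp):
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_alias(alias="douglas", entities=["Q2"], probabilities=[0.4])
    mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
    priors = {c.entity_: c.prior_prob for c in mykb.get_alias_candidates("douglas")}
    assert priors["Q2"] == pytest.approx(0.4, abs=1e-6)
    assert priors["Q1"] == pytest.approx(0.2, abs=1e-6)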
def test_kb_to_bytes():
    """Test that the KB's to_bytes method works correctly"""
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    kb_1.add_alias(
        alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2]
    )
    assert kb_1.contains_alias("Russ Cochran")
    kb_bytes = kb_1.to_bytes()

    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)

    # check that both KBs are exactly the same
    assert kb_1.get_size_entities() == kb_2.get_size_entities()
    assert kb_1.entity_vector_length == kb_2.entity_vector_length
    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
        kb_2.get_alias_candidates("Russ Cochran")
    )
    assert len(kb_1.get_alias_candidates("Randomness")) == len(
        kb_2.get_alias_candidates("Randomness")
    )
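# A small hedged extension of the round-trip check above: the stored prior
# probabilities should survive serialization too. get_prior_prob(entity, alias)
# is part of the KB API; the helper name assert_same_priors is ours, and it
# could be called at the end of test_kb_to_bytes as assert_same_priors(kb_1, kb_2).
def assert_same_priors(kb_a, kb_b):
    # compare the stored P(entity|alias) for each known pair across the two KBs
    pairs = [("Q2146908", "Russ Cochran"), ("Q66", "Boeing"), ("Q66", "Randomness")]
    for entity, alias in pairs:
        assert kb_a.get_prior_prob(entity=entity, alias=alias) == kb_b.get_prior_prob(
            entity=entity, alias=alias
        )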
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

    candidates = kb_new_vocab.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"
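# The nlp argument taken by test_append_alias and test_vocab_serialization is a
# pytest fixture that is not shown in this section. A minimal sketch of such a
# fixture, assuming a blank English pipeline is sufficient for these tests:
@pytest.fixture
def nlp():
    return English()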
def create_kb(kb_dir="sample_kb", nlp_dir="sample_nlp"):
    nlp = spacy.load("en_core_web_lg")

    # a sample sentence, only illustrating the kind of text the KB will serve
    text = "Tennis champion Emerson was expected to win Wimbledon."
    doc = nlp(text)

    file_path = "entities.csv"
    name_dict, desc_dict = load_entities(file_path)

    # infer the entity vector length from one encoded description
    sample_qid, sample_desc = list(desc_dict.items())[0]
    sample_doc = nlp(sample_desc)
    entity_vector_length = len(sample_doc.vector)  # 300 for en_core_web_lg

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector  # NB: the entity vector could be any encoding
        # freq is the number of times the entity appears in a corpus;
        # it is not used in this tutorial, so we pass a dummy value
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=42)

    # add the provided alias for each entity
    for qid, name in name_dict.items():
        # probabilities is P(entity|alias) = 1.0:
        # we assume that each full name maps to exactly one entity
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    # add an additional, ambiguous alias with equal prior probabilities;
    # in practice these priors could be learned from data
    qids = list(name_dict.keys())
    probs = [0.3 for _ in qids]
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()

    # questions here are:
    # 1) what matching function is being used? is this deterministic?
    #    -> get_alias_candidates does an exact, deterministic lookup of the
    #       alias string in the KB
    # 2) what threshold determines how many candidates are presented?
    #    -> there is no threshold: every entity stored for the alias is returned
    entities = [c.entity_ for c in kb.get_alias_candidates("Roy Stanley Emerson")]
    print(f"Candidates for 'Roy Stanley Emerson': {entities}")
    entities = [c.entity_ for c in kb.get_alias_candidates("Emerson")]
    print(f"Candidates for 'Emerson': {entities}")
    entities = [c.entity_ for c in kb.get_alias_candidates("Todd")]
    print(f"Candidates for 'Todd': {entities}")

    kb.to_disk(kb_dir)
    nlp.to_disk(nlp_dir)
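# create_kb calls a load_entities helper that is not shown in this section.
# A minimal sketch of what it might look like, assuming entities.csv stores
# one entity per row as "QID|name|description" (the delimiter and column
# order are assumptions, not confirmed by the code above):
import csv


def load_entities(file_path="entities.csv"):
    name_dict = {}  # QID -> entity name
    desc_dict = {}  # QID -> entity description
    with open(file_path, newline="", encoding="utf-8") as csvfile:
        for qid, name, desc in csv.reader(csvfile, delimiter="|"):
            name_dict[qid] = name
            desc_dict[qid] = desc
    return name_dict, desc_dict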