def test_vocab_serialization(nlp):
    """Check that entity/alias strings and hashes survive a disk round trip.

    The KB is written to disk and read back into a KB built on a brand-new
    ``Vocab``; the candidates for the alias "adam" must look the same in both.
    """
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities (only the hash returned for Q2 is checked later)
    source_kb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = source_kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    source_kb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # aliases (only the hash returned for "adam" is checked later)
    source_kb.add_alias(
        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
    )
    adam_hash = source_kb.add_alias(
        alias="adam", entities=["Q2"], probabilities=[0.9]
    )

    def assert_single_adam_candidate(kb):
        # "adam" must resolve to exactly one candidate pointing at Q2
        found = kb.get_alias_candidates("adam")
        assert len(found) == 1
        only = found[0]
        assert only.entity == q2_hash
        assert only.entity_ == "Q2"
        assert only.alias == adam_hash
        assert only.alias_ == "adam"

    assert_single_adam_candidate(source_kb)

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        source_kb.to_disk(kb_path)
        reloaded_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        reloaded_kb.from_disk(kb_path)
        assert_single_adam_candidate(reloaded_kb)
def test_kb_serialization():
    """Check that a KB saved from one pipeline can be used in another.

    The KB is created with the vocab of a first ``English`` pipeline, saved,
    and then attached to the entity linker of a second pipeline whose vocab
    has never seen the entity ID.
    """
    entity_id = "Q2146908"
    extra_word = "RandomWord"
    vector_length = 3

    with make_tempdir() as tmp_dir:
        kb_dir = tmp_dir / "kb"

        # first pipeline: build the KB and write it to disk
        source_nlp = English()
        assert entity_id not in source_nlp.vocab.strings
        source_kb = KnowledgeBase(
            source_nlp.vocab, entity_vector_length=vector_length
        )
        source_kb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
        source_kb.add_alias(
            alias="Russ Cochran", entities=[entity_id], probabilities=[0.8]
        )
        assert entity_id in source_nlp.vocab.strings
        source_kb.to_disk(kb_dir)

        # second pipeline: has its own string, but not the entity ID yet
        target_nlp = English()
        assert extra_word not in target_nlp.vocab.strings
        target_nlp.vocab.strings.add(extra_word)
        assert extra_word in target_nlp.vocab.strings
        assert entity_id not in target_nlp.vocab.strings

        # Create the Entity Linker component with the KB from file,
        # and check the final vocab
        linker = target_nlp.add_pipe("entity_linker", last=True)
        linker.set_kb(load_kb(kb_dir))
        assert entity_id in target_nlp.vocab.strings
        assert extra_word in target_nlp.vocab.strings
def test_kb_serialize_2(nlp):
    """Check that an entity vector set via ``set_entities`` survives a disk round trip."""
    expected_vector = [5, 6, 7, 8]

    original_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    original_kb.set_entities(["E1"], [1], [expected_vector])
    assert original_kb.get_vector("E1") == expected_vector

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        original_kb.to_disk(kb_path)

        restored_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        restored_kb.from_disk(kb_path)
        assert restored_kb.get_vector("E1") == expected_vector
def create_kb(kb_dir='sample_kb', nlp_dir='sample_nlp'):
    """Build a sample knowledge base from ``entities.csv`` and save it to disk.

    Loads the ``en_core_web_lg`` pipeline, reads entity names and descriptions
    through ``load_entities``, stores one entity per description (encoded as
    the description's document vector), registers each name as an alias, and
    adds the ambiguous alias ``'Emerson'`` pointing at every entity. Prints
    the KB contents and a few candidate lookups along the way.

    Args:
        kb_dir: Path the knowledge base is written to.
        nlp_dir: Path the pipeline is written to.

    Raises:
        IndexError: If ``load_entities`` returns no descriptions.
    """
    nlp = spacy.load('en_core_web_lg')

    file_path = 'entities.csv'
    name_dict, desc_dict = load_entities(file_path)

    # Derive the vector width from one encoded description.
    sample_desc = list(desc_dict.values())[0]
    entity_vector_length = len(nlp(sample_desc).vector)  # should be 300
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    for qid, desc in desc_dict.items():
        desc_enc = nlp(desc).vector
        # NB: entity_vector could be any encoding
        # freq is the count of times the word appears in the corpus
        # not used in this tutorial
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=42)

    # add provided alias
    for qid, name in name_dict.items():
        # probabilities is P(entity|alias) = 1.0
        # we assume that each alias only maps to one entity
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    # add additional alias with equal probability
    # this could be learned from data
    qids = list(name_dict)
    # Per the spaCy KB docs, the priors of one alias must not sum above 1.
    # A flat 0.3 each breaks that beyond three entities, so cap the prior at
    # an equal share (unchanged for three or fewer entities).
    shared_prob = min(0.3, 1.0 / len(qids)) if qids else 0.3
    probs = [shared_prob] * len(qids)
    kb.add_alias(alias='Emerson', entities=qids, probabilities=probs)

    print(f'Entities in the KB: {kb.get_entity_strings()}')
    print(f'Aliases in the KB: {kb.get_alias_strings()}')
    print()

    # questions here are:
    # 1) what matching function is being used? - is this deterministic?
    # 2) what threshold is being used to determine how many candidates are presented?
    for alias in ('Roy Stanley Emerson', 'Emerson', 'Todd'):
        entities = [c.entity_ for c in kb.get_alias_candidates(alias)]
        print(f"Candidates for '{alias}': {entities}")

    kb.to_disk(kb_dir)
    nlp.to_disk(nlp_dir)
def test_kb_serialize_vocab(nlp):
    """Check that a custom entity string is restored into a fresh vocab on load."""
    custom_id = "MyFunnyID"
    assert custom_id not in nlp.vocab.strings

    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not source_kb.contains_entity(custom_id)
    source_kb.add_entity(custom_id, freq=342, entity_vector=[3])
    assert source_kb.contains_entity(custom_id)
    assert custom_id in source_kb.vocab.strings

    with make_tempdir() as tmp_dir:
        # normal read-write behaviour
        kb_path = tmp_dir / "kb"
        source_kb.to_disk(kb_path)
        loaded_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        loaded_kb.from_disk(kb_path)
        assert custom_id in loaded_kb.vocab.strings
def test_save_and_load_knowledge_base():
    """Check that an empty KB can be saved and loaded back without raising.

    Any exception from either step is turned into a test failure carrying the
    exception's message.
    """
    nlp = Language()
    empty_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"

        # step 1: saving
        try:
            empty_kb.to_disk(kb_path)
        except Exception as save_error:
            pytest.fail(str(save_error))

        # step 2: loading into a newly constructed KB
        try:
            restored_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            restored_kb.from_disk(kb_path)
        except Exception as load_error:
            pytest.fail(str(load_error))
def test_issue6730(en_vocab):
    """Check that an empty alias is rejected while regular aliases round-trip through disk."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # the empty string must be refused and must not end up in the KB
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    # two valid aliases for the same entity
    for alias_text, prior in (("x", 0.2), ("y", 0.1)):
        kb.add_alias(alias=alias_text, entities=["1"], probabilities=[prior])

    # write the KB out and read it back into the same object
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)

    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
def test_kb_set_entities(nlp):
    """Check that a second ``set_entities`` call replaces the earlier entities.

    The replacement entities and their vectors must also be what is found
    after writing the KB to disk and loading it into a new KB.
    """
    initial_vector = [5, 6, 7, 8]
    e1_vector = [1, 1, 1, 0]
    e2_vector = [2, 2, 2, 3]

    def assert_only_e1_e2(kb):
        # exactly E1 and E2 are present, each with its own vector
        assert set(kb.get_entity_strings()) == {"E1", "E2"}
        assert kb.get_vector("E1") == e1_vector
        assert kb.get_vector("E2") == e2_vector

    original_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    original_kb.set_entities(["E0"], [1], [initial_vector])
    assert original_kb.get_entity_strings() == ["E0"]

    original_kb.set_entities(["E1", "E2"], [1, 9], [e1_vector, e2_vector])
    assert_only_e1_e2(original_kb)

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        original_kb.to_disk(kb_path)
        restored_kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        restored_kb.from_disk(kb_path)
        assert_only_e1_e2(restored_kb)
def test_issue4674():
    """Check that duplicate entity IDs passed to ``set_entities`` do not break IO.

    Setting "Q1" twice must emit a ``UserWarning`` and leave one entity, both
    in memory and after a save/load round trip using string paths.
    """
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    first_vector = [0.9, 1.1, 1.01]
    second_vector = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[first_vector, second_vector],
        )
    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as tmp_dir:
        base_dir = ensure_path(tmp_dir)
        if not base_dir.exists():
            base_dir.mkdir()
        kb_file = str(base_dir / "kb")
        kb.to_disk(kb_file)
        reloaded_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
        reloaded_kb.from_disk(kb_file)

    assert reloaded_kb.get_size_entities() == 1
def test_kb_serialize(nlp):
    """Check basic KB disk IO: round trips, overwriting, and a missing path."""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    with make_tempdir() as tmp_dir:
        flat_path = tmp_dir / "kb"
        nested_path = tmp_dir / "new" / "kb"
        missing_path = tmp_dir / "unknown" / "kb"

        # normal read-write behaviour, directly in the temp dir and one level deeper
        for target in (flat_path, nested_path):
            kb.to_disk(target)
            kb.from_disk(target)

        # allow overwriting an existing file
        kb.to_disk(flat_path)

        # can not read from an unknown file
        with pytest.raises(ValueError):
            kb.from_disk(missing_path)