def test_kb_to_bytes():
    """Round-trip a KB through ``to_bytes``/``from_bytes`` and compare both copies."""
    nlp = English()
    entity_rows = (
        ("Q2146908", 12, [6, -4, 3]),
        ("Q66", 9, [1, 2, 3]),
    )
    alias_rows = (
        ("Russ Cochran", ["Q2146908"], [0.8]),
        ("Boeing", ["Q66"], [0.5]),
        ("Randomness", ["Q66", "Q2146908"], [0.1, 0.2]),
    )

    # Populate the source KB.
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    for qid, freq, vector in entity_rows:
        source_kb.add_entity(entity=qid, freq=freq, entity_vector=vector)
    for alias, qids, priors in alias_rows:
        source_kb.add_alias(alias=alias, entities=qids, probabilities=priors)
    assert source_kb.contains_alias("Russ Cochran")

    payload = source_kb.to_bytes()

    # A fresh KB knows nothing until the bytes are loaded into it.
    restored_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not restored_kb.contains_alias("Russ Cochran")
    restored_kb = restored_kb.from_bytes(payload)

    # Entity-side comparison.
    assert source_kb.get_size_entities() == restored_kb.get_size_entities()
    assert source_kb.entity_vector_length == restored_kb.entity_vector_length
    assert source_kb.get_entity_strings() == restored_kb.get_entity_strings()
    for qid in ("Q2146908", "Q66"):
        assert source_kb.get_vector(qid) == restored_kb.get_vector(qid)

    # Alias-side comparison.
    assert restored_kb.contains_alias("Russ Cochran")
    assert source_kb.get_size_aliases() == restored_kb.get_size_aliases()
    assert source_kb.get_alias_strings() == restored_kb.get_alias_strings()
    for alias in ("Russ Cochran", "Randomness"):
        before = source_kb.get_alias_candidates(alias)
        after = restored_kb.get_alias_candidates(alias)
        assert len(before) == len(after)
def create_kb(vocab):
    """Return a KB over *vocab* holding one entity and one alias that points at it."""
    qid = "Q2146908"
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=3)
    knowledge_base.add_entity(entity=qid, freq=12, entity_vector=[6, -4, 3])
    knowledge_base.add_alias(
        alias="Russ Cochran",
        entities=[qid],
        probabilities=[0.8],
    )
    return knowledge_base
def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 27, [1]), ("Q2", 12, [2]), ("Q3", 5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # number of candidates per alias; "shrubbery" was never registered
    for alias, expected_count in (("douglas", 2), ("adam", 1), ("shrubbery", 0)):
        assert len(kb.get_candidates(alias)) == expected_count

    # content of the single "adam" candidate
    adam = kb.get_candidates("adam")[0]
    assert adam.entity_ == "Q2"
    assert adam.alias_ == "adam"
    assert_almost_equal(adam.entity_freq, 12)
    assert_almost_equal(adam.prior_prob, 0.9)
def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 27, [1]), ("Q2", 12, [2]), ("Q3", 5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    def douglas_count():
        return len(kb.get_alias_candidates("douglas"))

    # two candidates to start with
    assert douglas_count() == 2

    # appending a new entity to the alias adds one candidate
    kb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
    assert douglas_count() == 3

    # appending the same alias-entity pair again only warns ...
    with pytest.warns(UserWarning):
        kb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

    # ... and leaves the candidate count untouched
    assert douglas_count() == 3
def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # one single-token span per mention in the text
    doc = nlp("douglas adam Adam shrubbery")
    douglas_ent, adam_ent, Adam_ent, shrubbery_ent = (
        doc[i : i + 1] for i in range(4)
    )

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 27, [1]), ("Q2", 12, [2]), ("Q3", 5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # number of candidates per mention
    expected_counts = (
        (douglas_ent, 2),
        (adam_ent, 1),
        (Adam_ent, 0),  # default case sensitive
        (shrubbery_ent, 0),
    )
    for mention, expected_count in expected_counts:
        assert len(get_candidates(kb, mention)) == expected_count

    # content of the single "adam" candidate
    adam = get_candidates(kb, adam_ent)[0]
    assert adam.entity_ == "Q2"
    assert adam.alias_ == "adam"
    assert_almost_equal(adam.entity_freq, 12)
    assert_almost_equal(adam.prior_prob, 0.9)
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities (keep the hash returned for Q2)
    kb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    kb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # aliases (keep the hash returned for "adam")
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
    adam_hash = kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    def check_adam(some_kb):
        # The same hash/string expectations apply before and after storage.
        found = some_kb.get_alias_candidates("adam")
        assert len(found) == 1
        only = found[0]
        assert only.entity == q2_hash
        assert only.entity_ == "Q2"
        assert only.alias == adam_hash
        assert only.alias_ == "adam"

    check_adam(kb)

    with make_tempdir() as d:
        kb.to_disk(d / "kb")
        # reload into a KB backed by a brand-new, empty Vocab
        reloaded = KnowledgeBase(Vocab(), entity_vector_length=1)
        reloaded.from_disk(d / "kb")
        check_adam(reloaded)
def test_kb_serialization():
    """Test that the KB can be used in a pipeline with a different vocab"""
    qid = "Q2146908"
    extra_word = "RandomWord"
    vector_length = 3
    with make_tempdir() as tmp_dir:
        kb_dir = tmp_dir / "kb"

        # First pipeline: build the KB and write it to disk.
        first_nlp = English()
        assert qid not in first_nlp.vocab.strings
        kb = KnowledgeBase(first_nlp.vocab, entity_vector_length=vector_length)
        kb.add_entity(entity=qid, freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran", entities=[qid], probabilities=[0.8])
        assert qid in first_nlp.vocab.strings
        kb.to_disk(kb_dir)

        # Second pipeline: its vocab has its own string and not yet the entity ID.
        second_nlp = English()
        second_strings = second_nlp.vocab.strings
        assert extra_word not in second_strings
        second_strings.add(extra_word)
        assert extra_word in second_nlp.vocab.strings
        assert qid not in second_nlp.vocab.strings

        # Create the Entity Linker component with the KB from file, and check the final vocab
        linker = second_nlp.add_pipe("entity_linker", last=True)
        linker.set_kb(load_kb(kb_dir))
        assert qid in second_nlp.vocab.strings
        assert extra_word in second_nlp.vocab.strings
def create_kb():
    """Step 1: create the Knowledge Base in spaCy and write it to file.

    Loads the ``en_core_web_lg`` pipeline, registers one entity per entry
    returned by ``load_entities()`` (encoded as the vector of its description),
    adds one alias per entity name plus a shared "Emerson" alias, prints the
    KB contents, and writes both the KB and the pipeline under ``output_dir``.
    """
    nlp = spacy.load("en_core_web_lg")
    name_dict, desc_dict = load_entities()

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

    for qid, desc in desc_dict.items():
        # The entity vector is the document vector of the description text.
        desc_enc = nlp(desc).vector
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)  # 342 is an arbitrary value here

    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])  # 100% prior probability P(entity|alias)

    # Snapshot the IDs into a list rather than handing the KB a live dict view.
    qids = list(name_dict)
    # sum(probs) should be <= 1, so 0.3 each only works for up to three entities.
    probs = [0.3] * len(qids)
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()

    # exist_ok avoids the check-then-create race and also creates missing parents.
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): `dump` looks like the older spelling of `to_disk` — confirm
    # against the installed spaCy version.
    kb.dump(output_dir / "my_kb")
    nlp.to_disk(output_dir / "my_nlp")
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    # (id, frequency, vector)
    entity_rows = (
        ("Q1", 0.9, [8, 4, 3]),
        ("Q2", 0.5, [2, 1, 0]),
        ("Q3", 0.5, [-1, -6, 5]),
    )
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # entities (a copy of each vector is passed in; the row keeps the expectation)
    for qid, freq, vector in entity_rows:
        kb.add_entity(entity=qid, freq=freq, entity_vector=list(vector))

    # aliases
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # size of the resulting KB
    assert kb.get_size_entities() == 3
    assert kb.get_size_aliases() == 2

    # entity vectors come back as given
    for qid, _freq, vector in entity_rows:
        assert kb.get_vector(qid) == vector

    # prior probabilities; unknown entity or alias yields 0.0
    prior_rows = (
        ("Q2", "douglas", 0.8),
        ("Q3", "douglas", 0.2),
        ("Q342", "douglas", 0.0),
        ("Q3", "douglassssss", 0.0),
    )
    for qid, alias, expected in prior_rows:
        assert_almost_equal(kb.get_prior_prob(entity=qid, alias=alias), expected)
def create_kb(vocab):
    """Return a KB over *vocab* with entities Q2/Q3 sharing the alias "douglas"."""
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=1)
    for qid, freq, vector in (("Q2", 12, [2]), ("Q3", 5, [3])):
        knowledge_base.add_entity(entity=qid, freq=freq, entity_vector=vector)
    knowledge_base.add_alias(
        alias="douglas",
        entities=["Q2", "Q3"],
        probabilities=[0.8, 0.1],
    )
    return knowledge_base
def create_kb(vocab):
    """Return an artificial KB with a "Kirby" entry and a placeholder "pink" entry.

    ``vector_length`` is taken from the enclosing scope.
    """
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=vector_length)

    kirby_id = "Q613241"
    knowledge_base.add_entity(entity=kirby_id, freq=12, entity_vector=[6, -4, 3])
    knowledge_base.add_alias("Kirby", [kirby_id], [0.9])

    # Placeholder
    placeholder = "pink"
    knowledge_base.add_entity(entity=placeholder, freq=12, entity_vector=[7, 2, -5])
    knowledge_base.add_alias(placeholder, [placeholder], [0.9])
    return knowledge_base
def create_kb(vocab):
    """Return a KB linking "Boston" to Q1 and "Denver" to Q2.

    ``vector_length`` is taken from the enclosing scope.
    """
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=vector_length)

    # entities
    for qid, freq in (("Q1", 19), ("Q2", 8)):
        knowledge_base.add_entity(entity=qid, freq=freq, entity_vector=[1])

    # aliases
    for alias, qid, prior in (("Boston", "Q1", 0.7), ("Denver", "Q2", 0.6)):
        knowledge_base.add_alias(alias=alias, entities=[qid], probabilities=[prior])

    return knowledge_base
def create_kb(vocab):
    """Return a KB where "Russ Cochran" maps to two entities with equal priors.

    ``vector_length`` is taken from the enclosing scope.
    """
    entity_rows = (
        ("Q2146908", [6, -4, 3]),
        ("Q7381115", [9, 1, -7]),
    )
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=vector_length)
    for qid, vector in entity_rows:
        knowledge_base.add_entity(entity=qid, freq=12, entity_vector=vector)
    knowledge_base.add_alias(
        alias="Russ Cochran",
        entities=[qid for qid, _ in entity_rows],
        probabilities=[0.5, 0.5],
    )
    return knowledge_base
def test_kb_invalid_entity_vector(nlp):
    """Test the invalid construction of a KB with non-matching entity vector lengths"""
    expected_length = 3
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=expected_length)

    # a vector of the expected length is accepted
    kb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])

    # a vector of length 1 should be rejected because the KB expects length 3
    too_short = [2]
    with pytest.raises(ValueError):
        kb.add_entity(entity="Q2", freq=0.2, entity_vector=too_short)
def create_kb(vocab):
    """Return an artificial KB with one entity reachable through the alias "Mahler".

    ``entity_id`` and ``meet_threshold`` are taken from the enclosing scope;
    the alias prior is 1 when ``meet_threshold`` is truthy and 0.01 otherwise.
    """
    if meet_threshold:
        prior = 1
    else:
        prior = 0.01

    knowledge_base = KnowledgeBase(vocab, entity_vector_length=3)
    knowledge_base.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
    knowledge_base.add_alias(
        alias="Mahler",
        entities=[entity_id],
        probabilities=[prior],
    )
    return knowledge_base
def test_kb_invalid_combination(nlp):
    """Test the invalid construction of a KB with non-matching entity and probability lists"""
    kb = KnowledgeBase(nlp.vocab)

    # entities: (id, prob)
    for qid, prob in ((u'Q1', 0.9), (u'Q2', 0.2), (u'Q3', 0.5)):
        kb.add_entity(entity=qid, prob=prob)

    # two entities but three probabilities: the lengths do not match, so this should fail
    linked = [u'Q2', u'Q3']
    priors = [0.3, 0.4, 0.1]
    with pytest.raises(ValueError):
        kb.add_alias(alias=u'douglas', entities=linked, probabilities=priors)
def test_kb_invalid_entities(nlp):
    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
    kb = KnowledgeBase(nlp.vocab)

    # entities: (id, prob)
    for qid, prob in ((u'Q1', 0.9), (u'Q2', 0.2), (u'Q3', 0.5)):
        kb.add_entity(entity=qid, prob=prob)

    # Q342 was never added to the KB, so linking an alias to it should fail
    linked = [u'Q2', u'Q342']
    priors = [0.8, 0.2]
    with pytest.raises(ValueError):
        kb.add_alias(alias=u'douglas', entities=linked, probabilities=priors)
def test_kb_invalid_probabilities(nlp):
    """Test the invalid construction of a KB with wrong prior probabilities"""
    kb = KnowledgeBase(nlp.vocab)

    # entities: (id, prob)
    for qid, prob in ((u'Q1', 0.9), (u'Q2', 0.2), (u'Q3', 0.5)):
        kb.add_entity(entity=qid, prob=prob)

    # 0.8 + 0.4 exceeds 1, so adding the alias should fail
    linked = [u'Q2', u'Q3']
    priors = [0.8, 0.4]
    with pytest.raises(ValueError):
        kb.add_alias(alias=u'douglas', entities=linked, probabilities=priors)
def test_kb_invalid_probabilities(nlp):
    """Test the invalid construction of a KB with wrong prior probabilities"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 0.9, [1]), ("Q2", 0.2, [2]), ("Q3", 0.5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # 0.8 + 0.4 exceeds 1, so adding the alias should fail
    linked = ["Q2", "Q3"]
    priors = [0.8, 0.4]
    with pytest.raises(ValueError):
        kb.add_alias(alias="douglas", entities=linked, probabilities=priors)
def create_kb(vocab):
    """Return an artificial KB that gives both Russ Cochrans the same prior weight.

    ``vector_length`` is taken from the enclosing scope.
    """
    golfer = "Q2146908"  # Q2146908 (Russ Cochran): American golfer
    publisher = "Q7381115"  # Q7381115 (Russ Cochran): publisher

    knowledge_base = KnowledgeBase(vocab, entity_vector_length=vector_length)
    knowledge_base.add_entity(entity=golfer, freq=12, entity_vector=[6, -4, 3])
    knowledge_base.add_entity(entity=publisher, freq=12, entity_vector=[9, 1, -7])
    knowledge_base.add_alias(
        alias="Russ Cochran",
        entities=[golfer, publisher],
        probabilities=[0.5, 0.5],
    )
    return knowledge_base
def test_kb_pickle():
    """Test that the KB can be pickled"""
    alias = "Russ Cochran"
    qid = "Q2146908"

    original = KnowledgeBase(English().vocab, entity_vector_length=3)
    original.add_entity(entity=qid, freq=12, entity_vector=[6, -4, 3])

    # the alias only becomes known once it has been added
    assert not original.contains_alias(alias)
    original.add_alias(alias=alias, entities=[qid], probabilities=[0.8])
    assert original.contains_alias(alias)

    # the unpickled copy still knows the alias
    clone = pickle.loads(pickle.dumps(original))
    assert clone.contains_alias(alias)
def create_kb(kb_dir='sample_kb', nlp_dir='sample_nlp'):
    """Build a sample KB from ``entities.csv``, print its contents, and save it.

    Args:
        kb_dir: Location passed to ``kb.to_disk``.
        nlp_dir: Location passed to ``nlp.to_disk``.
    """
    nlp = spacy.load('en_core_web_lg')

    # NOTE(review): the parsed sample sentence is never read afterwards; the
    # call is kept in case processing it affects the pipeline state that is
    # saved below.
    text = 'Tennis champion Emerson was expected to win Wimbledon.'
    doc = nlp(text)

    name_dict, desc_dict = load_entities('entities.csv')

    # Size the KB from the vector of the first description.
    sample_qid, sample_desc = list(desc_dict.items())[0]
    entity_vector_length = len(nlp(sample_desc).vector)  # should be 300

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    for qid, desc in desc_dict.items():
        # NB: entity_vector could be any encoding
        desc_enc = nlp(desc).vector
        # freq is the count of times the word appears in the corpus
        # not used in this tutorial
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=42)

    # add provided alias
    # probabilities is P(entity|alias) = 1.0
    # we assume that each alias only maps to one entity
    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    # add additional alias with equal probability
    # this could be learned from data
    qids = name_dict.keys()
    probs = [0.3 for _ in qids]
    kb.add_alias(alias='Emerson', entities=qids, probabilities=probs)

    print(f'Entities in the KB: {kb.get_entity_strings()}')
    print(f'Aliases in the KB: {kb.get_alias_strings()}')
    print()

    # questions here are:
    # 1) what matching function is being used? - is this deterministic?
    # 2) what threshold is being used to determine how many candidates are presented?
    for alias in ('Roy Stanley Emerson', 'Emerson', 'Todd'):
        entities = [c.entity_ for c in kb.get_alias_candidates(alias)]
        print(f'Candidates for \'{alias}\': {entities}')

    kb.to_disk(kb_dir)
    nlp.to_disk(nlp_dir)
def test_kb_invalid_combination(nlp):
    """Test the invalid construction of a KB with non-matching entity and probability lists"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 0.9, [1]), ("Q2", 0.2, [2]), ("Q3", 0.5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # two entities but three probabilities: the lengths do not match, so this should fail
    linked = ["Q2", "Q3"]
    priors = [0.3, 0.4, 0.1]
    with pytest.raises(ValueError):
        kb.add_alias(alias="douglas", entities=linked, probabilities=priors)
def test_kb_invalid_entities(nlp):
    """Test the invalid construction of a KB with an alias linked to a non-existing entity"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, frequency, vector)
    for qid, freq, vector in (("Q1", 0.9, [1]), ("Q2", 0.2, [2]), ("Q3", 0.5, [3])):
        kb.add_entity(entity=qid, freq=freq, entity_vector=vector)

    # Q342 was never added to the KB, so linking an alias to it should fail
    linked = ["Q2", "Q342"]
    priors = [0.8, 0.2]
    with pytest.raises(ValueError):
        kb.add_alias(alias="douglas", entities=linked, probabilities=priors)
def test_kb_serialize_vocab(nlp):
    """Test serialization of the KB and custom strings"""
    custom_id = "MyFunnyID"
    assert custom_id not in nlp.vocab.strings

    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not kb.contains_entity(custom_id)

    # adding the entity makes it known to the KB and to its string store
    kb.add_entity(custom_id, freq=342, entity_vector=[3])
    assert kb.contains_entity(custom_id)
    assert custom_id in kb.vocab.strings

    with make_tempdir() as d:
        # normal read-write behaviour
        kb_path = d / "kb"
        kb.to_disk(kb_path)
        reloaded = KnowledgeBase(Vocab(), entity_vector_length=1)
        reloaded.from_disk(d / "kb")
        assert custom_id in reloaded.vocab.strings
def test_append_invalid_alias(nlp):
    """Test that appending an alias will throw an error if prior probs are exceeding 1"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases - "douglas" already carries 0.8 + 0.1 = 0.9 of prior probability
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # append an alias - should fail because another 0.2 would push the sum of
    # prior probabilities for "douglas" above 1
    with pytest.raises(ValueError):
        mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
def create_kb(vocab):
    """Return an artificial KB with the aliases "No. 8" and "Mahler".

    Each alias maps to exactly one entity with prior 1.0.
    ``vector_length`` is taken from the enclosing scope.
    """
    # (entity id, vector, alias)
    rows = (
        ("Q270853", [9, 1, -7], "No. 8"),
        ("Q7304", [6, -4, 3], "Mahler"),
    )
    knowledge_base = KnowledgeBase(vocab, entity_vector_length=vector_length)
    for qid, vector, alias in rows:
        knowledge_base.add_entity(entity=qid, freq=12, entity_vector=vector)
        knowledge_base.add_alias(alias=alias, entities=[qid], probabilities=[1.0])
    return knowledge_base
def test_preserving_links_asdoc(nlp):
    """Test that Span.as_doc preserves the existing entity links"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities
    for qid, freq in (("Q1", 19), ("Q2", 8)):
        kb.add_entity(entity=qid, freq=freq, entity_vector=[1])

    # aliases
    for alias, qid, prior in (("Boston", "Q1", 0.7), ("Denver", "Q2", 0.6)):
        kb.add_alias(alias=alias, entities=[qid], probabilities=[prior])

    # pipeline: sentencizer, then NER (Entity Ruler), then NEL
    # (prior probability only, model not trained)
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    entity_ruler = EntityRuler(nlp)
    entity_ruler.add_patterns(
        [{"label": "GPE", "pattern": city} for city in ("Boston", "Denver")]
    )
    nlp.add_pipe(entity_ruler)

    linker = nlp.create_pipe(name="entity_linker")
    linker.set_kb(kb)
    linker.begin_training()
    linker.incl_context = False
    linker.incl_prior = True
    nlp.add_pipe(linker, last=True)

    # every entity in a sentence-level doc must keep the KB ID it had in the full doc
    doc = nlp("She lives in Boston. He lives in Denver.")
    for ent in doc.ents:
        expected_text = ent.text
        expected_kb_id = ent.kb_id_
        sentence_doc = ent.sent.as_doc()
        same_text_ents = [e for e in sentence_doc.ents if e.text == expected_text]
        for sentence_ent in same_text_ents:
            assert sentence_ent.kb_id_ == expected_kb_id
def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    kb = KnowledgeBase(nlp.vocab)

    # entities: (id, prob)
    for qid, prob in ((u'Q1', 0.9), (u'Q2', 0.2), (u'Q3', 0.5)):
        kb.add_entity(entity=qid, prob=prob)

    # aliases
    kb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    kb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # number of candidates per alias; "shrubbery" was never registered
    for alias, expected_count in ((u'douglas', 2), (u'adam', 1), (u'shrubbery', 0)):
        assert len(kb.get_candidates(alias)) == expected_count
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # entities: (id, prob, vector)
    for qid, prob, vector in (('Q1', 0.9, [1]), ('Q2', 0.5, [2]), ('Q3', 0.5, [3])):
        kb.add_entity(entity=qid, prob=prob, entity_vector=vector)

    # aliases
    kb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2])
    kb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])

    # size of the resulting KB
    entity_count = kb.get_size_entities()
    assert entity_count == 3
    alias_count = kb.get_size_aliases()
    assert alias_count == 2
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    knowledge_base = KnowledgeBase(en_vocab, entity_vector_length=3)
    knowledge_base.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # an empty alias is rejected and must not end up in the KB
    with pytest.raises(ValueError):
        knowledge_base.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert knowledge_base.contains_alias("") is False

    # regular aliases: (alias, prior)
    for alias, prior in (("x", 0.2), ("y", 0.1)):
        knowledge_base.add_alias(alias=alias, entities=["1"], probabilities=[prior])

    # write to disk and read back into the same KB object
    with make_tempdir() as tmp_dir:
        knowledge_base.to_disk(tmp_dir)
        knowledge_base.from_disk(tmp_dir)

    assert knowledge_base.get_size_aliases() == 2
    assert set(knowledge_base.get_alias_strings()) == {"x", "y"}
def create_kb(vocab):
    """Return a KB with three "Douglas" entities and two aliases, logging each step."""
    kb = KnowledgeBase(vocab=vocab)

    # entities, all with the same prob
    entity_0 = "Q1004791_Douglas"
    entity_1 = "Q42_Douglas_Adams"
    entity_2 = "Q5301561_Douglas_Haig"
    for entity in (entity_0, entity_1, entity_2):
        print("adding entity", entity)
        kb.add_entity(entity=entity, prob=0.5)

    # aliases: (alias, linked entities, prior probabilities)
    print()
    alias_rows = (
        ("Douglas", [entity_0, entity_1, entity_2], [0.1, 0.6, 0.2]),
        ("Douglas Adams", [entity_1], [0.9]),
    )
    for alias, linked, priors in alias_rows:
        print("adding alias", alias)
        kb.add_alias(alias=alias, entities=linked, probabilities=priors)

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
    return kb
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    kb = KnowledgeBase(nlp.vocab)

    # entities; Q2 is added without an explicit prob
    kb.add_entity(entity=u'Q1', prob=0.9)
    kb.add_entity(entity=u'Q2')
    kb.add_entity(entity=u'Q3', prob=0.5)

    # aliases: (alias, linked entities, prior probabilities)
    alias_rows = (
        (u'douglas', [u'Q2', u'Q3'], [0.8, 0.2]),
        (u'adam', [u'Q2'], [0.9]),
    )
    for alias, linked, priors in alias_rows:
        kb.add_alias(alias=alias, entities=linked, probabilities=priors)

    # size of the resulting KB
    entity_count = kb.get_size_entities()
    assert entity_count == 3
    alias_count = kb.get_size_aliases()
    assert alias_count == 2