Beispiel #1
0
def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.4, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_alias_candidates("douglas")) == 2

    # append an alias
    mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

    # test the size of the relevant candidates has been incremented
    assert len(mykb.get_alias_candidates("douglas")) == 3

    # append the same alias-entity pair again should not work (will throw a warning)
    with pytest.warns(UserWarning):
        mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

    # test the size of the relevant candidates remained unchanged
    assert len(mykb.get_alias_candidates("douglas")) == 3
def test_kb_to_bytes():
    # Test that the KB's to_bytes method works correctly
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran",
                   entities=["Q2146908"],
                   probabilities=[0.8])
    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    kb_1.add_alias(alias="Randomness",
                   entities=["Q66", "Q2146908"],
                   probabilities=[0.1, 0.2])
    assert kb_1.contains_alias("Russ Cochran")
    kb_bytes = kb_1.to_bytes()
    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)
    # check that both KBs are exactly the same
    assert kb_1.get_size_entities() == kb_2.get_size_entities()
    assert kb_1.entity_vector_length == kb_2.entity_vector_length
    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
        kb_2.get_alias_candidates("Russ Cochran"))
    assert len(kb_1.get_alias_candidates("Randomness")) == len(
        kb_2.get_alias_candidates("Randomness"))
Beispiel #3
0
def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam",
                               entities=["Q2"],
                               probabilities=[0.9])

    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

        candidates = kb_new_vocab.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"
def create_kb(kb_dir='sample_kb', nlp_dir='sample_nlp'):
    nlp = spacy.load('en_core_web_lg')
    text = 'Tennis champion Emerson was expected to win Wimbledon.'
    doc = nlp(text)
    file_path = 'entities.csv'
    name_dict, desc_dict = load_entities(file_path)

    sample_qid, sample_desc = list(desc_dict.items())[0]
    sample_doc = nlp(sample_desc)
    entity_vector_length = len(sample_doc.vector)  # should be 300
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        # NB: entity_vector could be any encoding
        # freq is the count of times the word appears in the corpus
        # not used in this tutorial
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=42)

    # add provided alias
    for qid, name in name_dict.items():
        # probabilities is P(entity|alias) = 1.0
        # we assume that each alias only maps to one entity
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    # add additional alias with equal probability
    # this could be learned from data
    qids = name_dict.keys()
    probs = [0.3 for qid in qids]
    kb.add_alias(alias='Emerson', entities=qids, probabilities=probs)

    print(f'Entities in the KB: {kb.get_entity_strings()}')
    print(f'Aliases in the KB: {kb.get_alias_strings()}')
    print()
    # questions here are:
    # 1) what matching function is being used? - is this deterministic?
    # 2) what threshold is being used to determine how many candidates are presented?
    entities = [
        c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')
    ]
    print(f'Candidates for \'Roy Stanley Emerson\': {entities}')
    entities = [c.entity_ for c in kb.get_alias_candidates('Emerson')]
    print(f'Candidates for \'Emerson\': {entities}')
    entities = [c.entity_ for c in kb.get_alias_candidates('Todd')]
    print(f'Candidates for \'Todd\': {entities}')

    kb.to_disk(kb_dir)
    nlp.to_disk(nlp_dir)