Example #1
import pytest
from spacy.kb import KnowledgeBase


def test_append_alias(nlp):
    """Test that we can append additional alias-entity pairs"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.4, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_candidates("douglas")) == 2

    # append an alias
    mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)

    # test the size of the relevant candidates has been incremented
    assert len(mykb.get_candidates("douglas")) == 3

    # appending the same alias-entity pair again should not work (it emits a warning)
    with pytest.warns(UserWarning):
        mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)

    # test the size of the relevant candidates remained unchanged
    assert len(mykb.get_candidates("douglas")) == 3
Example #2
from numpy.testing import assert_almost_equal
from spacy.kb import KnowledgeBase


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.8, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_candidates("douglas")) == 2
    assert len(mykb.get_candidates("adam")) == 1
    assert len(mykb.get_candidates("shrubbery")) == 0

    # test the content of the candidates
    assert mykb.get_candidates("adam")[0].entity_ == "Q2"
    assert mykb.get_candidates("adam")[0].alias_ == "adam"
    assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 12)
    assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9)
Example #3
from spacy.kb import KnowledgeBase


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_candidates(u'douglas')) == 2
    assert len(mykb.get_candidates(u'adam')) == 1
    assert len(mykb.get_candidates(u'shrubbery')) == 0
Example #4
from spacy.kb import KnowledgeBase


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2', prob=0.2)
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas',
                   entities=[u'Q2', u'Q3'],
                   probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_candidates(u'douglas')) == 2
    assert len(mykb.get_candidates(u'adam')) == 1
    assert len(mykb.get_candidates(u'shrubbery')) == 0
Example #5
from spacy.kb import KnowledgeBase


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias='douglas',
                   entities=['Q2', 'Q3'],
                   probabilities=[0.8, 0.2])
    mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(mykb.get_candidates('douglas')) == 2
    assert len(mykb.get_candidates('adam')) == 1
    assert len(mykb.get_candidates('shrubbery')) == 0
Example #6
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load('en_core_web_sm')
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# adding entities
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])

# adding aliases
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])

candidates = kb.get_candidates("Douglas")
for c in candidates:
    print(" ", c.entity_, c.prior_prob, c.entity_vector)



doc = nlp("Hello, world. Here are two sentences.")
print([t.text for t in doc])

import spacy

doc = nlp("Peach emoji is where it has always been. Peach is the superior "
          "emoji. It's outranking eggplant 🍑 ")
print(doc[0].text)          # 'Peach'
print(doc[1].text)          # 'emoji'
print(doc[-1].text)         # '🍑'
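
Example #7 below loads a vocabulary and KnowledgeBase back from disk. A KB like the one built above can be persisted for that workflow; this is a minimal sketch assuming the spaCy 2.x serialization API, with "data/vocab" and "data/kb" as hypothetical output paths:

# persist the vocab and the KB built above (spaCy 2.x API)
nlp.vocab.to_disk("data/vocab")
kb.dump("data/kb")

# reload them later, the way Example #7's get_kb() does
from spacy.vocab import Vocab

vocab = Vocab().from_disk("data/vocab")
kb2 = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb2.load_bulk("data/kb")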
Example #7
class ConllCandidatesGenerator:
    def __init__(
                self,
                spacy_nlp_vocab_dir: str = "data/vocab",
                spacy_kb_file: str = "data/kb"
            ):
        """
        :param spacy_nlp_vocab_dir: path to directory with spaCy vocab files
        :param spacy_kb_file: path to file with spaCy KnowledgeBase
        """
        # self.spacy_nlp_str = spacy_nlp_str
        self.spacy_nlp_vocab_dir = spacy_nlp_vocab_dir
        self.spacy_kb_file = spacy_kb_file

        # Initialized in get_kb()
        self.kb = None

        self.docs = []
        self.docs_entities = []

    def get_docs(self, file: str = 'conll-wikidata-iob-annotations'):
        """
        :param file: path to file with Wikidata-annotated CoNLL dataset
        :returns: self.docs, reading it from file if not loaded
        """
        if not self.docs:
            if not os.path.isfile(file):
                raise FileNotFoundError(
                        f"Could not find annotated CoNLL file {file}."
                    )

            self.docs = list(conll_documents(file))
        return self.docs

    def del_kb(self):
        """
        Frees up memory by deleting self.kb
        """
        self.kb = None

    def get_kb(self):
        """
        :returns: self.kb, reading it from file if not loaded
        """
        if not self.kb:
            print("Loading vocabulary...")
            vocab = Vocab().from_disk(self.spacy_nlp_vocab_dir)

            print("Loading KB...")
            self.kb = KnowledgeBase(vocab=vocab)
            self.kb.load_bulk(self.spacy_kb_file)
            print("KB loaded!")
        return self.kb

    def write_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Writes self.docs_entities to file.
        File then contains all necessary candidate info,
         which allows candidates to be read from file
         with read_entities_info later
        :param file: file destination of output file
        """
        if not self.docs_entities:
            raise ValueError("ERROR: No candidates to write to file. "
                  "Try the function 'get_candidates' first.")

        print(f"Writing json to file {file} ...")
        with open(file, 'w') as of:
            json.dump(self.docs_entities, of)

    def read_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Reads self.docs_entities from file and returns it.
        File should be the result of the function write_entities_info,
         and gives all necessary candidate info
        :param file: path to file written by write_entities_info
        :returns: self.docs_entities
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"Could not find file {file}. "
                  "Try the function write_entities_info first.")

        print("Reading from file...")
        with open(file, 'r') as inf:
            self.docs_entities = json.load(inf)
        return self.docs_entities

    def generate_candidates_for_doc(self, doc: ConllDocument) -> List[Dict]:
        """
        Takes a ConllDocument object with tagged tokens
        (e.g. from conll_documents()).

        Returns a list with one dictionary per tagged named entity.
        Each dictionary contains:
            the ground truth of the entity (a 'Q-ID' from WikiData),
            the token position of the entity as a pair [start, end],
            and a list of candidates, each represented by its WikiData 'Q-ID'.

        :param doc: a ConllDocument object with tokens tagged with WikiData IDs
        :returns: a list over the tagged named entities, each a dictionary of
                  ground truth, entity position, and candidates
        """
        self.get_kb()
        # The return variable. Stores the list of entities.
        entities = []

        # Inner function to append a label_dict to the entities list
        def add_entity(entity_span_s, entity_span_e, entity_tokens, entity_gt):
            entity_text = ' '.join(entity_tokens)
            entity_candidates = [
                    c.entity_ for c in self.kb.get_candidates(entity_text)
                ]
            entity_span = [entity_span_s, entity_span_e]

            entities.append(
                    {'Position': entity_span,
                     'GroundTruth': entity_gt,
                     'Candidates': entity_candidates}
                )

        # Helper variables for the iteration:
        # Tokens belonging to current entity
        collected_tokens = []
        # Tag of the current entity (the ground truth)
        current_entity_tag = None
        # Position of the first entity token in the document tokens list
        span_start = None

        # Enumerate the document's list of tokens
        for i_token, token in enumerate(doc.tokens):

            # If we are looking at the beginning of a named entity
            if token.true_label.startswith("Q") or token.true_label == "B":

                # Check if we already have collected a named entity
                # This is the case when two named entities follow each other
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token-1,
                               collected_tokens, current_entity_tag)

                span_start = i_token
                collected_tokens = [token.text]
                current_entity_tag = token.true_label

            # If we are looking at the continuation of a named entity
            elif token.true_label == 'I':
                collected_tokens.append(token.text)

            # If we're not looking at a token in a named entity
            else:
                # If we have passed the end of a named entity
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token-1,
                               collected_tokens, current_entity_tag)

                collected_tokens = []

        # If the last tokens were a named entity
        if len(collected_tokens) > 0:
            add_entity(span_start, len(doc.tokens)-1,
                       collected_tokens, current_entity_tag)

        return entities

    def get_docs_entities(
                self,
                f: str = None,
                del_kb: bool = True
            ) -> List[List[Dict]]:
        """
        Iterates over the CoNLL documents and gets the candidates for all mentions
        :param f: file with tagged conll documents
        :param del_kb: Whether to delete the KB object to free up space
        :returns: a list of dicts with lists of info about entities
        """

        # Generate if not cached
        if not self.docs_entities:

            # Reset any cached docs so get_docs() re-reads them from f
            if self.docs:
                self.docs = []

            for conll_doc in self.get_docs(f):
                self.docs_entities.append(
                        self.generate_candidates_for_doc(conll_doc)
                    )

            if del_kb:
                print("Deleting Spacy KB object...")
                self.del_kb()

        return self.docs_entities

    def print_candidate_stats(self):
        """
        Prints metrics about generated candidates
        """
        if not self.docs_entities:
            print("No candidates info.")
            return

        # Number of entities with no candidates (no data points)
        n_no_cand = 0
        # Number of entities where ground truth is among the candidates
        n_pos_labels = 0
        # Number of entities where GT is not among the candidates
        n_no_pos_labels = 0
        # Number of candidates excluding the GT candidate
        n_neg_labels = 0

        # Total number of named entities
        n_ne = 0
        # Number of candidates for entities whose GT is in the Wikidata KB
        n_ne_in_kb = 0
        # Number of named entities not linked to Wikidata KB
        n_ne_bs = 0
        # Number of candidates that belong to entities with no GT
        n_b_cands = 0

        for doc_entities in self.docs_entities:
            for entity in doc_entities:
                n_ne += 1

                if len(entity['Candidates']) == 0:
                    n_no_cand += 1
                elif entity['GroundTruth'] in entity['Candidates']:
                    n_pos_labels += 1
                    n_neg_labels += len(entity['Candidates']) - 1
                else:
                    n_no_pos_labels += 1
                    n_neg_labels += len(entity['Candidates'])

                if entity['GroundTruth'] == 'B':
                    n_ne_bs += 1
                    n_b_cands += len(entity['Candidates'])
                else:
                    n_ne_in_kb += len(entity['Candidates'])

        n_cand = n_pos_labels + n_neg_labels

        print(f"{n_ne: >7,} named entities in total")
        print(f"{n_cand: >7,} candidates in total "
              f"(total number of data points)")
        print(f"{n_pos_labels: >7,} / {n_cand: >7,} positive labels "
              f"({100 * n_pos_labels / n_cand: >5.2f} % all all labels )")
        print(f"{n_neg_labels: >7,} / {n_cand: >7,} negative labels "
              f"({100 * n_neg_labels / n_cand: >5.2f} % all all labels )")

        print(f"{n_no_cand: >7,} / {n_ne: >7,} "
              f"named entities have no candidates")
        print(f"{n_no_pos_labels: >7,} / {n_ne: >7,} "
              f"named entities where correct label is not among candidates")
        print(f"{n_ne_in_kb: >7,} / {n_cand: >7,} "
              f"candidates tagged with GT in Wikidata KB")
        print(f"{n_ne_bs: >7,} / {n_cand: >7,} "
              f"candidates for named entities not in Wikidata KB")

        print(f"{n_cand/n_ne:.1f} average number of candidates per entity")