def read_nlp_kb(model_dir, kb_file):
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    logger.info("kb entities: {}".format(kb.get_size_entities()))
    logger.info("kb aliases: {}".format(kb.get_size_aliases()))
    return nlp, kb
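# A minimal usage sketch of the helper above, using the spaCy v2 KB API.
# The paths and the mention string are hypothetical placeholders, not taken
# from the original code.
nlp, kb = read_nlp_kb("output/nlp", "output/kb")

# Query the loaded KB for candidate entities of a mention string.
for candidate in kb.get_candidates("Russ Cochran"):
    print(candidate.entity_, candidate.prior_prob)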
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into dictionaries mapping QIDs to the full names and descriptions
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
class NamedEntityCreator:
    def __init__(self, kb_folder, vectors_loc, lang='sv', stz=True, vectors_name='fasttext'):
        self.nlp = create_model(vectors_loc=vectors_loc, lang=lang, stz=stz,
                                vectors_name=vectors_name, max_items=1000)
        self.kb = KnowledgeBase(vocab=self.nlp.vocab)
        print(kb_folder)
        self.kb.load_bulk(kb_folder)
        print()
        _print_kb(self.kb)
def load(self, output_dir):
    kb_path = os.path.join(output_dir, "kb")
    vocab_path = os.path.join(output_dir, "vocab")
    print("Loading vocab from", vocab_path)
    print("Loading KB from", kb_path)
    vocab = Vocab().from_disk(vocab_path)
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)
    self.kb = kb
    return self.kb
def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.dump(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.load_bulk(path)
        except Exception as e:
            pytest.fail(str(e))
def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    # final assertions
    _check_kb(kb2)
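# The test above relies on helpers (_get_dummy_kb, _check_kb) that are not shown
# here. Below is a minimal sketch of what such a KB builder could look like with
# the spaCy v2 KB API; the entity IDs, frequencies, vectors and alias are
# illustrative only, not the values used in the original test suite.
def _build_dummy_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    # entities are added with a frequency and a fixed-length entity vector
    kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
    kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
    # an alias maps a surface form to candidate entities with prior probabilities
    kb.add_alias(alias="double07", entities=["Q17", "Q53"], probabilities=[0.9, 0.1])
    return kb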
def from_disk(self, path: Path, **kwargs):
    """Deserialize saved AnnLinker from disk.

    path (Path): directory to deserialize from
    RETURNS (AnnLinker): Initialized AnnLinker
    """
    path = util.ensure_path(path)

    kb = KnowledgeBase(self.nlp.vocab, 300)
    kb.load_bulk(path / "kb")
    self.set_kb(kb)

    cg = CandidateGenerator().from_disk(path)
    self.set_cg(cg)

    cfg = srsly.read_json(path / "cfg")
    self.threshold = cfg.get("threshold", 0.7)
    self.no_description_threshold = cfg.get("no_description_threshold", 0.95)
    self.disambiguate = cfg.get("disambiguate", True)

    return self
def main(
    dir_kb,
    output_dir=None,
    loc_training=None,
    wp_xml=None,
    epochs=10,
    dropout=0.5,
    lr=0.005,
    l2=1e-6,
    train_inst=None,
    dev_inst=None,
    limit=None,
):
    print(now(), "Creating Entity Linker with Wikipedia and WikiData")
    print()

    # STEP 0: set up IO
    if output_dir and not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: load the NLP object
    nlp_dir = dir_kb / "nlp"
    print(now(), "STEP 1: loading model from", nlp_dir)
    nlp = spacy.load(nlp_dir)

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError(Errors.E152)

    # STEP 2: read the KB
    print()
    print(now(), "STEP 2: reading the KB from", dir_kb / "kb")
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(dir_kb / "kb")

    # STEP 3: create a training dataset from WP
    print()
    if loc_training:
        print(now(), "STEP 3: reading training dataset from", loc_training)
    else:
        if not wp_xml:
            raise ValueError(Errors.E153)

        if output_dir:
            loc_training = output_dir / "training_data"
        else:
            loc_training = dir_kb / "training_data"
        if not loc_training.exists():
            loc_training.mkdir()
        print(now(), "STEP 3: creating training dataset at", loc_training)

        if limit is not None:
            print("Warning: reading only", limit, "lines of Wikipedia dump.")

        loc_entity_defs = dir_kb / "entity_defs.csv"
        training_set_creator.create_training(
            wikipedia_input=wp_xml,
            entity_def_input=loc_entity_defs,
            training_output=loc_training,
            limit=limit,
        )

    # STEP 4: parse the training data
    print()
    print(now(), "STEP 4: parse the training & evaluation data")

    # for training, get pos & neg instances that correspond to entries in the kb
    print("Parsing training data, limit =", train_inst)
    train_data = training_set_creator.read_training(
        nlp=nlp, training_dir=loc_training, dev=False, limit=train_inst, kb=kb
    )
    print("Training on", len(train_data), "articles")
    print()

    print("Parsing dev testing data, limit =", dev_inst)
    # for testing, get all pos instances, whether or not they are in the kb
    dev_data = training_set_creator.read_training(
        nlp=nlp, training_dir=loc_training, dev=True, limit=dev_inst, kb=None
    )
    print("Dev testing on", len(dev_data), "articles")
    print()

    # STEP 5: create and train the entity linking pipe
    print()
    print(now(), "STEP 5: training Entity Linking pipe")

    el_pipe = nlp.create_pipe(
        name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name}
    )
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    if not train_data:
        print("Did not find any training data")
    else:
        for itn in range(epochs):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
            batchnr = 0

            with nlp.disable_pipes(*other_pipes):
                for batch in batches:
                    try:
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs=docs,
                            golds=golds,
                            sgd=optimizer,
                            drop=dropout,
                            losses=losses,
                        )
                        batchnr += 1
                    except Exception as e:
                        print("Error updating batch:", e)

            if batchnr > 0:
                el_pipe.cfg["incl_context"] = True
                el_pipe.cfg["incl_prior"] = True
                dev_acc_context, _ = _measure_acc(dev_data, el_pipe)
                losses["entity_linker"] = losses["entity_linker"] / batchnr
                print(
                    "Epoch, train loss",
                    itn,
                    round(losses["entity_linker"], 2),
                    " / dev accuracy avg",
                    round(dev_acc_context, 3),
                )

    # STEP 6: measure the performance of our trained pipe on an independent dev set
    print()
    if len(dev_data):
        print()
        print(now(), "STEP 6: performance measurement of Entity Linking pipe")
        print()

        counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines(
            dev_data, kb
        )
        print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))

        oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()]
        print("dev accuracy oracle:", round(acc_o, 3), oracle_by_label)

        random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()]
        print("dev accuracy random:", round(acc_r, 3), random_by_label)

        prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()]
        print("dev accuracy prior:", round(acc_p, 3), prior_by_label)

        # using only context
        el_pipe.cfg["incl_context"] = True
        el_pipe.cfg["incl_prior"] = False
        dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe)
        context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()]
        print("dev accuracy context:", round(dev_acc_context, 3), context_by_label)

        # measuring combined accuracy (prior + context)
        el_pipe.cfg["incl_context"] = True
        el_pipe.cfg["incl_prior"] = True
        dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe)
        combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()]
        print("dev accuracy prior+context:", round(dev_acc_combo, 3), combo_by_label)

    # STEP 7: apply the EL pipe on a toy example
    print()
    print(now(), "STEP 7: applying Entity Linking to toy example")
    print()
    run_el_toy_example(nlp=nlp)

    # STEP 8: write the NLP pipeline (including entity linker) to file
    if output_dir:
        print()
        nlp_loc = output_dir / "nlp"
        print(now(), "STEP 8: Writing trained NLP to", nlp_loc)
        nlp.to_disk(nlp_loc)
        print()

    print()
    print(now(), "Done!")
def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
    # For simplicity, we'll just use the original vector dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)
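# Several of the scripts here call a _print_kb helper that is not shown. A
# plausible minimal sketch, assuming it only inspects the KB with documented
# spaCy v2 accessors:
def _print_kb(kb):
    print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
    print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())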
def read_kb(nlp, kb_file):
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    return kb
def settingup_knowledgebase(self, names, train_data_2):
    QID = names['QID'].values.tolist()
    Names = names['Names'].values.tolist()
    Frequency = names['Frequency'].values.tolist()

    descript = []
    for desc in names['Description']:
        descript.append(self.custom_ner_model(desc).vector)

    print("Setting up entities\n")
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab, entity_vector_length=96)
    kb.set_entities(entity_list=QID, freq_list=Frequency, vector_list=descript)

    print("Setting up aliases\n")
    print("\n")
    print("Spacy Pipeline\n")
    print(self.custom_ner_model.pipe_names)

    # kb_dump_file = str(input("Enter the KB Dump name: "))
    # kb_vocab_folder = str(input("Enter the KB Vocab name: "))
    folder.nel_kb_vocab()

    alias_prep = list(zip(Names, QID))
    for i, j in alias_prep:
        names_alias = str(i)
        list_qid = [j]
        prob = [1.0]
        kb.add_alias(alias=names_alias, entities=list_qid, probabilities=prob)

    kb.dump("KB_Dump")
    kb.vocab.to_disk("KB_Vocab")
    print("\n")
    print("Knowledge base dump and vocab are stored on the local disk")

    train_data_dict_2 = train_data_2.to_dict('records')
    dataset_2 = []
    for data in train_data_dict_2:
        Text = data['Text']
        Name = data['Name']
        QID = data['QID']
        offset = (data["Start"], data["End"])
        links_dict = {QID: 1.0}
        dataset_2.append((Text, {"links": {offset: links_dict}}))

    self.custom_ner_model.vocab.from_disk("KB_Vocab")
    self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
    kb.load_bulk("KB_Dump")

    TRAIN_DOCS = []
    for text, annotation in dataset_2:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = self.custom_ner_model(text)
        TRAIN_DOCS.append((doc, annotation))

    print("\n")
    print("Training started for Named Entity Linking\n")
    entity_linker = self.custom_ner_model.create_pipe("entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    self.custom_ner_model.add_pipe(entity_linker, last=True)

    other_pipes = [
        pipe for pipe in self.custom_ner_model.pipe_names if pipe != "entity_linker"
    ]
    with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = self.custom_ner_model.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.custom_ner_model.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)

    print("\n")
    print("Spacy Pipeline\n")
    print(self.custom_ner_model.pipe_names)

    ner_dump_name = str(input("Enter the Model name: "))
    self.custom_ner_model.to_disk(ner_dump_name)
    return self.custom_ner_model
class ConllCandidatesGenerator:

    def __init__(
            self,
            spacy_nlp_vocab_dir: str = "data/vocab",
            spacy_kb_file: str = "data/kb"
    ):
        """
        :param spacy_nlp_vocab_dir: path to directory with spaCy vocab files
        :param spacy_kb_file: path to file with spaCy KnowledgeBase
        """
        # self.spacy_nlp_str = spacy_nlp_str
        self.spacy_nlp_vocab_dir = spacy_nlp_vocab_dir
        self.spacy_kb_file = spacy_kb_file

        # Initialized in get_kb()
        self.kb = None

        self.docs = []
        self.docs_entities = []

    def get_docs(self, file: str = 'conll-wikidata-iob-annotations'):
        """
        :param file: path to file with Wikidata-annotated CoNLL dataset
        :returns: self.docs, reading it from file if not loaded
        """
        if not self.docs:
            if not os.path.isfile(file):
                raise FileNotFoundError(
                    f"Could not find annotated CoNLL file {file}."
                )
            self.docs = list(conll_documents(file))
        return self.docs

    def del_kb(self):
        """
        Frees up memory by deleting self.kb
        """
        self.kb = None

    def get_kb(self):
        """
        :returns: self.kb, reading it from file if not loaded
        """
        if not self.kb:
            print("Loading vocabulary...")
            vocab = Vocab().from_disk(self.spacy_nlp_vocab_dir)

            print("Loading KB...")
            self.kb = KnowledgeBase(vocab=vocab)
            self.kb.load_bulk(self.spacy_kb_file)
            print("KB loaded!")
        return self.kb

    def write_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Writes self.docs_entities to file.
        The file then contains all necessary candidate info, which allows
        candidates to be read from file with read_entities_info later.

        :param file: file destination of output file
        """
        if not self.docs_entities:
            raise ValueError("ERROR: No candidates to write to file. "
                             "Try the function 'get_candidates' first.")

        print(f"Writing json to file {file} ...")
        with open(file, 'w') as of:
            json.dump(self.docs_entities, of)

    def read_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Reads self.docs_entities from file, and returns self.docs_entities.
        The file should be the result of write_entities_info,
        and gives all necessary candidate info.

        :param file: path to file written by write_entities_info
        :returns: self.docs_entities
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"Could not find file {file}. "
                                    "Try the function write_entities_info first.")

        print("Reading from file...")
        with open(file, 'r') as inf:
            self.docs_entities = json.load(inf)
        return self.docs_entities

    def generate_candidates_for_doc(self, doc: ConllDocument) -> List[Dict]:
        """
        Takes a ConllDocument object with tagged tokens
        (e.g. from conll_documents()).

        Outputs a list of dictionaries, one for each tagged named entity.
        Each dict holds:
            the ground truth of the entity (as a 'Q-ID' from WikiData),
            the token position of the entity as a pair (start, end),
            and a list of candidates, represented by their WikiData 'Q-ID'.

        :param doc: a ConllDocument object with tokens tagged with WikiData IDs
        :returns: a list over the tagged named entities, each a dictionary of
                  ground truth, entity position, and candidates
        """
        self.get_kb()

        # The return variable. Stores the list of entities.
        entities = []

        # Inner function to append a label_dict to the entities list
        def add_entity(entity_span_s, entity_span_e, entity_tokens, entity_gt):
            entity_text = ' '.join(entity_tokens)
            entity_candidates = [
                c.entity_ for c in self.kb.get_candidates(entity_text)
            ]
            entity_span = [entity_span_s, entity_span_e]
            entities.append(
                {'Position': entity_span,
                 'GroundTruth': entity_gt,
                 'Candidates': entity_candidates}
            )

        # Helper variables for the iteration:

        # Tokens belonging to current entity
        collected_tokens = []
        # Tag of the current entity (the ground truth)
        current_entity_tag = None
        # Position of the first entity token in the document tokens list
        span_start = None

        # Enumerate the document's list of tokens
        for i_token, token in enumerate(doc.tokens):

            # If we are looking at the beginning of a named entity
            if token.true_label.startswith("Q") or token.true_label == "B":

                # Check if we already have collected a named entity.
                # This is the case when two named entities follow each other.
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token - 1, collected_tokens, current_entity_tag)

                span_start = i_token
                collected_tokens = [token.text]
                current_entity_tag = token.true_label

            # If we are looking at the continuation of a named entity
            elif token.true_label == 'I':
                collected_tokens.append(token.text)

            # If we're not looking at a token in a named entity
            else:
                # If we have passed the end of a named entity
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token - 1, collected_tokens, current_entity_tag)
                collected_tokens = []

        # If the last tokens were a named entity
        if len(collected_tokens) > 0:
            add_entity(span_start, len(doc.tokens) - 1, collected_tokens, current_entity_tag)

        return entities

    def get_docs_entities(
            self,
            f: str = None,
            del_kb: bool = True
    ) -> List[List[Dict]]:
        """
        Iterates CoNLL documents and gets the candidates for all mentions

        :param f: file with tagged conll documents
        :param del_kb: whether to delete the KB object to free up space
        :returns: a list of dicts with lists of info about entities
        """
        # Generate if not cached
        if not self.docs_entities:
            if self.docs:
                self.docs = []

            for conll_doc in self.get_docs(f):
                self.docs_entities.append(
                    self.generate_candidates_for_doc(conll_doc)
                )

        if del_kb:
            print("Deleting Spacy KB object...")
            self.del_kb()

        return self.docs_entities

    def print_candidate_stats(self):
        """
        Prints metrics about generated candidates
        """
        if not self.docs_entities:
            print("No candidates info.")
            return

        # Number of entities with no candidates (no data points)
        n_no_cand = 0
        # Number of entities where ground truth is among the candidates
        n_pos_labels = 0
        # Number of entities where GT is not among the candidates
        n_no_pos_labels = 0
        # Number of candidates excluding the GT candidate
        n_neg_labels = 0
        # Total number of named entities
        n_ne = 0
        # Only named entities in the wikidata KB
        n_ne_in_kb = 0
        # Number of named entities not linked to Wikidata KB
        n_ne_bs = 0
        # Number of candidates that belong to entities with no GT
        n_b_cands = 0

        for doc_entities in self.docs_entities:
            for entity in doc_entities:
                n_ne += 1

                if len(entity['Candidates']) == 0:
                    n_no_cand += 1
                elif entity['GroundTruth'] in entity['Candidates']:
                    n_pos_labels += 1
                    n_neg_labels += len(entity['Candidates']) - 1
                else:
                    n_no_pos_labels += 1
                    n_neg_labels += len(entity['Candidates'])

                if entity['GroundTruth'] == 'B':
                    n_ne_bs += 1
                    n_b_cands += len(entity['Candidates'])
                else:
                    n_ne_in_kb += len(entity['Candidates'])

        n_cand = n_pos_labels + n_neg_labels

        print(f"{n_ne: >7,} named entities in total")
        print(f"{n_cand: >7,} candidates in total "
              f"(total number of data points)")
        print(f"{n_pos_labels: >7,} / {n_cand: >7,} positive labels "
              f"({100 * n_pos_labels / n_cand: >5.2f} % of all labels)")
        print(f"{n_neg_labels: >7,} / {n_cand: >7,} negative labels "
              f"({100 * n_neg_labels / n_cand: >5.2f} % of all labels)")
        print(f"{n_no_cand: >7,} / {n_ne: >7,} "
              f"named entities have no candidates")
        print(f"{n_no_pos_labels: >7,} / {n_ne: >7,} "
              f"named entities where correct label is not among candidates")
        print(f"{n_ne_in_kb: >7,} / {n_cand: >7,} "
              f"candidates tagged with GT in Wikidata KB")
        print(f"{n_ne_bs: >7,} / {n_cand: >7,} "
              f"candidates for named entities not in Wikidata KB")
        print(f"{n_cand / n_ne:.1f} average number of candidates per entity")
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and
    train the entity linker. The `vocab` should be the one used during creation
    of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "entity_linker" not in nlp.pipe_names:
        entity_linker = nlp.create_pipe("entity_linker")
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
    else:
        entity_linker = nlp.get_pipe("entity_linker")
        kb = entity_linker.kb

    # make sure the annotated examples correspond to known identifiers in the knowledge base
    kb_ids = kb.get_entity_strings()
    for text, annotation in TRAIN_DATA:
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            annotation["links"][offset] = new_dict

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
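# TRAIN_DATA is referenced above but not shown. In the spaCy v2 entity-linking
# examples it is a list of (text, annotation) pairs where "links" maps a
# character-offset span to gold KB identifiers; the sentence and probabilities
# below are an illustrative sketch of that format, not the original data.
TRAIN_DATA = [
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]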
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create blank Language class with specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only storing the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def train_el():
    """Step 2: Once we have done the manual annotations, use them to train a new Entity Linking component."""
    nlp = spacy.load(output_dir / "my_nlp")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(output_dir / "my_kb")

    dataset = []
    json_loc = prodigy_dir / "emerson_annotated_text.jsonl"
    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)
            text = example["text"]
            if example["answer"] == "accept":
                QID = example["accept"][0]
                offset = (example["spans"][0]["start"], example["spans"][0]["end"])
                links_dict = {QID: 1.0}
                dataset.append((text, {"links": {offset: links_dict}}))

    gold_ids = []
    for text, annot in dataset:
        for span, links_dict in annot["links"].items():
            for link, value in links_dict.items():
                if value:
                    gold_ids.append(link)

    print("Statistics of manually annotated data:")
    print(Counter(gold_ids))
    print()

    train_dataset = []
    test_dataset = []
    for QID in ['Q312545', 'Q48226', 'Q215952']:
        indices = [i for i, j in enumerate(gold_ids) if j == QID]
        train_dataset.extend(dataset[index] for index in indices[0:8])  # first 8 in training
        test_dataset.extend(dataset[index] for index in indices[8:10])  # last 2 in test

    # avoid artificial signals by reshuffling the datasets
    random.shuffle(train_dataset)
    random.shuffle(test_dataset)

    TRAIN_DOCS = []
    for text, annotation in train_dataset:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = nlp(text)
        TRAIN_DOCS.append((doc, annotation))

    entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    nlp.add_pipe(entity_linker, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    print("Training the entity linker")
    with nlp.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = nlp.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train on this small dataset
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch size
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)
    print()

    nlp.to_disk(output_dir / "my_nlp_el")

    with open(output_dir / "test_set.pkl", "wb") as f:
        pickle.dump(test_dataset, f)
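# A minimal sketch of how the held-out set written above could be used to
# spot-check the trained linker. The helper name and the comparison logic are
# illustrative and not part of the original script.
def check_test_set():
    nlp = spacy.load(output_dir / "my_nlp_el")
    with open(output_dir / "test_set.pkl", "rb") as f:
        test_dataset = pickle.load(f)

    for text, annotation in test_dataset:
        doc = nlp(text)
        gold_ids = {qid for links in annotation["links"].values() for qid, v in links.items() if v}
        # compare the predicted KB id of each recognised entity against the gold ids
        for ent in doc.ents:
            print(ent.text, "->", ent.kb_id_, "| gold:", gold_ids)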
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and
    train the entity linker. The `vocab` should be the one used during creation
    of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1: create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2: deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3: create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4: read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={"context_width": CONTEXT_WIDTH,
                                            "pretrained_vectors": nlp_2.vocab.vectors.name,
                                            "type_to_int": type_to_int})
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(nlp=nlp_2,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit)

        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)

        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
                          " / dev acc avg", round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(
                dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3),
                  [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3),
                  [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3),
                  [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    # STEP 8: apply the EL pipe on a toy example
    if to_test_pipeline:
        print()
        print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
        print()
        run_el_toy_example(nlp=nlp_2)

    # STEP 9: write the NLP pipeline (including entity linker) to file
    if to_write_nlp:
        print()
        print("STEP 9: testing NLP IO", datetime.datetime.now())
        print()
        print("writing to", NLP_2_DIR)
        nlp_2.to_disk(NLP_2_DIR)
        print()

    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)

        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data,
                                                              el_pipe=el_pipe,
                                                              error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())