def read_nlp_kb(model_dir, kb_file):
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    logger.info("kb entities: {}".format(kb.get_size_entities()))
    logger.info("kb aliases: {}".format(kb.get_size_aliases()))
    return nlp, kb
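# A minimal usage sketch of the helper above, using the spaCy v2 KB API.
# The paths and the mention string are hypothetical placeholders, not taken
# from the original code.
nlp, kb = read_nlp_kb("output/nlp", "output/kb")

# Query the loaded KB for candidate entities of a mention string.
for candidate in kb.get_candidates("Russ Cochran"):
    print(candidate.entity_, candidate.prior_prob)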
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into dictionaries mapping QIDs to the full names and descriptions
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
class NamedEntityCreator:
    def __init__(self, kb_folder, vectors_loc, lang='sv', stz=True, vectors_name='fasttext'):
        self.nlp = create_model(vectors_loc=vectors_loc, lang=lang, stz=stz,
                                vectors_name=vectors_name, max_items=1000)
        self.kb = KnowledgeBase(vocab=self.nlp.vocab)
        print(kb_folder)
        self.kb.load_bulk(kb_folder)
        print()
        _print_kb(self.kb)
def load(self, output_dir):
    kb_path = os.path.join(output_dir, "kb")
    vocab_path = os.path.join(output_dir, "vocab")
    print("Loading vocab from", vocab_path)
    print("Loading KB from", kb_path)
    vocab = Vocab().from_disk(vocab_path)
    kb = KnowledgeBase(vocab=vocab)
    kb.load_bulk(kb_path)
    self.kb = kb
    return self.kb
def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.dump(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.load_bulk(path)
        except Exception as e:
            pytest.fail(str(e))
def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    # final assertions
    _check_kb(kb2)
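# The test above relies on helpers (_get_dummy_kb, _check_kb) that are not shown
# here. Below is a minimal sketch of what such a KB builder could look like with
# the spaCy v2 KB API; the entity IDs, frequencies, vectors and alias are
# illustrative only, not the values used in the original test suite.
def _build_dummy_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    # entities are added with a frequency and a fixed-length entity vector
    kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
    kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
    # an alias maps a surface form to candidate entities with prior probabilities
    kb.add_alias(alias="double07", entities=["Q17", "Q53"], probabilities=[0.9, 0.1])
    return kb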
def from_disk(self, path: Path, **kwargs):
    """Deserialize saved AnnLinker from disk.

    path (Path): directory to deserialize from
    RETURNS (AnnLinker): Initialized AnnLinker
    """
    path = util.ensure_path(path)

    kb = KnowledgeBase(self.nlp.vocab, 300)
    kb.load_bulk(path / "kb")
    self.set_kb(kb)

    cg = CandidateGenerator().from_disk(path)
    self.set_cg(cg)

    cfg = srsly.read_json(path / "cfg")
    self.threshold = cfg.get("threshold", 0.7)
    self.no_description_threshold = cfg.get("no_description_threshold", 0.95)
    self.disambiguate = cfg.get("disambiguate", True)

    return self
def main(
    dir_kb,
    output_dir=None,
    loc_training=None,
    wp_xml=None,
    epochs=10,
    dropout=0.5,
    lr=0.005,
    l2=1e-6,
    train_inst=None,
    dev_inst=None,
    limit=None,
):
    print(now(), "Creating Entity Linker with Wikipedia and WikiData")
    print()

    # STEP 0: set up IO
    if output_dir and not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: load the NLP object
    nlp_dir = dir_kb / "nlp"
    print(now(), "STEP 1: loading model from", nlp_dir)
    nlp = spacy.load(nlp_dir)

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError(Errors.E152)

    # STEP 2: read the KB
    print()
    print(now(), "STEP 2: reading the KB from", dir_kb / "kb")
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(dir_kb / "kb")

    # STEP 3: create a training dataset from WP
    print()
    if loc_training:
        print(now(), "STEP 3: reading training dataset from", loc_training)
    else:
        if not wp_xml:
            raise ValueError(Errors.E153)

        if output_dir:
            loc_training = output_dir / "training_data"
        else:
            loc_training = dir_kb / "training_data"
        if not loc_training.exists():
            loc_training.mkdir()
        print(now(), "STEP 3: creating training dataset at", loc_training)

        if limit is not None:
            print("Warning: reading only", limit, "lines of Wikipedia dump.")

        loc_entity_defs = dir_kb / "entity_defs.csv"
        training_set_creator.create_training(
            wikipedia_input=wp_xml,
            entity_def_input=loc_entity_defs,
            training_output=loc_training,
            limit=limit,
        )

    # STEP 4: parse the training data
    print()
    print(now(), "STEP 4: parse the training & evaluation data")

    # for training, get pos & neg instances that correspond to entries in the kb
    print("Parsing training data, limit =", train_inst)
    train_data = training_set_creator.read_training(
        nlp=nlp, training_dir=loc_training, dev=False, limit=train_inst, kb=kb
    )
    print("Training on", len(train_data), "articles")
    print()

    print("Parsing dev testing data, limit =", dev_inst)
    # for testing, get all pos instances, whether or not they are in the kb
    dev_data = training_set_creator.read_training(
        nlp=nlp, training_dir=loc_training, dev=True, limit=dev_inst, kb=None
    )
    print("Dev testing on", len(dev_data), "articles")
    print()

    # STEP 5: create and train the entity linking pipe
    print()
    print(now(), "STEP 5: training Entity Linking pipe")

    el_pipe = nlp.create_pipe(
        name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name}
    )
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    if not train_data:
        print("Did not find any training data")
    else:
        for itn in range(epochs):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
            batchnr = 0

            with nlp.disable_pipes(*other_pipes):
                for batch in batches:
                    try:
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs=docs,
                            golds=golds,
                            sgd=optimizer,
                            drop=dropout,
                            losses=losses,
                        )
                        batchnr += 1
                    except Exception as e:
                        print("Error updating batch:", e)

            if batchnr > 0:
                el_pipe.cfg["incl_context"] = True
                el_pipe.cfg["incl_prior"] = True
                dev_acc_context, _ = _measure_acc(dev_data, el_pipe)
                losses["entity_linker"] = losses["entity_linker"] / batchnr
                print(
                    "Epoch, train loss",
                    itn,
                    round(losses["entity_linker"], 2),
                    " / dev accuracy avg",
                    round(dev_acc_context, 3),
                )

    # STEP 6: measure the performance of our trained pipe on an independent dev set
    print()
    if len(dev_data):
        print()
        print(now(), "STEP 6: performance measurement of Entity Linking pipe")
        print()

        counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines(
            dev_data, kb
        )
        print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))

        oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()]
        print("dev accuracy oracle:", round(acc_o, 3), oracle_by_label)

        random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()]
        print("dev accuracy random:", round(acc_r, 3), random_by_label)

        prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()]
        print("dev accuracy prior:", round(acc_p, 3), prior_by_label)

        # using only context
        el_pipe.cfg["incl_context"] = True
        el_pipe.cfg["incl_prior"] = False
        dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe)
        context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()]
        print("dev accuracy context:", round(dev_acc_context, 3), context_by_label)

        # measuring combined accuracy (prior + context)
        el_pipe.cfg["incl_context"] = True
        el_pipe.cfg["incl_prior"] = True
        dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe)
        combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()]
        print("dev accuracy prior+context:", round(dev_acc_combo, 3), combo_by_label)

    # STEP 7: apply the EL pipe on a toy example
    print()
    print(now(), "STEP 7: applying Entity Linking to toy example")
    print()
    run_el_toy_example(nlp=nlp)

    # STEP 8: write the NLP pipeline (including entity linker) to file
    if output_dir:
        print()
        nlp_loc = output_dir / "nlp"
        print(now(), "STEP 8: Writing trained NLP to", nlp_loc)
        nlp.to_disk(nlp_loc)
        print()

    print()
    print(now(), "Done!")
def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
    # For simplicity, we'll just use the original vector dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)
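# Several of the scripts here call a _print_kb helper that is not shown. A
# plausible minimal sketch, assuming it only inspects the KB with documented
# spaCy v2 accessors:
def _print_kb(kb):
    print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
    print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())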
def read_kb(nlp, kb_file):
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    return kb
def settingup_knowledgebase(self, names, train_data_2):
    QID = names['QID'].values.tolist()
    Names = names['Names'].values.tolist()
    Frequency = names['Frequency'].values.tolist()

    descript = []
    for desc in names['Description']:
        descript.append(self.custom_ner_model(desc).vector)

    print("Setting up entities\n")
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab, entity_vector_length=96)
    kb.set_entities(entity_list=QID, freq_list=Frequency, vector_list=descript)

    print("Setting up aliases\n")
    print("\n")
    print("Spacy Pipeline\n")
    print(self.custom_ner_model.pipe_names)

    # kb_dump_file = str(input("Enter the KB Dump name: "))
    # kb_vocab_folder = str(input("Enter the KB Vocab name: "))
    folder.nel_kb_vocab()

    alias_prep = list(zip(Names, QID))
    for i, j in alias_prep:
        names_alias = str(i)
        list_qid = [j]
        prob = [1.0]
        kb.add_alias(alias=names_alias, entities=list_qid, probabilities=prob)

    kb.dump("KB_Dump")
    kb.vocab.to_disk("KB_Vocab")
    print("\n")
    print("Knowledge base dump and vocab are stored on the local disk")

    train_data_dict_2 = train_data_2.to_dict('records')
    dataset_2 = []
    for data in train_data_dict_2:
        Text = data['Text']
        Name = data['Name']
        QID = data['QID']
        offset = (data["Start"], data["End"])
        links_dict = {QID: 1.0}
        dataset_2.append((Text, {"links": {offset: links_dict}}))

    self.custom_ner_model.vocab.from_disk("KB_Vocab")
    self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
    kb.load_bulk("KB_Dump")

    TRAIN_DOCS = []
    for text, annotation in dataset_2:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = self.custom_ner_model(text)
        TRAIN_DOCS.append((doc, annotation))

    print("\n")
    print("Training started for Named Entity Linking\n")
    entity_linker = self.custom_ner_model.create_pipe("entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    self.custom_ner_model.add_pipe(entity_linker, last=True)

    other_pipes = [
        pipe for pipe in self.custom_ner_model.pipe_names if pipe != "entity_linker"
    ]
    with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = self.custom_ner_model.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.custom_ner_model.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)

    print("\n")
    print("Spacy Pipeline\n")
    print(self.custom_ner_model.pipe_names)

    ner_dump_name = str(input("Enter the Model name: "))
    self.custom_ner_model.to_disk(ner_dump_name)
    return self.custom_ner_model
class ConllCandidatesGenerator:

    def __init__(
            self,
            spacy_nlp_vocab_dir: str = "data/vocab",
            spacy_kb_file: str = "data/kb"
    ):
        """
        :param spacy_nlp_vocab_dir: path to directory with spaCy vocab files
        :param spacy_kb_file: path to file with spaCy KnowledgeBase
        """
        # self.spacy_nlp_str = spacy_nlp_str
        self.spacy_nlp_vocab_dir = spacy_nlp_vocab_dir
        self.spacy_kb_file = spacy_kb_file

        # Initialized in get_kb()
        self.kb = None

        self.docs = []
        self.docs_entities = []

    def get_docs(self, file: str = 'conll-wikidata-iob-annotations'):
        """
        :param file: path to file with Wikidata-annotated CoNLL dataset
        :returns: self.docs, reading it from file if not loaded
        """
        if not self.docs:
            if not os.path.isfile(file):
                raise FileNotFoundError(
                    f"Could not find annotated CoNLL file {file}."
                )
            self.docs = list(conll_documents(file))
        return self.docs

    def del_kb(self):
        """
        Frees up memory by deleting self.kb
        """
        self.kb = None

    def get_kb(self):
        """
        :returns: self.kb, reading it from file if not loaded
        """
        if not self.kb:
            print("Loading vocabulary...")
            vocab = Vocab().from_disk(self.spacy_nlp_vocab_dir)

            print("Loading KB...")
            self.kb = KnowledgeBase(vocab=vocab)
            self.kb.load_bulk(self.spacy_kb_file)
            print("KB loaded!")
        return self.kb

    def write_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Writes self.docs_entities to file.
        The file then contains all necessary candidate info, which allows
        candidates to be read from file with read_entities_info later.

        :param file: file destination of output file
        """
        if not self.docs_entities:
            raise ValueError("ERROR: No candidates to write to file. "
                             "Try the function 'get_candidates' first.")

        print(f"Writing json to file {file} ...")
        with open(file, 'w') as of:
            json.dump(self.docs_entities, of)

    def read_entities_info(self, file: str = "docs_entities_info.json"):
        """
        Reads self.docs_entities from file, and returns self.docs_entities.
        The file should be the result of write_entities_info,
        and gives all necessary candidate info.

        :param file: path to file written by write_entities_info
        :returns: self.docs_entities
        """
        if not os.path.isfile(file):
            raise FileNotFoundError(f"Could not find file {file}. "
                                    "Try the function write_entities_info first.")

        print("Reading from file...")
        with open(file, 'r') as inf:
            self.docs_entities = json.load(inf)
        return self.docs_entities

    def generate_candidates_for_doc(self, doc: ConllDocument) -> List[Dict]:
        """
        Takes a ConllDocument object with tagged tokens
        (e.g. from conll_documents()).

        Outputs a list of dictionaries, one for each tagged named entity.
        Each dict holds:
            the ground truth of the entity (as a 'Q-ID' from WikiData),
            the token position of the entity as a pair (start, end),
            and a list of candidates, represented by their WikiData 'Q-ID'.

        :param doc: a ConllDocument object with tokens tagged with WikiData IDs
        :returns: a list over the tagged named entities, each a dictionary of
                  ground truth, entity position, and candidates
        """
        self.get_kb()

        # The return variable. Stores the list of entities.
        entities = []

        # Inner function to append a label_dict to the entities list
        def add_entity(entity_span_s, entity_span_e, entity_tokens, entity_gt):
            entity_text = ' '.join(entity_tokens)
            entity_candidates = [
                c.entity_ for c in self.kb.get_candidates(entity_text)
            ]
            entity_span = [entity_span_s, entity_span_e]
            entities.append(
                {'Position': entity_span,
                 'GroundTruth': entity_gt,
                 'Candidates': entity_candidates}
            )

        # Helper variables for the iteration:

        # Tokens belonging to current entity
        collected_tokens = []
        # Tag of the current entity (the ground truth)
        current_entity_tag = None
        # Position of the first entity token in the document tokens list
        span_start = None

        # Enumerate the document's list of tokens
        for i_token, token in enumerate(doc.tokens):

            # If we are looking at the beginning of a named entity
            if token.true_label.startswith("Q") or token.true_label == "B":

                # Check if we already have collected a named entity.
                # This is the case when two named entities follow each other.
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token - 1, collected_tokens, current_entity_tag)

                span_start = i_token
                collected_tokens = [token.text]
                current_entity_tag = token.true_label

            # If we are looking at the continuation of a named entity
            elif token.true_label == 'I':
                collected_tokens.append(token.text)

            # If we're not looking at a token in a named entity
            else:
                # If we have passed the end of a named entity
                if len(collected_tokens) > 0:
                    add_entity(span_start, i_token - 1, collected_tokens, current_entity_tag)
                collected_tokens = []

        # If the last tokens were a named entity
        if len(collected_tokens) > 0:
            add_entity(span_start, len(doc.tokens) - 1, collected_tokens, current_entity_tag)

        return entities

    def get_docs_entities(
            self,
            f: str = None,
            del_kb: bool = True
    ) -> List[List[Dict]]:
        """
        Iterates CoNLL documents and gets the candidates for all mentions

        :param f: file with tagged conll documents
        :param del_kb: whether to delete the KB object to free up space
        :returns: a list of dicts with lists of info about entities
        """
        # Generate if not cached
        if not self.docs_entities:
            if self.docs:
                self.docs = []

            for conll_doc in self.get_docs(f):
                self.docs_entities.append(
                    self.generate_candidates_for_doc(conll_doc)
                )

        if del_kb:
            print("Deleting Spacy KB object...")
            self.del_kb()

        return self.docs_entities

    def print_candidate_stats(self):
        """
        Prints metrics about generated candidates
        """
        if not self.docs_entities:
            print("No candidates info.")
            return

        # Number of entities with no candidates (no data points)
        n_no_cand = 0
        # Number of entities where ground truth is among the candidates
        n_pos_labels = 0
        # Number of entities where GT is not among the candidates
        n_no_pos_labels = 0
        # Number of candidates excluding the GT candidate
        n_neg_labels = 0
        # Total number of named entities
        n_ne = 0
        # Only named entities in the wikidata KB
        n_ne_in_kb = 0
        # Number of named entities not linked to Wikidata KB
        n_ne_bs = 0
        # Number of candidates that belong to entities with no GT
        n_b_cands = 0

        for doc_entities in self.docs_entities:
            for entity in doc_entities:
                n_ne += 1

                if len(entity['Candidates']) == 0:
                    n_no_cand += 1
                elif entity['GroundTruth'] in entity['Candidates']:
                    n_pos_labels += 1
                    n_neg_labels += len(entity['Candidates']) - 1
                else:
                    n_no_pos_labels += 1
                    n_neg_labels += len(entity['Candidates'])

                if entity['GroundTruth'] == 'B':
                    n_ne_bs += 1
                    n_b_cands += len(entity['Candidates'])
                else:
                    n_ne_in_kb += len(entity['Candidates'])

        n_cand = n_pos_labels + n_neg_labels

        print(f"{n_ne: >7,} named entities in total")
        print(f"{n_cand: >7,} candidates in total "
              f"(total number of data points)")
        print(f"{n_pos_labels: >7,} / {n_cand: >7,} positive labels "
              f"({100 * n_pos_labels / n_cand: >5.2f} % of all labels)")
        print(f"{n_neg_labels: >7,} / {n_cand: >7,} negative labels "
              f"({100 * n_neg_labels / n_cand: >5.2f} % of all labels)")
        print(f"{n_no_cand: >7,} / {n_ne: >7,} "
              f"named entities have no candidates")
        print(f"{n_no_pos_labels: >7,} / {n_ne: >7,} "
              f"named entities where correct label is not among candidates")
        print(f"{n_ne_in_kb: >7,} / {n_cand: >7,} "
              f"candidates tagged with GT in Wikidata KB")
        print(f"{n_ne_bs: >7,} / {n_cand: >7,} "
              f"candidates for named entities not in Wikidata KB")
        print(f"{n_cand / n_ne:.1f} average number of candidates per entity")
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and
    train the entity linker. The `vocab` should be the one used during creation
    of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "entity_linker" not in nlp.pipe_names:
        entity_linker = nlp.create_pipe("entity_linker")
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
    else:
        entity_linker = nlp.get_pipe("entity_linker")
        kb = entity_linker.kb

    # make sure the annotated examples correspond to known identifiers in the knowledge base
    kb_ids = kb.get_entity_strings()
    for text, annotation in TRAIN_DATA:
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            annotation["links"][offset] = new_dict

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
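# TRAIN_DATA is referenced above but not shown. In the spaCy v2 entity-linking
# examples it is a list of (text, annotation) pairs where "links" maps a
# character-offset span to gold KB identifiers; the sentence and probabilities
# below are an illustrative sketch of that format, not the original data.
TRAIN_DATA = [
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]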
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create blank Language class with specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only storing the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""

    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def train_el():
    """Step 2: Once we have done the manual annotations, use them to train a new Entity Linking component."""
    nlp = spacy.load(output_dir / "my_nlp")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(output_dir / "my_kb")

    dataset = []
    json_loc = prodigy_dir / "emerson_annotated_text.jsonl"
    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)
            text = example["text"]
            if example["answer"] == "accept":
                QID = example["accept"][0]
                offset = (example["spans"][0]["start"], example["spans"][0]["end"])
                links_dict = {QID: 1.0}
                dataset.append((text, {"links": {offset: links_dict}}))

    gold_ids = []
    for text, annot in dataset:
        for span, links_dict in annot["links"].items():
            for link, value in links_dict.items():
                if value:
                    gold_ids.append(link)

    print("Statistics of manually annotated data:")
    print(Counter(gold_ids))
    print()

    train_dataset = []
    test_dataset = []
    for QID in ['Q312545', 'Q48226', 'Q215952']:
        indices = [i for i, j in enumerate(gold_ids) if j == QID]
        train_dataset.extend(dataset[index] for index in indices[0:8])  # first 8 in training
        test_dataset.extend(dataset[index] for index in indices[8:10])  # last 2 in test

    # avoid artificial signals by reshuffling the datasets
    random.shuffle(train_dataset)
    random.shuffle(test_dataset)

    TRAIN_DOCS = []
    for text, annotation in train_dataset:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = nlp(text)
        TRAIN_DOCS.append((doc, annotation))

    entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    nlp.add_pipe(entity_linker, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    print("Training the entity linker")
    with nlp.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = nlp.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train on this small dataset
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch size
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)
    print()

    nlp.to_disk(output_dir / "my_nlp_el")

    with open(output_dir / "test_set.pkl", "wb") as f:
        pickle.dump(test_dataset, f)
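# A minimal sketch of how the held-out set written above could be used to
# spot-check the trained linker. The helper name and the comparison logic are
# illustrative and not part of the original script.
def check_test_set():
    nlp = spacy.load(output_dir / "my_nlp_el")
    with open(output_dir / "test_set.pkl", "rb") as f:
        test_dataset = pickle.load(f)

    for text, annotation in test_dataset:
        doc = nlp(text)
        gold_ids = {qid for links in annotation["links"].values() for qid, v in links.items() if v}
        # compare the predicted KB id of each recognised entity against the gold ids
        for ent in doc.ents:
            print(ent.text, "->", ent.kb_id_, "| gold:", gold_ids)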
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and
    train the entity linker. The `vocab` should be the one used during creation
    of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1: create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2: deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3: create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4: read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={"context_width": CONTEXT_WIDTH,
                                            "pretrained_vectors": nlp_2.vocab.vectors.name,
                                            "type_to_int": type_to_int})
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(nlp=nlp_2,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit)

        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)

        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
                          " / dev acc avg", round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(
                dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3),
                  [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3),
                  [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3),
                  [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    # STEP 8: apply the EL pipe on a toy example
    if to_test_pipeline:
        print()
        print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
        print()
        run_el_toy_example(nlp=nlp_2)

    # STEP 9: write the NLP pipeline (including entity linker) to file
    if to_write_nlp:
        print()
        print("STEP 9: testing NLP IO", datetime.datetime.now())
        print()
        print("writing to", NLP_2_DIR)
        nlp_2.to_disk(NLP_2_DIR)
        print()

    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)

        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data,
                                                              el_pipe=el_pipe,
                                                              error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())