def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1

def test_kb_to_bytes():
    # Test that the KB's to_bytes method works correctly
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2])
    assert kb_1.contains_alias("Russ Cochran")

    kb_bytes = kb_1.to_bytes()
    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)

    # check that both KBs are exactly the same
    assert kb_1.get_size_entities() == kb_2.get_size_entities()
    assert kb_1.entity_vector_length == kb_2.entity_vector_length
    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(kb_2.get_alias_candidates("Russ Cochran"))
    assert len(kb_1.get_alias_candidates("Randomness")) == len(kb_2.get_alias_candidates("Randomness"))

def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # adding entities
    mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
    mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
    mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])

    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the corresponding KB
    assert mykb.get_size_entities() == 3
    assert mykb.get_size_aliases() == 2

    # test retrieval of the entity vectors
    assert mykb.get_vector("Q1") == [8, 4, 3]
    assert mykb.get_vector("Q2") == [2, 1, 0]
    assert mykb.get_vector("Q3") == [-1, -6, 5]

    # test retrieval of prior probabilities
    assert_almost_equal(mykb.get_prior_prob(entity="Q2", alias="douglas"), 0.8)
    assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglas"), 0.2)
    assert_almost_equal(mykb.get_prior_prob(entity="Q342", alias="douglas"), 0.0)
    assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglassssss"), 0.0)

def read_nlp_kb(model_dir, kb_file):
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    logger.info("kb entities: {}".format(kb.get_size_entities()))
    logger.info("kb aliases: {}".format(kb.get_size_aliases()))
    return nlp, kb

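For context, here is a minimal sketch of the writing side that read_nlp_kb assumes, using the same v2-style kb.dump() and nlp.to_disk() calls that appear in the pipeline script further down. The function name and log messages are illustrative only, not part of the original project:

def write_nlp_kb(nlp, kb, model_dir, kb_file):
    # hypothetical counterpart of read_nlp_kb: persist the pipeline and the KB to disk
    kb.dump(kb_file)        # paired with kb.load_bulk(kb_file) on the reading side
    nlp.to_disk(model_dir)
    logger.info("wrote kb: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))
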
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=1)

    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
    kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])

    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
    kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])

    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
    kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])

    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])

    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb

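A quick usage sketch for the toy KB built above: retrieving the candidates registered for the "Douglas" alias and inspecting their prior probabilities. This assumes a spaCy v2.x KnowledgeBase, where get_candidates(alias) returns Candidate objects (newer versions expose the same lookup as get_alias_candidates, used in the to_bytes test above):

kb = create_kb(nlp.vocab)  # assumes an existing `nlp` object, e.g. spacy.load("en_core_web_sm")
for candidate in kb.get_candidates("Douglas"):
    # each Candidate pairs an entity ID with the alias->entity prior probability
    print(candidate.entity_, candidate.prior_prob)
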
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)

    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
    kb.add_entity(entity=entity_0, prob=0.5)

    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
    kb.add_entity(entity=entity_1, prob=0.5)

    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
    kb.add_entity(entity=entity_2, prob=0.5)

    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])

    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb

def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2')
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert mykb.get_size_entities() == 3
    assert mykb.get_size_aliases() == 2

def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert mykb.get_size_entities() == 3
    assert mykb.get_size_aliases() == 2

import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_sm")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# adding entities
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])

# adding aliases
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

print()
print("Number of entities in KB:", kb.get_size_entities())  # 3
print("Number of aliases in KB:", kb.get_size_aliases())  # 2

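As a follow-up to the snippet above, the freshly built KB would typically be handed to an entity_linker component. The sketch below uses the v2-style create_pipe / set_kb / add_pipe calls that also appear in the pipeline script later in this section; the exact component config options depend on the spaCy version:

# attach the KB to an entity linking component (spaCy v2-style API, sketch only)
entity_linker = nlp.create_pipe("entity_linker")
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)
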
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_input,
    entity_descr_path,
    count_input,
    prior_prob_input,
    entity_vector_length,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # read the mappings from file
    title_to_id = get_entity_to_id(entity_def_input)
    id_to_descr = get_id_to_description(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    logger.info("Get entity frequencies")
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq
    )
    logger.info("Left with {} entities".format(len(description_list)))

    logger.info("Train entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Get entity embeddings:")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)

    logger.info("Adding aliases")
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    logger.info("KB size: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))

    logger.info("Done with kb")
    return kb

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH)

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    read_raw_data = True

    if read_raw_data:
        print()
        print(" * _read_wikidata_entities", datetime.datetime.now())
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr)
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(" * _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    print("kept", len(filtered_title_to_id.keys()), "with frequency", min_entity_freq)

    print()
    print(" * train entity encoder", datetime.datetime.now())
    print()
    encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(" * get entity embeddings", datetime.datetime.now())
    print()
    embeddings = encoder.apply_encoder(description_list)

    print()
    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
    kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)

    print()
    print(" * adding aliases", datetime.datetime.now())
    print()
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())
    return kb

def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1: create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2: deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3: create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4: read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={"context_width": CONTEXT_WIDTH,
                                            "pretrained_vectors": nlp_2.vocab.vectors.name,
                                            "type_to_int": type_to_int})
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(nlp=nlp_2,
                                                        training_dir=TRAINING_DIR,
                                                        dev=False,
                                                        limit=train_limit)
        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)
        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
                          " / dev acc avg", round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    # STEP 8: apply the EL pipe on a toy example
    if to_test_pipeline:
        print()
        print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
        print()
        run_el_toy_example(nlp=nlp_2)

    # STEP 9: write the NLP pipeline (including entity linker) to file
    if to_write_nlp:
        print()
        print("STEP 9: testing NLP IO", datetime.datetime.now())
        print()
        print("writing to", NLP_2_DIR)
        nlp_2.to_disk(NLP_2_DIR)
        print()

    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                      training_dir=TRAINING_DIR,
                                                      dev=True,
                                                      limit=dev_limit)
        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input, limit=limit)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr)
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " * get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings)

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb