def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_alias=None,
    loc_entity_desc=None,
    descr_from_wp=False,
    limit_prior=None,
    limit_train=None,
    limit_wd=None,
    lang="en",
):
    entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
    entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
    entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
    entity_freq_path = output_dir / ENTITY_FREQ_PATH
    prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
    training_entities_path = output_dir / TRAINING_DATA_FILE
    kb_path = output_dir / KB_FILE

    logger.info("Creating KB with Wikipedia and WikiData")

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # STEP 1: Load the NLP object
    logger.info("STEP 1: Loading NLP model {}".format(model))
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # STEP 2: create prior probabilities from WP
    if not prior_prob_path.exists():
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
        if limit_prior is not None:
            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
    else:
        logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))

    # STEP 3: calculate entity frequencies
    if not entity_freq_path.exists():
        logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
        io.write_entity_to_count(prior_prob_path, entity_freq_path)
    else:
        logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))

    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
    if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
        if limit_wd is not None:
            logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
        title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
            wd_json,
            limit_wd,
            to_print=False,
            lang=lang,
            parse_descr=(not descr_from_wp),
        )
        io.write_title_to_id(entity_defs_path, title_to_id)

        logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
        io.write_id_to_alias(entity_alias_path, id_to_alias)

        if not descr_from_wp:
            logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
            io.write_id_to_descr(entity_descr_path, id_to_descr)
    else:
        logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
        logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
        if not descr_from_wp:
            logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))

    # STEP 5: Getting gold entities from Wikipedia
    if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
        logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
        if limit_train is not None:
            logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
        wp.create_training_and_desc(
            wp_xml,
            entity_defs_path,
            entity_descr_path,
            training_entities_path,
            descr_from_wp,
            limit_train,
        )
        if descr_from_wp:
            logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
    else:
        logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
        if descr_from_wp:
            logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))

    # STEP 6: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    if not kb_path.exists():
        logger.info("STEP 6: Creating the KB at {}".format(kb_path))
        kb = kb_creator.create_kb(
            nlp=nlp,
            max_entities_per_alias=max_per_alias,
            min_entity_freq=min_freq,
            min_occ=min_pair,
            entity_def_path=entity_defs_path,
            entity_descr_path=entity_descr_path,
            entity_alias_path=entity_alias_path,
            entity_freq_path=entity_freq_path,
            prior_prob_path=prior_prob_path,
            entity_vector_length=entity_vector_length,
        )
        kb.dump(kb_path)
        logger.info("kb entities: {}".format(kb.get_size_entities()))
        logger.info("kb aliases: {}".format(kb.get_size_aliases()))
        nlp.to_disk(output_dir / KB_MODEL_DIR)
    else:
        logger.info("STEP 6: KB already exists at {}".format(kb_path))

    logger.info("Done!")
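# A minimal invocation sketch (not part of the original script), assuming the dump
# files exist at the hypothetical paths below and that "en_core_web_lg" ships with
# pretrained word vectors. The small limit_* values are only there to make a quick
# dry run feasible; drop them to process the full dumps.
def _example_run():
    from pathlib import Path

    main(
        wd_json=Path("dumps/wikidata-latest-all.json.bz2"),         # hypothetical path
        wp_xml=Path("dumps/enwiki-latest-pages-articles.xml.bz2"),  # hypothetical path
        output_dir=Path("kb_output"),                               # hypothetical path
        model="en_core_web_lg",
        limit_prior=100000,  # only read part of the Wikipedia dump for a test run
        limit_train=100000,
        limit_wd=100000,
    )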
def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1 : create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(
            nlp_1,
            max_entities_per_alias=MAX_CANDIDATES,
            min_entity_freq=MIN_ENTITY_FREQ,
            min_occ=MIN_PAIR_OCC,
            entity_def_output=ENTITY_DEFS,
            entity_descr_output=ENTITY_DESCR,
            count_input=ENTITY_COUNTS,
            prior_prob_input=PRIOR_PROB,
            wikidata_input=WIKIDATA_JSON,
        )
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(
            wikipedia_input=ENWIKI_DUMP,
            entity_def_input=ENTITY_DEFS,
            training_output=TRAINING_DIR,
        )

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(
            name='entity_linker',
            config={
                "context_width": CONTEXT_WIDTH,
                "pretrained_vectors": nlp_2.vocab.vectors.name,
                "type_to_int": type_to_int,
            },
        )
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit
        )
        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit
        )
        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
                          " / dev acc avg", round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(
                dev_data, kb_2
            )
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    # STEP 8: apply the EL pipe on a toy example
    if to_test_pipeline:
        print()
        print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now())
        print()
        run_el_toy_example(nlp=nlp_2)

    # STEP 9: write the NLP pipeline (including entity linker) to file
    if to_write_nlp:
        print()
        print("STEP 9: testing NLP IO", datetime.datetime.now())
        print()
        print("writing to", NLP_2_DIR)
        nlp_2.to_disk(NLP_2_DIR)
        print()

        # verify that the IO has gone correctly
        if to_read_nlp:
            print("reading from", NLP_2_DIR)
            nlp_3 = spacy.load(NLP_2_DIR)

            print("running toy example with NLP 3")
            run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit
        )
        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())
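# A minimal sketch of inspecting a loaded KB by hand, assuming the spaCy v2.x
# KnowledgeBase candidate API (get_candidates, candidate.entity_, candidate.prior_prob);
# the helper name and the example alias are purely illustrative.
def print_candidates(kb, alias="Douglas Adams"):
    # list every entity the KB stores for this alias, with its prior probability
    for candidate in kb.get_candidates(alias):
        print(alias, "->", candidate.entity_, "prior prob:", round(candidate.prior_prob, 3))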
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_desc=None,
    descriptions_from_wikipedia=False,
    limit=None,
    lang="en",
):
    entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
    entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
    entity_freq_path = output_dir / ENTITY_FREQ_PATH
    prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
    training_entities_path = output_dir / TRAINING_DATA_FILE
    kb_path = output_dir / KB_FILE

    logger.info("Creating KB with Wikipedia and WikiData")

    if limit is not None:
        logger.warning("Warning: reading only {} lines of Wikipedia/Wikidata dumps.".format(limit))

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    # STEP 1: create the NLP object
    logger.info("STEP 1: Loading model {}".format(model))
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # STEP 2: create prior probabilities from WP
    if not prior_prob_path.exists():
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        logger.info("STEP 2: writing prior probabilities to {}".format(prior_prob_path))
        wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit)
    logger.info("STEP 2: reading prior probabilities from {}".format(prior_prob_path))

    # STEP 3: deduce entity frequencies from WP (takes only a few minutes)
    logger.info("STEP 3: calculating entity frequencies")
    wp.write_entity_counts(prior_prob_path, entity_freq_path, to_print=False)

    # STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
    message = " and descriptions" if not descriptions_from_wikipedia else ""
    if (not entity_defs_path.exists()) or (not descriptions_from_wikipedia and not entity_descr_path.exists()):
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        logger.info("STEP 4: parsing wikidata for entity definitions" + message)
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wd_json,
            limit,
            to_print=False,
            lang=lang,
            parse_descriptions=(not descriptions_from_wikipedia),
        )
        wd.write_entity_files(entity_defs_path, title_to_id)
        if not descriptions_from_wikipedia:
            wd.write_entity_description_files(entity_descr_path, id_to_descr)
    logger.info("STEP 4: read entity definitions" + message)

    # STEP 5: Getting gold entities from wikipedia
    message = " and descriptions" if descriptions_from_wikipedia else ""
    if (not training_entities_path.exists()) or (descriptions_from_wikipedia and not entity_descr_path.exists()):
        logger.info("STEP 5: parsing wikipedia for gold entities" + message)
        training_set_creator.create_training_examples_and_descriptions(
            wp_xml,
            entity_defs_path,
            entity_descr_path,
            training_entities_path,
            parse_descriptions=descriptions_from_wikipedia,
            limit=limit,
        )
    logger.info("STEP 5: read gold entities" + message)

    # STEP 6: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    logger.info("STEP 6: creating the KB at {}".format(kb_path))
    kb = kb_creator.create_kb(
        nlp=nlp,
        max_entities_per_alias=max_per_alias,
        min_entity_freq=min_freq,
        min_occ=min_pair,
        entity_def_input=entity_defs_path,
        entity_descr_path=entity_descr_path,
        count_input=entity_freq_path,
        prior_prob_input=prior_prob_path,
        entity_vector_length=entity_vector_length,
    )
    kb.dump(kb_path)
    nlp.to_disk(output_dir / KB_MODEL_DIR)

    logger.info("Done!")
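# A minimal command-line wrapper sketch for the main() above, using only the
# standard library; the flag names and the helper name _cli are illustrative,
# not the script's actual CLI.
def _cli():
    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Create a KB from Wikipedia/Wikidata dumps")
    parser.add_argument("wd_json", type=Path, help="path to the Wikidata JSON dump")
    parser.add_argument("wp_xml", type=Path, help="path to the Wikipedia XML dump")
    parser.add_argument("output_dir", type=Path, help="directory for all generated files")
    parser.add_argument("model", help="name of a spaCy model with pretrained word vectors")
    parser.add_argument("--limit", type=int, default=None, help="only read this many dump lines")
    args = parser.parse_args()

    main(args.wd_json, args.wp_xml, args.output_dir, args.model, limit=args.limit)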
def main(
    wd_json,
    wp_xml,
    output_dir,
    model,
    max_per_alias=10,
    min_freq=20,
    min_pair=5,
    entity_vector_length=64,
    loc_prior_prob=None,
    loc_entity_defs=None,
    loc_entity_desc=None,
    limit=None,
):
    print(now(), "Creating KB with Wikipedia and WikiData")
    print()

    if limit is not None:
        print("Warning: reading only", limit, "lines of Wikipedia/Wikidata dumps.")

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: create the NLP object
    print(now(), "STEP 1: loading model", model)
    nlp = spacy.load(model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    # STEP 2: create prior probabilities from WP
    print()
    if loc_prior_prob:
        print(now(), "STEP 2: reading prior probabilities from", loc_prior_prob)
    else:
        # It takes about 2h to process 1000M lines of Wikipedia XML dump
        loc_prior_prob = output_dir / "prior_prob.csv"
        print(now(), "STEP 2: writing prior probabilities at", loc_prior_prob)
        wp.read_prior_probs(wp_xml, loc_prior_prob, limit=limit)

    # STEP 3: deduce entity frequencies from WP (takes only a few minutes)
    print()
    print(now(), "STEP 3: calculating entity frequencies")
    loc_entity_freq = output_dir / "entity_freq.csv"
    wp.write_entity_counts(loc_prior_prob, loc_entity_freq, to_print=False)

    loc_kb = output_dir / "kb"

    # STEP 4: reading entity descriptions and definitions from WikiData or from file
    print()
    if loc_entity_defs and loc_entity_desc:
        read_raw = False
        print(now(), "STEP 4a: reading entity definitions from", loc_entity_defs)
        print(now(), "STEP 4b: reading entity descriptions from", loc_entity_desc)
    else:
        # It takes about 10h to process 55M lines of Wikidata JSON dump
        read_raw = True
        loc_entity_defs = output_dir / "entity_defs.csv"
        loc_entity_desc = output_dir / "entity_descriptions.csv"
        print(now(), "STEP 4: parsing wikidata for entity definitions and descriptions")

    # STEP 5: creating the actual KB
    # It takes ca. 30 minutes to pretrain the entity embeddings
    print()
    print(now(), "STEP 5: creating the KB at", loc_kb)
    kb = kb_creator.create_kb(
        nlp=nlp,
        max_entities_per_alias=max_per_alias,
        min_entity_freq=min_freq,
        min_occ=min_pair,
        entity_def_output=loc_entity_defs,
        entity_descr_output=loc_entity_desc,
        count_input=loc_entity_freq,
        prior_prob_input=loc_prior_prob,
        wikidata_input=wd_json,
        entity_vector_length=entity_vector_length,
        limit=limit,
        read_raw_data=read_raw,
    )
    if read_raw:
        print(" - wrote entity definitions to", loc_entity_defs)
        print(" - wrote entity descriptions to", loc_entity_desc)

    kb.dump(loc_kb)
    nlp.to_disk(output_dir / "nlp")

    print()
    print(now(), "Done!")
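# A minimal sketch of reading the dumped artifacts back in, mirroring the
# KnowledgeBase API already used in run_pipeline() above (load_bulk). The helper
# name is illustrative, and entity_vector_length must match the value used when
# the KB was created.
def load_kb(output_dir, entity_vector_length=64):
    import spacy
    from spacy.kb import KnowledgeBase

    # main() above writes the NLP model to output_dir / "nlp" and the KB to output_dir / "kb"
    nlp = spacy.load(output_dir / "nlp")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
    kb.load_bulk(output_dir / "kb")
    return nlp, kb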