def main(
    dir_kb,
    output_dir=None,
    loc_training=None,
    epochs=10,
    dropout=0.5,
    lr=0.005,
    l2=1e-6,
    train_articles=None,
    dev_articles=None,
    labels_discard=None,
):
    """Train a spaCy `entity_linker` pipe on a pre-built Wikipedia/WikiData KB and training file."""
    if not output_dir:
        logger.warning(
            "No output dir specified so no results will be written, are you sure about this?"
        )

    logger.info("Creating Entity Linker with Wikipedia and WikiData")

    output_dir = Path(output_dir) if output_dir else dir_kb
    training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
    nlp_dir = dir_kb / KB_MODEL_DIR
    kb_path = dir_kb / KB_FILE
    nlp_output_dir = output_dir / OUTPUT_MODEL_DIR

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: load the NLP object
    logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
    nlp = spacy.load(nlp_dir)
    logger.info(
        "Original NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)
    )

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError("The `nlp` object should have a pretrained `ner` component.")

    logger.info("STEP 1b: Loading KB from {}".format(kb_path))
    kb = read_kb(nlp, kb_path)

    # STEP 2: read the training dataset previously created from WP
    logger.info("STEP 2: Reading training & dev dataset from {}".format(training_path))
    train_indices, dev_indices = wikipedia_processor.read_training_indices(training_path)
    logger.info(
        "Training set has {} articles, limit set to roughly {} articles per epoch".format(
            len(train_indices), train_articles if train_articles else "all"
        )
    )
    logger.info(
        "Dev set has {} articles, limit set to roughly {} articles for evaluation".format(
            len(dev_indices), dev_articles if dev_articles else "all"
        )
    )
    if dev_articles:
        dev_indices = dev_indices[0:dev_articles]

    # STEP 3: create and train an entity linking pipe
    logger.info(
        "STEP 3: Creating and training an Entity Linking pipe for {} epochs".format(epochs)
    )
    if labels_discard:
        labels_discard = [x.strip() for x in labels_discard.split(",")]
        logger.info(
            "Discarding {} NER types: {}".format(len(labels_discard), labels_discard)
        )
    else:
        labels_discard = []

    el_pipe = nlp.create_pipe(
        name="entity_linker",
        config={
            "pretrained_vectors": nlp.vocab.vectors.name,
            "labels_discard": labels_discard,
        },
    )
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    logger.info("Dev Baseline Accuracies:")
    dev_data = wikipedia_processor.read_el_docs_golds(
        nlp=nlp,
        entity_file_path=training_path,
        dev=True,
        line_ids=dev_indices,
        kb=kb,
        labels_discard=labels_discard,
    )
    measure_performance(
        dev_data, kb, el_pipe, baseline=True, context=False, dev_limit=len(dev_indices)
    )

    for itn in range(epochs):
        random.shuffle(train_indices)
        losses = {}
        batches = minibatch(train_indices, size=compounding(8.0, 128.0, 1.001))
        batchnr = 0
        articles_processed = 0

        # we either process the whole training file, or just a part each epoch
        bar_total = len(train_indices)
        if train_articles:
            bar_total = train_articles

        with tqdm(total=bar_total, leave=False, desc="Epoch " + str(itn)) as pbar:
            for batch in batches:
                if not train_articles or articles_processed < train_articles:
                    with nlp.disable_pipes("entity_linker"):
                        train_batch = wikipedia_processor.read_el_docs_golds(
                            nlp=nlp,
                            entity_file_path=training_path,
                            dev=False,
                            line_ids=batch,
                            kb=kb,
                            labels_discard=labels_discard,
                        )
                        docs, golds = zip(*train_batch)
                    try:
                        with nlp.disable_pipes(*other_pipes):
                            nlp.update(
                                docs=docs,
                                golds=golds,
                                sgd=optimizer,
                                drop=dropout,
                                losses=losses,
                            )
                        batchnr += 1
                        articles_processed += len(docs)
                        pbar.update(len(docs))
                    except Exception as e:
                        logger.error("Error updating batch: " + str(e))

        if batchnr > 0:
            logger.info(
                "Epoch {} trained on {} articles, train loss {}".format(
                    itn, articles_processed, round(losses["entity_linker"] / batchnr, 2)
                )
            )
            # re-read the dev_data (data is returned as a generator)
            dev_data = wikipedia_processor.read_el_docs_golds(
                nlp=nlp,
                entity_file_path=training_path,
                dev=True,
                line_ids=dev_indices,
                kb=kb,
                labels_discard=labels_discard,
            )
            measure_performance(
                dev_data, kb, el_pipe, baseline=False, context=True, dev_limit=len(dev_indices)
            )

    if output_dir:
        # STEP 4: write the NLP pipeline (now including an EL model) to file
        logger.info(
            "Final NLP pipeline has following pipeline components: {}".format(nlp.pipe_names)
        )
        logger.info("STEP 4: Writing trained NLP to {}".format(nlp_output_dir))
        nlp.to_disk(nlp_output_dir)

    logger.info("Done!")
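
# --- Hypothetical CLI entry point (not part of the original script) ---
# A minimal sketch of how main() above could be driven from the command line.
# It assumes the module-level imports, constants (TRAINING_DATA_FILE, KB_FILE,
# KB_MODEL_DIR, OUTPUT_MODEL_DIR) and helpers (read_kb, wikipedia_processor,
# measure_performance) are defined elsewhere in this project; the real script
# may use a different argument parser.
if __name__ == "__main__":
    import argparse
    import logging
    from pathlib import Path

    parser = argparse.ArgumentParser(
        description="Train a spaCy entity_linker pipe on a Wikipedia/WikiData KB"
    )
    parser.add_argument("dir_kb", type=Path, help="directory with the KB and the base NLP model")
    parser.add_argument("--output_dir", type=Path, default=None, help="where to write the trained pipeline")
    parser.add_argument("--loc_training", type=Path, default=None, help="location of the training data file")
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--l2", type=float, default=1e-6)
    parser.add_argument("--train_articles", type=int, default=None, help="max articles to train on per epoch")
    parser.add_argument("--dev_articles", type=int, default=None, help="max articles to evaluate on")
    parser.add_argument("--labels_discard", type=str, default=None, help="comma-separated NER labels to ignore")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    main(**vars(args))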