def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = None,
              model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):

    util.fix_random_seed(util.env_opt("seed", 0))

    train_data = read_ner_from_tsv(train_data_path)
    dev_data = read_ner_from_tsv(dev_data_path)
    test_data = read_ner_from_tsv(test_data_path)

    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides)
def train_ner(output_dir: str,
              data_path: str,
              run_test: bool = None,
              model: str = None,
              n_iter: int = 100,
              label_granularity: int = None):

    if label_granularity is not None:
        umls_tree = construct_umls_tree_from_tsv("data/umls_semantic_type_tree.tsv")
        label_mapping = umls_tree.get_collapsed_type_id_map_at_level(label_granularity)
        # Granularity 0 collapses all types, so only untyped spans are predicted.
        # (The original code left `span_only` unset for granularities > 0.)
        span_only = label_granularity == 0
    else:
        label_mapping = None
        span_only = False

    train_data, dev_data, test_data = read_full_med_mentions(data_path, label_mapping, span_only)

    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, output_dir, n_iter)
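# A hypothetical invocation of the MedMentions variant above; the directory
# layout and output path are assumptions, and "en_core_sci_sm" is simply the
# base model referenced elsewhere in these scripts.
#
# train_ner(
#     output_dir="ner_output/med_mentions",
#     data_path="data/med_mentions/",   # folder read by read_full_med_mentions
#     model="en_core_sci_sm",
#     n_iter=100,
#     label_granularity=0,              # 0 collapses all types -> span-only NER
# )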
def main(model_path: str,
         dataset: str,
         output_path: str,
         code: Optional[str],
         med_mentions_folder_path: Optional[str]):

    if code is not None:
        # need to import code before loading a spacy model;
        # the module name is arbitrary, `code` is the path to the file to import
        spec = importlib.util.spec_from_file_location("custom_code", str(code))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

    nlp = spacy.load(model_path)

    if dataset.startswith("medmentions"):
        train_data, dev_data, test_data = read_full_med_mentions(med_mentions_folder_path, None, False)
        data_split = dataset.split("-")[1]
        if data_split == "train":
            data = train_data
        elif data_split == "dev":
            data = dev_data
        elif data_split == "test":
            data = test_data
        else:
            raise Exception(f"Unrecognized split {data_split}")
    else:
        data = read_ner_from_tsv(dataset)

    evaluate_ner(nlp, data, dump_path=output_path)
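# A sketch of the dataset-naming convention implied by `main` above: a dataset
# string starting with "medmentions" selects a MedMentions split by its suffix,
# while anything else is treated as a path handed to read_ner_from_tsv. The
# concrete paths below are hypothetical.
#
# main("ner_output/best", "medmentions-test", "test_metrics.json",
#      code=None, med_mentions_folder_path="data/med_mentions/")
# main("ner_output/best", "data/my_ner_corpus/test.tsv", "test_metrics.json",
#      code=None, med_mentions_folder_path=None)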
def train_ner(output_dir: str,
              train_data_path: str,
              dev_data_path: str,
              test_data_path: str,
              run_test: bool = None,
              model: str = None,
              n_iter: int = 10,
              meta_overrides: str = None):

    # NOTE: hard-coded base model; this overrides whatever is passed in `model`.
    model = "en_core_sci_sm"
    util.fix_random_seed(util.env_opt("seed", 0))

    # This variant loads pickled training examples rather than the TSV format
    # used by the other train_ner variants (see the commented-out lines below).
    with Path(train_data_path).open('rb') as file:
        train_data = pickle.load(file)
    with Path(dev_data_path).open('rb') as file:
        dev_data = pickle.load(file)
    with Path(test_data_path).open('rb') as file:
        test_data = pickle.load(file)
    # train_data = read_ner_from_tsv(train_data_path)
    # dev_data = read_ner_from_tsv(dev_data_path)
    # test_data = read_ner_from_tsv(test_data_path)

    os.makedirs(output_dir, exist_ok=True)
    if run_test:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
        evaluate_ner(nlp, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
        evaluate_ner(nlp, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
    else:
        train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides)
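# Based on how `train` below consumes the data, each pickled file is assumed to
# hold a list of spaCy v2 style training examples: (text, {"entities": [(start,
# end, label), ...]}) with character offsets. A purely illustrative example,
# with made-up labels:
#
# example = (
#     "Ibuprofen reduces inflammation",
#     {"entities": [(0, 9, "CHEMICAL"), (18, 30, "DISEASE")]},
# )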
def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save this epoch's model to the output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)
        # switch back to whitespace tokenization for the next training epoch
        nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # copy the best epoch's model to output_dir/best
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the best model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)
    evaluate_ner(nlp2, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
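# The training and evaluation code above relies on a WhitespaceTokenizer helper
# that is not defined in this section. Below is a minimal sketch compatible with
# that usage, following spaCy v2's documented custom-tokenizer pattern; the
# project's actual implementation may differ (for instance in how runs of
# whitespace are handled).
from spacy.tokens import Doc


class WhitespaceTokenizer:
    """Tokenize on single spaces and return a spaCy Doc (assumed behaviour)."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # every token is assumed to be followed by exactly one space
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)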