def main(use_gpu=False, nb_epoch=100):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum([len(list(sents)) for sents in train_X])
    print("%d sentences" % n_sent)
    model = build_model(2, width=128, conv_depth=2, depth=2,
                        train_X=train_X, train_y=train_y)
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        epoch_loss = [0.0]
        epoch_var = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], epoch_var[-1],
                      model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)
            epoch_var.append(0.0)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        # optimizer.alpha = 0.1
        # optimizer.max_grad_norm = 10.0
        # optimizer.b1 = 0.0
        # optimizer.b2 = 0.0
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh - y) ** 2.0).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh - y) / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
    with model.use_params(optimizer.averages):
        print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))

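# The compounding() schedule from thinc (re-exported by spacy.util) drives the
# growing batch sizes used throughout these scripts. A minimal sketch of its
# behaviour, assuming the thinc v7 semantics (multiply by `compound` each step,
# clipped at `stop`); this is an illustration, not the library source:
def compounding_sketch(start, stop, compound):
    """Yield an infinite series of compounding values, clipped at `stop`."""
    curr = float(start)
    while True:
        yield min(curr, stop) if start <= stop else max(curr, stop)
        curr *= compound

# Example: compounding(64, 64, 1.01) above is effectively a constant 64, since
# start == stop; compounding(4.0, 32.0, 1.001) grows gradually from 4 toward 32.
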
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)

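# For reference, the TRAIN_DATA consumed by the two parser-training scripts
# above follows spaCy v2's (text, annotations) format, with per-token head
# indices and dependency labels. A minimal hand-written sketch (the sentence
# and labels are illustrative, not from the original data):
TRAIN_DATA_PARSER_EXAMPLE = [
    (
        "find a cafe with great wifi",
        {
            "heads": [0, 2, 0, 2, 5, 3],  # index of each token's head token
            "deps": ["ROOT", "det", "dobj", "prep", "amod", "pobj"],
        },
    ),
]
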
def main(model_name, unlabelled_loc):
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.alpha = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=batch_size)
            for batch in minibatch(TRAIN_DATA, size=sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe('ner').model.unseen_classes)

    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and
    train the entity linker. The `vocab` should be the one used during
    creation of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for
    # higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the
    # example training data. Note that in a realistic application, an actual
    # NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the
    # training examples. Also ensure that the annotated examples correspond to
    # known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)

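# The "links" annotation format expected by the entity-linker script above maps
# a (start_char, end_char) offset onto candidate KB identifiers with gold
# probabilities. A minimal illustrative example; the QIDs here are assumptions
# standing in for whatever identifiers the KB actually contains:
TRAIN_DATA_EL_EXAMPLE = [
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]
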
def __train(self, level: int, n_iter=40):
    label = self.label(level)
    prepared_data = self.training_data.prepare_data(level, label)
    model_path = self.model_path(level)
    # Set up model path, verify access
    model_path.mkdir(parents=True, exist_ok=True)
    random.seed(0)
    nlp = spacy.blank('en')
    self.__console.info('Created blank model')
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')
    ner.add_label(label)
    optimizer = nlp.begin_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER. Note: combining the context managers with `and` would
    # enter only the second one, so they're chained with a comma instead.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        with self.__console.timed(f'Training model {label}', 'Trained model in {0:.3f}s'):
            for itn in range(n_iter):
                random.shuffle(prepared_data)
                batches = minibatch(prepared_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
                self.__console.info("Losses", losses)

    # test the trained model
    test_text = random.choice(self.training_data.questions)
    doc = nlp(test_text)
    self.__console.info("Entities in '%s'" % test_text)
    for ent in doc.ents:
        self.__console.info(ent.label_, ent.text)

    # save model to output directory
    nlp.meta["name"] = label
    nlp.to_disk(model_path)
    self.__console.okay("Saved model to", model_path)

    # test the saved model
    self.__console.info("Loading from", model_path)
    nlp2 = spacy.load(model_path)
    # Check the classes have loaded back consistently
    assert nlp2.get_pipe("ner").move_names == move_names
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        self.__console.info(ent.label_, ent.text)
    self.model_qsize_path(level).write_text(str(len(self.training_data.questions)))
    return nlp

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
n_iter = 40
with nlp.disable_pipes(*other_pipes):  # only train NER
    nlp.begin_training()
    for itn in range(n_iter):
        print("Iteration:", itn)
        random.shuffle(TR_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TR_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
        print("Losses", losses)

def chunk_courses(
    model=None, new_model_name="chunk_course_info_model", output_dir=None, n_iter=30
):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    # TODO: pull the data from the DB instead of a file eventually
    TRAIN_DATA, LABELS = get_data()

    # Train a series of models
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    # Add labels
    for l in LABELS:
        ner.add_label(l)  # add new entity label to entity recognizer

    # Begin/resume training
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    test_model(nlp)  # TEST <------

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

def main(model=None, new_model_name="animal", output_dir=None, n_iter=10): """Set up the pipeline and entity recognizer, and train the new entity.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner) # otherwise, get it, so we can add labels to it else: ner = nlp.get_pipe("ner") ner.add_label(LABEL) # add new entity label to entity recognizer if model is None: optimizer = nlp.begin_training() else: # Note that 'begin_training' initializes the models, so it'll zero out # existing entity types. optimizer = nlp.entity.create_optimizer() # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) print("Losses", losses) # test the trained model test_text = "Do you like horses?" doc = nlp(test_text) print("Entities in '%s'" % test_text) for ent in doc.ents: print(ent.label_, ent.text) # save model to output directory if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.meta["name"] = new_model_name # rename model nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc2 = nlp2(test_text) for ent in doc2.ents: print(ent.label_, ent.text)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add labels to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)

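# The textcat script above relies on a load_data() helper that isn't shown.
# A sketch of what it plausibly looks like, assuming thinc v7's bundled IMDB
# loader and the (texts, cats) split the evaluation code expects; details such
# as the shuffle and limit handling are assumptions:
import random
import thinc.extra.datasets

def load_data(limit=0, split=0.8):
    """Load IMDB and return (train_texts, train_cats), (dev_texts, dev_cats)."""
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
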
print("Using {} examples ({} training, {} evaluation)".format( n_texts, len(train_texts), len(dev_texts))) train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats])) #training # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() print("Training the model...") print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) for i in range(n_iter): t1 = time.time() losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(train_data, size=compounding(4., 32., 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) t2 = time.time() time_taken = t2 - t1 print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'.format( losses['textcat'], scores['textcat_p'], scores['textcat_r'], scores['textcat_f'], time_taken))
def train_spacy_model(train_data, test_data, model, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = model
    ent_types = []
    for _, e in train_data:
        ee = [ent[2] for ent in e['entities']]
        ent_types += ee

    # Sanity-check that the entity offsets align to token boundaries;
    # tags containing "-" indicate misaligned spans.
    for text, ent in train_data:
        doc = nlp(text)
        entities = ent['entities']
        tags = biluo_tags_from_offsets(doc, entities)
        # if "-" in tags:
        #     print(text)
        #     print(entities, tags)
        #     for t in doc:
        #         print(t, tags[t.i])
        #     print("\n\n\n")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        # if model is None:
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"{itn}:")
            print("\tLosses", losses)
            score = evaluate(nlp, test_data)
            if not os.path.isdir("models"):
                os.mkdir("models")
            nlp.to_disk(os.path.join("models", f"model_{itn}"))
            print("\t", score)

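# Standalone illustration of the alignment check performed above: character
# offsets that don't land on token boundaries come back as "-" tags. The text,
# offsets, and labels here are made up for the demo:
import spacy
from spacy.gold import biluo_tags_from_offsets  # spaCy v2 location of this helper

_nlp_check = spacy.blank("en")
_doc = _nlp_check("Jane Doe joined Acme Corp in 2019.")
_tags = biluo_tags_from_offsets(_doc, [(0, 8, "PERSON"), (16, 25, "ORG")])
assert "-" not in _tags, "entity span does not align to token boundaries"
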
# Note that 'begin_training' initializes the models, so it'll zero out
# existing entity types.
optimizer = nlp.entity.create_optimizer()

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print('Losses', losses)

# ner.add_label(LABEL)  # add new entity label to entity recognizer
if model is None:
    optimizer = nlp.begin_training()
else:
    # Note that 'begin_training' initializes the models, so it'll zero out
    # existing entity types.
    optimizer = nlp.entity.create_optimizer()

def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = read_data(
        nlp,
        paths.train.conllu.open(),
        paths.train.text.open(),
        max_doc_length=config.max_doc_length,
        limit=limit,
    )

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        docs, golds = read_data(
            nlp,
            paths.train.conllu.open(),
            paths.train.text.open(),
            max_doc_length=config.max_doc_length,
            limit=limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            if use_oracle_segments:
                parsed_docs, scores = evaluate(nlp, paths.dev.conllu, paths.dev.conllu, out_path)
            else:
                parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
        print_progress(i, losses, scores)

def main(model=None, output_dir=None, n_iter=1000):
    """Load the model, set up the pipeline and train the entity recognizer."""
    # Create blank Language class
    nlp = spacy.blank("en")

    # Create the built-in pipeline components and add them to the pipeline,
    # keeping a reference so we can add labels
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

    # Add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # TEST THE TRAINED MODEL
    print('\nSo what did we detect?:')
    # Manual test: try text that won't appear in the training data!
    doc = nlp(
        "I like snakes and other animals. But python is my favourite animal. "
        "Besides, I like to work with the python programming language."
    )
    # Should mark the first 'python' as ANIMAL and the second as P-LANG
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

    # auto - run on the texts from our training data
    # for text, _ in TRAIN_DATA:
    #     doc = nlp(text)
    #     print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    #     # print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

def _train_one_round(self, i: int) -> None:
    # number of iterations; could be made customizable, but that feels too messy
    n_iter = 100
    # train model and save to self.nlp
    self.anns_this_round = i * self.anns_per_point
    if self.verbose:
        print("Training on %s annotations" % (self.anns_this_round))
    count = 0
    train_data = []
    for line in self.train_file:
        train_data.append(line)
        count += 1
        if count >= self.anns_this_round:
            break

    # Set up the pipeline and entity recognizer, and train the new entity.
    random.seed(0)
    self.nlp = spacy.blank("en")  # create blank Language class
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in self.nlp.pipe_names:
        ner = self.nlp.create_pipe("ner")
        self.nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = self.nlp.get_pipe("ner")

    for label in self.label:
        ner.add_label(label)  # add new entity label to entity recognizer

    optimizer = self.nlp.begin_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER. Note: combining the context managers with `and` would
    # enter only the second one, so they're chained with a comma instead.
    with self.nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(train_data)
            # Need some oversampling somewhere in here
            batches = minibatch(train_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.nlp.update(texts, annotations, sgd=optimizer,
                                drop=0.35, losses=losses)
            # print("Losses", losses)

    output_dir = Path("Baby")
    if not output_dir.exists():
        output_dir.mkdir()
    self.nlp.meta["name"] = "BabyModel"  # rename model
    self.nlp.to_disk(output_dir)

def main(model=None, new_model_name='new_model',
         output_dir='F://ThesisProject//data//ann//', n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entities."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)  # Add new entity labels to entity recognizer
        print(i)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = (
        "A new harvesting system and the fibre properties of reed canary grass "
        "(Phalaris arundinacea L.) makes this grass an interesting new raw material "
        "source for the pulp and paper industry in the Nordic countries. Pilot scale "
        "tests in Finland shows that high quality fine paper can successfully be "
        "produced from delayed harvested reed canary grass. Birch pulp can be "
        "replaced with reed canary grass pulp in fine paper furnish without any "
        "significant differences in the functional properties of paper."
    )
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("sv")  # create blank Language class
        print("Created blank 'sv' model")

    if model is None:
        # load fastText vectors for Swedish into the blank model's vocab
        with open('cc.sv.300.vec', 'rb') as file_:
            header = file_.readline()
            nr_row, nr_dim = header.split()
            nlp.vocab.reset_vectors(width=int(nr_dim))
            for line in file_:
                line = line.rstrip().decode('utf8')
                pieces = line.rsplit(' ', int(nr_dim))
                word = pieces[0]
                vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
                nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            print("\nStarting iteration " + str(itn + 1))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"Loss: {losses['ner']}")
            print('------- Evaluation on TRAIN -------')
            print(evaluate(nlp, TRAIN_DATA))
            print('\n------- Evaluation on TEST -------')
            print(evaluate(nlp, TEST_DATA))

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        print(evaluate(nlp2, TEST_DATA))

def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)

def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add labels to text classifier
    textcat.add_label('Neutral')
    textcat.add_label('Bullish')
    textcat.add_label('Bearish')

    # load the tweets dataset
    print("Loading tweets data...")
    # (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data_2(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts * 2, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(2., 8., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                try:
                    scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
                    print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                          .format(losses['textcat'], scores['textcat_p'],
                                  scores['textcat_r'], scores['textcat_f']))
                except Exception as e:
                    print(e)

    # test the trained model
    test_text = "#aapl buy for 250m the market!!!"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is None:
        output_dir = Path('tweetsClassifier/spacyTrainingModel')
    if not output_dir.exists():
        output_dir.mkdir()
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    test_text = "long #aapl for 250m the market!!!"
    test_text2 = "#aapl lead the market!"
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    print(test_text, doc2.cats)

for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

n_iter = 100
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.55,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print('Losses', losses)

# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    # print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

def _set_params(self, kwargs):
    """
    Set input parameters based on the request.
    For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
    """

    # Set default values which will be used if execution arguments are not passed

    # Default parameters:
    self.debug = False
    self.model = 'en_core_web_sm'
    self.custom = False
    self.base_model = 'en_core_web_sm'
    self.blank = False
    self.epochs = 100
    self.batch_size = compounding(4.0, 32.0, 1.001)
    self.drop = 0.25
    self.test = 0

    # Extract the model path if required
    try:
        # Get the model name from the first row in the request_df
        self.model = self.request_df.loc[0, 'model_name']
        # Remove the model_name column from the request_df
        self.request_df = self.request_df.drop(['model_name'], axis=1)
    except KeyError:
        pass

    # If keyword arguments were included in the request, get the parameters and values
    if len(kwargs) > 0:
        # Transform the string of arguments into a dictionary
        self.kwargs = utils.get_kwargs(kwargs)

        # Set the debug option for generating execution logs
        # Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = 'true' == self.kwargs['debug'].lower()
            # Additional information is printed to the terminal and logs if the parameter debug = true
            if self.debug:
                # Increment the log counter for the class. Each instance of the class generates a new log.
                self.__class__.log_no += 1
                # Create a log file for the instance
                # Logs will be stored in ..\logs\SpaCy Log <n>.txt
                self.logfile = os.path.join(os.getcwd(), 'logs',
                                            'SpaCy Log {}.txt'.format(self.log_no))
                self._print_log(1)

        # Set whether the model (if getting named entities) or base model (if retraining)
        # is a custom model, i.e. not one of the pre-trained models provided by spaCy
        if 'custom' in self.kwargs:
            self.custom = 'true' == self.kwargs['custom'].lower()

        # Set the base model, i.e. an existing spaCy model to be retrained
        if 'base_model' in self.kwargs:
            self.base_model = self.kwargs['base_model'].lower()

        # Set the retraining to be done on a blank Language class
        if 'blank' in self.kwargs:
            self.blank = 'true' == self.kwargs['blank'].lower()

        # Set the epochs for training the model.
        # This is the number of times that the learning algorithm will work through
        # the entire training dataset.
        # Valid values are an integer, e.g. 200
        if 'epochs' in self.kwargs:
            self.epochs = utils.atoi(self.kwargs['epochs'])

        # Set the batch size to be used during model training.
        # The model's internal parameters will be updated at the end of each batch.
        # Valid values are a single integer or compounding or decaying parameters.
        if 'batch_size' in self.kwargs:
            # The batch size may be a single integer
            try:
                self.batch_size = utils.atoi(self.kwargs['batch_size'])
            # Or a list of floats
            except ValueError:
                sizes = utils.get_kwargs_by_type(self.kwargs['batch_size'])
                # If start < end, batch sizes will be compounded
                if sizes[0] < sizes[1]:
                    self.batch_size = compounding(sizes[0], sizes[1], sizes[2])
                # else batch sizes will decay during training
                else:
                    self.batch_size = decaying(sizes[0], sizes[1], sizes[2])

        # Set the dropout rate for retraining the model.
        # This determines the likelihood that a feature or internal representation
        # in the model will be dropped, making it harder for the model to memorize
        # the training data.
        # Valid values are a float less than 1.0, e.g. 0.35
        if 'drop' in self.kwargs:
            self.drop = utils.atof(self.kwargs['drop'])

        # Set the ratio of data to be used for testing.
        # This data will be held out from training and just used to provide
        # evaluation metrics.
        # Valid values are a float >= zero and < 1.0, e.g. 0.3
        if 'test' in self.kwargs:
            self.test = utils.atof(self.kwargs['test'])

    # Debug information is printed to the terminal and logs if the parameter debug = true
    if self.debug:
        self._print_log(2)

    # Remove the kwargs column from the request_df
    self.request_df = self.request_df.drop(['kwargs'], axis=1)

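# Behaviour of the two schedule generators configured above, as used with
# spaCy v2 (both are re-exported from thinc via spacy.util). The printed
# values are indicative; decaying() is assumed here to step linearly down to
# its floor, which matches how it's used for dropout in these scripts:
from spacy.util import compounding, decaying

batch_sizes = compounding(4.0, 32.0, 1.001)  # 4.0, 4.004, 4.008, ... capped at 32.0
print([int(next(batch_sizes)) for _ in range(3)])  # growth is gradual: [4, 4, 4]
dropouts = decaying(0.6, 0.2, 1e-4)  # 0.6, 0.5999, ... floored at 0.2
print(round(next(dropouts), 4))
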
def train_spacy_ner(training_data, model=None, n_iter=20, debug=False):
    """Example of training spaCy's named entity recognizer, starting off with
    an existing model or a blank model.

    For more details, see the documentation:
    * Training: https://spacy.io/usage/training
    * NER: https://spacy.io/usage/linguistic-features#named-entities

    Compatible with: spaCy v2.0.0+
    Last tested with: v2.1.0

    Load the model, set up the pipeline and train the entity recognizer.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        # gather up docs that throw exceptions (only in debug mode)
        baddocs = set()
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # batch up the examples using spaCy's minibatch;
            # single-example batches in debug mode so bad docs can be identified
            singlebatch = 1
            compoundbatch = compounding(4.0, 32.0, 1.001)
            batchsize = singlebatch if debug else compoundbatch
            batches = minibatch(training_data, size=batchsize)
            for batch in batches:
                texts, annotations = zip(*batch)
                try:
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=0.5,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                except Exception:
                    if debug:
                        print("Exception thrown when processing doc:")
                        print(texts, annotations)
                        baddocs.add(batch[0][0])
                    continue
            print("Losses", losses)
    return nlp, baddocs

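# Hypothetical usage of the helper above; the single training example follows
# spaCy v2's NER offset format and is invented purely for illustration:
SAMPLE_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
]
nlp_trained, bad_docs = train_spacy_ner(SAMPLE_DATA, model=None, n_iter=5, debug=True)
print("Docs that raised exceptions:", bad_docs)
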
def main(model=str(get_newest_model()), new_model_name=LABEL.lower(),
         output_dir='custom-models', n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)

    # test the trained model with test_text provided by a secondary script exec
    # argument, defaulting to a very basic test
    test_text = "This is " + new_model_name.replace('_', ' ').title() + "."
    if len(sys.argv) >= 3 and sys.argv[2] != "":
        test_text = sys.argv[2]
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        millis = int(round(time.time() * 1000))
        output_dir = Path(output_dir + '/' + str(millis) + "-" + new_model_name)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

def main(model=None, new_model_name='pineapple', output_dir='./pineapple_1_0', n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    # if model is not None:
    #     nlp = spacy.load('en')
    #     print("Loaded model '%s'" % model)
    # else:
    #     nlp = spacy.blank('en')  # create blank Language class
    #     print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    nlp = spacy.load('en_core_web_sm')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    ner.add_label(LABEL_2)
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)

    test_text = ('Office E&T 420 Phone (323) 343-669 Address 5151 State University Drive, '
                 'Los Angeles, CA 90032 Web cs.calstatela.edu or www.calstatela.edu/cs')
    doc = nlp(test_text)
    # print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

def main(path, model=None, new_model_name='new_model', output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entities."""
    with open(path, 'rb') as fp:
        train_data = pickle.load(fp)
    trim_entity_spans(train_data)
    train_data = [fix(el) for el in train_data]

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('xx')  # create blank multi-language class
        print("Created blank 'xx' model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    # Add new entity labels to entity recognizer
    # for i in LABEL:
    #     ner.add_label(i)
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                texts = [str.strip(text) for text in texts]
                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
                except Exception as e:
                    print(e)
                # early exit once the NER loss drops low enough; guard against
                # the key being missing if every update so far failed
                if losses.get('ner', float('inf')) < 20:
                    break
            print('Losses', losses)

    # Test the trained model (Serbian: "The Republic of Serbia renounces Kosovo
    # on 22 May 2019, and the capital of the state is in Novi Sad.")
    test_text = ('Republika Srbija se odriće Kosova na dan 22. maja 2019. godine, '
                 'i prestonica drzave je u Novi Sad.')
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

def main(dir_kb, output_dir=None, loc_training=None, epochs=10, dropout=0.5,
         lr=0.005, l2=1e-6, train_inst=None, dev_inst=None, labels_discard=None):
    logger.info("Creating Entity Linker with Wikipedia and WikiData")

    output_dir = Path(output_dir) if output_dir else dir_kb
    training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
    nlp_dir = dir_kb / KB_MODEL_DIR
    kb_path = dir_kb / KB_FILE
    nlp_output_dir = output_dir / OUTPUT_MODEL_DIR

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir()

    # STEP 1: load the NLP object
    logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
    nlp = spacy.load(nlp_dir)
    logger.info("STEP 1b: Loading KB from {}".format(kb_path))
    kb = read_kb(nlp, kb_path)

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError("The `nlp` object should have a pretrained `ner` component.")

    # STEP 2: read the training dataset previously created from WP
    logger.info("STEP 2: Reading training dataset from {}".format(training_path))
    if labels_discard:
        labels_discard = [x.strip() for x in labels_discard.split(",")]
        logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))

    train_data = wikipedia_processor.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=False,
        limit=train_inst,
        kb=kb,
        labels_discard=labels_discard,
    )
    # for testing, get all pos instances (independently of KB)
    dev_data = wikipedia_processor.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=True,
        limit=dev_inst,
        kb=None,
        labels_discard=labels_discard,
    )

    # STEP 3: create and train an entity linking pipe
    logger.info("STEP 3: Creating and training an Entity Linking pipe")
    el_pipe = nlp.create_pipe(
        name="entity_linker",
        config={"pretrained_vectors": nlp.vocab.vectors.name,
                "labels_discard": labels_discard},
    )
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    logger.info("Training on {} articles".format(len(train_data)))
    logger.info("Dev testing on {} articles".format(len(dev_data)))

    # baseline performance on dev data
    logger.info("Dev Baseline Accuracies:")
    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

    for itn in range(epochs):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
        batchnr = 0

        with nlp.disable_pipes(*other_pipes):
            for batch in batches:
                try:
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs=docs,
                        golds=golds,
                        sgd=optimizer,
                        drop=dropout,
                        losses=losses,
                    )
                    batchnr += 1
                except Exception as e:
                    logger.error("Error updating batch: " + str(e))
        if batchnr > 0:
            logger.info("Epoch {}, train loss {}".format(
                itn, round(losses["entity_linker"] / batchnr, 2)))
            measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)

    # STEP 4: measure the performance of our trained pipe on an independent dev set
    logger.info("STEP 4: Final performance measurement of Entity Linking pipe")
    measure_performance(dev_data, kb, el_pipe)

    # STEP 5: apply the EL pipe on a toy example
    logger.info("STEP 5: Applying Entity Linking to toy example")
    run_el_toy_example(nlp=nlp)

    if output_dir:
        # STEP 6: write the NLP pipeline (now including an EL model) to file
        logger.info("STEP 6: Writing trained NLP to {}".format(nlp_output_dir))
        nlp.to_disk(nlp_output_dir)

    logger.info("Done!")

    # (tail of the evaluate() helper)
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
init_tok2vec = None

# Train
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            textcat.model.tok2vec.from_bytes(file_.read())
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            if texts[0] is None or annotations[0] is None:
                print('texts[0] | annotations[0] == None, continuing')
                continue
            nlp.update(texts, annotations, sgd=optimizer,
                       drop=0.2, losses=losses)

def main(model=None, n_iter=100):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        # fall back to a blank Vietnamese model rather than calling
        # spacy.load(None), which would crash
        nlp = spacy.blank("vi")
        print("Created blank 'vi' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(preprocess(text))
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    output_dir = Path("vi")
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    for text, _ in TRAIN_DATA:
        doc = nlp2(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def main(model=None, output_dir="model_entRuler", n_iter=70):
    """Load the model, set up the pipeline and train the entity recognizer."""
    # setup pipeline
    # load the model you want to use
    # use nlp.disable_pipes to disable all pipes but NER
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        #nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser'])
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # ADD EXISTING/TRAINED ENTITY RULER ----------------
    # load patterns from external file
    #nu_ruler = EntityRuler(nlp).from_disk('iesa_ners_patterns_mmat.jsonl')
    # putting the ruler before ner will override ner decisions in favor of ruler patterns
    #nlp.add_pipe(nu_ruler, before='ner')

    # show pipeline components:
    print(nlp.pipe_names)
    # --------------------------------------------------

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # show pipeline components:
    print(nlp.pipe_names)
    # end program
    print('Done.')
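# A minimal sketch of the EntityRuler usage that is commented out above,
# assuming spaCy v2.1+. The patterns here are illustrative placeholders; the
# real ones live in iesa_ners_patterns_mmat.jsonl (one JSON object per line).
from spacy.pipeline import EntityRuler

ruler = EntityRuler(nlp)
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple"},  # exact-string pattern
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},  # token pattern
])
# placing the ruler before ner makes its matches take precedence
nlp.add_pipe(ruler, before="ner")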
def train(args, db):
    """
    Set up the pipeline and entity recognizer, and train the new entity on
    the use-case training data.
    Note: If you're using an existing model, make sure to mix in examples of
    other entity types that spaCy correctly recognized before. Otherwise,
    your model might learn the new type, but "forget" what it previously
    knew. https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
    """
    n_iter = 1
    model = None
    usecase = args.usecase
    source = args.source
    threshold_value = args.threshold
    print("Building model for usecase: %s , Source: %s, Threshold: %s" %
          (usecase, source, threshold_value))
    base_dir = Path("{}/usecases/{}".format(MODEL_PATH, usecase))
    output_dir = "{}/usecases/{}/Model1".format(MODEL_PATH, usecase)
    if not base_dir.exists():
        os.makedirs("{}/usecases/{}".format(MODEL_PATH, usecase))
    output_dir = Path(output_dir)
    if output_dir.exists():
        model = output_dir
    training_set = []
    labels = []
    if source == 'csv':
        df = pd.read_csv(DATA_PATH + '/usecases_trainingset.csv')
        trainingset = df[df['use_case_id'] == usecase][[
            'sentence', 'entities'
        ]].values.tolist()
    else:
        # NOTE: prefer a parameterized query over string interpolation here,
        # to avoid SQL injection
        trainingset = db.query(
            'select sentence, entities from usecases_trainingset where use_case_id=%s'
            % usecase)
    for data in trainingset:
        training_set.append((data[0], {"entities": ast.literal_eval(data[1])}))
        for label in ast.literal_eval(data[1]):
            labels.append(label[2])

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.load('en_core_web_sm')  # start from the small English model
        # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        print("Loaded 'en_core_web_sm' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    for label in labels:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(training_set)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_set, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model
    # test_text = "Your PNR is U7223N."
    # doc = nlp(test_text)
    # print("Entities in '%s'" % test_text)
    # for ent in doc.ents:
    #     print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        print('Saving model to', output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = 'usecase_{}.model'.format(usecase)
        nlp.to_disk(output_dir)
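# The script above expects the "entities" column to hold a Python-literal
# list of (start_char, end_char, label) spans, parsed with ast.literal_eval.
# A hypothetical row, for illustration:
import ast

sentence, entities = ("Your PNR is U7223N.", "[(12, 18, 'PNR')]")
spans = ast.literal_eval(entities)  # -> [(12, 18, 'PNR')]
example = (sentence, {"entities": spans})  # the format nlp.update consumes
print(example)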
def train_spacy(data, n_iter, load, output_dir):
    """
    Train the spaCy model to tag a new entity and save the trained model
    @param: TRAIN_DATA: training data
            n_iter: number of iterations, type integer
            load: bool value; True to load the pretrained "en_core_web_sm"
                  model, False to start from a blank 'en' model
            output_dir: str, path of directory to save the trained model
    """
    TRAIN_DATA = data

    # load spaCy model
    if load:
        print('Load Spacy model')
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.blank('en')

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    ner.add_label('AstroTerm')

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Resume training
    #optimizer = nlp.resume_training()
    #move_names = list(ner.move_names)

    print("Model training...")
    # Begin training by disabling other pipeline components
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        sizes = compounding(1.0, 5.0, 1.001)
        # Training for n_iter iterations
        for itn in progressbar(range(n_iter)):
            sleep(0.02)  # small pause so the progress bar renders smoothly
            # shuffle examples before training
            random.shuffle(TRAIN_DATA)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=sizes)
            # dictionary to store losses
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                # Calling update() over the iteration
                nlp.update(texts, annotations, sgd=optimizer, drop=0.25,
                           losses=losses)
            #print("Losses", losses)

    print("Saving the trained model...")
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
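# A hypothetical invocation of train_spacy(); the training data follows
# spaCy's usual (text, {"entities": [(start, end, label), ...]}) convention.
data = [
    ("The Crab Nebula is a supernova remnant.",
     {"entities": [(4, 15, "AstroTerm")]}),
]
train_spacy(data, n_iter=30, load=True, output_dir="./astro_ner")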
def main(TRAIN_DATA, model=None, output_dir=None, n_iter=30):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("es")  # create blank Language class
        print("Created blank 'es' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    # for text, _ in TRAIN_DATA:
    #     doc = nlp(text)
    #     print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    #     print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory; default to ./neroutput when the caller
    # doesn't pass one (instead of overwriting the argument unconditionally)
    if output_dir is None:
        output_dir = "./neroutput"
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
    return nlp

    # test the saved model
    # print("Loading from", output_dir)
    # nlp2 = spacy.load(output_dir)
    # for text, _ in TRAIN_DATA:
    #     doc = nlp2(text)
    #     print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    #     print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def main(
        model=None,  # '.\\trained_models\\omkar_model',
        new_model_name="newtrain",
        output_dir='.\\trained_models\\new_model',
        n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        # nlp = spacy.load("en_core_web_sm")  # create Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        print("Creating new pipe")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # if model is None or reset_weights:
    #     optimizer = nlp.begin_training()
    # else:
    #     optimizer = nlp.resume_training()
    # move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        # batch up the examples using spaCy's minibatch
        sizes = compounding(1.0, 4.0, 1.001)
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                try:
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.4,
                               losses=losses)
                except Exception as e:
                    # report the failing batch instead of silently swallowing it
                    print("Skipping batch due to error:", e)
            print("Losses", losses)

    # test the trained model
    test_text = "Marathwada Mitra Mandals College of Engineering"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        # assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None): if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # add the text classifier to the pipeline if it doesn't exist # nlp.create_pipe works for built-ins that are registered with spaCy if "textcat" not in nlp.pipe_names: textcat = nlp.create_pipe("textcat", config={ "exclusive_classes": True, "architecture": "simple_cnn" }) nlp.add_pipe(textcat, last=True) # otherwise, get it, so we can add labels to it else: textcat = nlp.get_pipe("textcat") # add label to text classifier textcat.add_label("POSITIVE") textcat.add_label("NEGATIVE") # load the IMDB dataset print("Loading IMDB data...") (train_texts, train_cats), (dev_texts, dev_cats) = load_data() train_texts = train_texts[:n_texts] train_cats = train_cats[:n_texts] print("Using {} examples ({} training, {} evaluation)".format( n_texts, len(train_texts), len(dev_texts))) train_data = list(zip(train_texts, [{ "cats": cats } for cats in train_cats])) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"] with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() if init_tok2vec is not None: with init_tok2vec.open("rb") as file_: textcat.model.tok2vec.from_bytes(file_.read()) print("Training the model...") print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F")) batch_sizes = compounding(4.0, 32.0, 1.001) for i in range(n_iter): losses = {} # batch up the examples using spaCy's minibatch random.shuffle(train_data) batches = minibatch(train_data, size=batch_sizes) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats) print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}". format( # print a simple table losses["textcat"], scores["textcat_p"], scores["textcat_r"], scores["textcat_f"], )) # test the trained model test_text = "This movie sucked" doc = nlp(test_text) print(test_text, doc.cats) if output_dir is not None: with nlp.use_params(optimizer.averages): nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc2 = nlp2(test_text) print(test_text, doc2.cats)
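# The script above relies on a load_data() helper that is not shown. For
# reference, the version from spaCy's canonical textcat example (which this
# script otherwise matches) loads the IMDB data via thinc:
import random
import thinc.extra.datasets

def load_data(limit=0, split=0.8):
    """Load data from the IMDB dataset."""
    # Partition off part of the train data for evaluation
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])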
def main(model=None, new_model_name="rosatom-docs", output_dir='./model',
         n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("ru")  # create blank Language class
        print("Created blank 'ru' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for train_data in TRAIN_DATA:
        print("Train data '%s'" % train_data['label'])
        ner.add_label(
            train_data['label'])  # add new entity label to entity recognizer

    # gather the examples of all labels into a single training set, rather
    # than training on the last label's data only
    all_data = [example for entry in TRAIN_DATA for example in entry['data']]

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(all_data)
            batches = minibatch(all_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
def main(model=None, output_dir=None, n_iter=100):
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses", losses)

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("FINISHED")

    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def train(model=None, output_dir=None, n_iter=20, n_texts=2000, categories=[],
          train_texts=[], train_cats=[], dev_texts=[], dev_cats=[]):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add labels to the text classifier
    #categories = ['greet', 'time', 'direction', 'self-location', 'location', 'search-general',
    #'search-restaurants', 'affirmation', 'negation', 'launch', 'news', 'shut-down',
    #'compliment', 'search-wikipedia']
    for category in categories:
        textcat.add_label(category)

    # load the categorisation dataset
    print("Loading categorisation data...")
    #(train_texts, train_cats), (dev_texts, dev_cats) = load_data(categories, limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'
              .format('Iter #', 'LOSS', 'P', 'R', 'F'))
        # create the decaying-dropout generator once, then draw a fresh value
        # for every update
        dropout = decaying(0.6, 0.2, 1e-4)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            #batches = get_batches(train_data, 'textcat')
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
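# For reference, spacy.util.decaying(start, stop, decay) yields an infinite
# series of values decaying from start towards stop; it is meant to be
# consumed with next() once per update, e.g. drop=next(dropouts) as above.
from spacy.util import decaying

dropouts = decaying(0.6, 0.2, 1e-4)
first = next(dropouts)   # starts at (or near) 0.6
later = next(dropouts)   # slightly smaller on each subsequent call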
ner.add_label(ent[2])

n_iter = 10
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        ts = time.time()
        st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        print(st)
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        #batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.3,
                sgd=optimizer,
                losses=losses)
        print('Losses', losses)

ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
print("FINISHED")
# test the trained model
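# A quick illustration of spacy.util.compounding(start, stop, compound),
# which the snippet above uses for batch sizing: it yields start,
# start*compound, start*compound**2, ..., capped at stop, so batch sizes
# grow geometrically over the course of training.
from spacy.util import compounding

batch_sizes = compounding(1000.0, 8000.0, 1.25)
print([int(next(batch_sizes)) for _ in range(6)])
# -> [1000, 1250, 1562, 1953, 2441, 3051]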
def main( ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None, use_oracle_segments=False, ): # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 import tqdm spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False if config is not None: config = Config.load(config, vectors_dir=vectors_dir) else: config = Config(vectors_dir=vectors_dir) paths = TreebankPaths(ud_dir, corpus) if not (parses_dir / corpus).exists(): (parses_dir / corpus).mkdir() print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config, vectors=vectors_dir) docs, golds = read_data( nlp, paths.train.conllu.open(), paths.train.text.open(), max_doc_length=config.max_doc_length, limit=limit, ) optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device) batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) beam_prob = compounding(0.2, 0.8, 1.001) for i in range(config.nr_epoch): docs, golds = read_data( nlp, paths.train.conllu.open(), paths.train.text.open(), max_doc_length=config.max_doc_length, limit=limit, oracle_segments=use_oracle_segments, raw_text=not use_oracle_segments, ) Xs = list(zip(docs, golds)) random.shuffle(Xs) if config.batch_by_words: batches = minibatch_by_words(Xs, size=batch_sizes) else: batches = minibatch(Xs, size=batch_sizes) losses = {} n_train_words = sum(len(doc) for doc in docs) with tqdm.tqdm(total=n_train_words, leave=False) as pbar: for batch in batches: batch_docs, batch_gold = zip(*batch) pbar.update(sum(len(doc) for doc in batch_docs)) nlp.parser.cfg["beam_update_prob"] = next(beam_prob) nlp.update( batch_docs, batch_gold, sgd=optimizer, drop=config.dropout, losses=losses, ) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) with nlp.use_params(optimizer.averages): if use_oracle_segments: parsed_docs, scores = evaluate(nlp, paths.dev.conllu, paths.dev.conllu, out_path) else: parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path) print_progress(i, losses, scores)
def main(model=None, output_dir=None, n_iter=100): """Load the model, set up the pipeline and train the entity recognizer.""" if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # create the built-in pipeline components and add them to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner, last=True) # otherwise, get it so we can add labels else: ner = nlp.get_pipe("ner") # add labels for _, annotations in TRAIN_DATA: for ent in annotations.get("entities"): ner.add_label(ent[2]) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER # reset and initialize the weights randomly – but only if we're # training a new model if model is None: nlp.begin_training() for itn in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update( texts, # batch of texts annotations, # batch of annotations drop=0.5, # dropout - make it harder to memorise data losses=losses, ) print("Losses", losses) # test the trained model for text, _ in TRAIN_DATA: doc = nlp(text) print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) # save model to output directory if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) for text, _ in TRAIN_DATA: doc = nlp2(text) print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
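# For reference, the TRAIN_DATA consumed by this script follows spaCy's
# standard NER training format, as in the canonical example it matches:
# (text, {"entities": [(start_char, end_char, label), ...]})
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]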
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): """Set up the pipeline and entity recognizer, and train the new entity.""" random.seed(0) if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner) # otherwise, get it, so we can add labels to it else: ner = nlp.get_pipe("ner") ner.add_label(LABEL) # add new entity label to entity recognizer # Adding extraneous labels shouldn't mess anything up ner.add_label("VEGETABLE") if model is None: optimizer = nlp.begin_training() else: optimizer = nlp.resume_training() move_names = list(ner.move_names) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): random.shuffle(TRAIN_DATA) batches = minibatch(TRAIN_DATA, size=sizes) losses = {} for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) print("Losses", losses) # test the trained model test_text = "Do you like horses?" doc = nlp(test_text) print("Entities in '%s'" % test_text) for ent in doc.ents: print(ent.label_, ent.text) # save model to output directory if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.meta["name"] = new_model_name # rename model nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) # Check the classes have loaded back consistently assert nlp2.get_pipe("ner").move_names == move_names doc2 = nlp2(test_text) for ent in doc2.ents: print(ent.label_, ent.text)
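# For reference, the LABEL and TRAIN_DATA this script expects, as defined in
# spaCy's canonical "train new entity type" example (abridged):
LABEL = "ANIMAL"
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
    ("horses pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
]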