def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
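# The parser-training functions above and below assume a module-level TRAIN_DATA
# of (text, annotations) pairs using the spaCy v2 "heads"/"deps" annotation
# format. A minimal sketch of that shape (TRAIN_DATA_EXAMPLE and its values are
# illustrative, not part of the original snippets):
TRAIN_DATA_EXAMPLE = [
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
        },
    ),
]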
def train_tensorizer(nlp, texts, dropout, n_iter):
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer
def main(model_name, unlabelled_loc):
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.alpha = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=4)
            for batch in minibatch(TRAIN_DATA, size=sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe('ner').model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000): nlp = spacy.load(model) # load spaCy model print("Loaded model '%s'" % model) if not output_dir.exists(): output_dir.mkdir() # load and pre-process the IMBD dataset print("Loading IMDB data...") data, _ = thinc.extra.datasets.imdb() texts, _ = zip(*data[-limit:]) print("Processing texts...") partitions = minibatch(texts, size=batch_size) executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes") do = delayed(partial(transform_texts, nlp)) tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions)) executor(tasks)
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
def train_batch_model(nlp, train_data):
    random.seed(1)
    spacy.util.fix_random_seed(1)
    optimizer = nlp.begin_training()
    losses = {}
    for epoch in range(10):
        random.shuffle(train_data)
        # Create the batch generator with batch size = 8
        batches = minibatch(train_data, size=8)
        # Iterate through minibatches
        for batch in batches:
            # Each batch is a list of (text, label) but we need to
            # send separate lists for texts and labels to update().
            # This is a quick way to split a list of tuples into lists
            texts, labels = zip(*batch)
            nlp.update(texts, labels, sgd=optimizer, losses=losses)
    return nlp
def predict(self, data):
    data = list(data)
    # batches = minibatch(data, size=compounding(10, 128, 1.3))
    batches = minibatch(data, size=32)
    for batch in batches:
        print("batch size: ", len(batch))
        batch = list(batch)
        score = self.model_score(batch)
        for i in range(len(batch)):
            if 'label' not in batch[i].keys():
                batch[i]['label'] = self.labels[0]
            if self.coll:
                if 'score' in list(self.coll.find({'text': batch[i]['text']}))[0].keys():
                    current_score = list(self.coll.find({'text': batch[i]['text']}))[0]['score']
                else:
                    current_score = []
                current_score.append(float(score[i]))
                self.coll.update_one({"text": batch[i]['text']},
                                     {"$set": {"score": current_score}})
            yield (float(score[i]), batch[i])
def learn_embeddings(self, train_texts, train_cats, val_texts=None, val_cats=None,
                     force_restart=False, epochs=10,
                     batch_size=compounding(1., 64., 1.001)):
    logger.info("Learning embeddings")
    train_texts = [utils.decode(text) for text in train_texts]
    train_cats = [{'cats': {'POS': bool(cat)}} for cat in train_cats]
    train_data = list(zip(train_texts, train_cats))
    validation = True
    if val_texts is None or val_cats is None or len(val_texts) == 0 or len(val_cats) == 0:
        logger.warn(
            "Validation data is either not given or incomplete so evaluation of validation data set will not be done"
        )
        validation = False
    with self.nlp.disable_pipes(*self.other_pipes):
        logger.warn('Disabled following pipes for training: ' + str(self.other_pipes))
        if self.optimizer is None or force_restart:
            self.optimizer = self.nlp.begin_training()
            logger.info("New optimizer created")
        for i in range(epochs):
            losses = {}
            batches = minibatch(train_data, size=batch_size)
            for batch in batches:
                texts, cats = zip(*batch)
                self.nlp.update(texts, cats, sgd=self.optimizer, drop=0.2, losses=losses)
            if validation:
                self.validate(val_texts, val_cats)
            logger.info("Epoch " + str(i) + " finished, losses: " + str(losses))
    return self.vectorize(train_texts)
def train_ner(nlp, train_data, entity_types, n_iter=1000):
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        # if model is None:
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                print(batch)
                print(texts)
                print(annotations)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    return nlp
def main(lang="en", output_dir=None, n_iter=25): """Create a new model, set up the pipeline and train the tagger. In order to train the tagger with a custom tag map, we're creating a new Language instance with a custom vocab. """ nlp = spacy.blank(lang) # add the tagger to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy tagger = nlp.create_pipe("tagger") # Add the tags. This needs to be done before you start training. for tag, values in TAG_MAP.items(): tagger.add_label(tag, values) nlp.add_pipe(tagger) optimizer = nlp.begin_training() for i in range(n_iter): random.shuffle(TRAIN_DATA) losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, losses=losses) print("Losses", losses) # test the trained model test_text = "I like blue eggs" doc = nlp(test_text) print("Tags", [(t.text, t.tag_, t.pos_) for t in doc]) # save model to output directory if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the save model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) doc = nlp2(test_text) print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
def train_cnn(X_train, cats_train, X_val, cats_val, X_test, cats_test, n_iter=5):
    """Train the CNN on the training data and evaluate it on the validation
    data after each iteration. Finally, print scores on the test data.
    Training progress is printed along the way."""
    nlp = spacy.load('en_core_web_sm')
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe('textcat')
    textcat.add_label('POSITIVE')

    # only train textcat, disable other pipes in pipeline
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    train_data = list(zip(X_train, [{'cats': cats} for cats in cats_train]))
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for i in range(n_iter):
            losses = {}
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, X_val, cats_val)
            print(scores)

    # save model to disk
    nlp.to_disk('../models/cnn_classifier')
    print('Scores on test data...')
    print(evaluate(nlp.tokenizer, textcat, X_test, cats_test))
def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]
    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            nlp.update(batch, losses=losses)
def test_train_negative_deprecated():
    """Test that the deprecated negative entity format raises a custom error."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
    ]
    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            with pytest.raises(ValueError):
                nlp.update(batch, losses=losses)
def train(self, n_iter=400):
    """Load the model, set up the pipeline and train the parser. Arnaud Brown"""
    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in self.nlp_model.pipe_names:
        self.nlp_model.remove_pipe("parser")
    parser = self.nlp_model.create_pipe("parser")
    self.nlp_model.add_pipe(parser, first=True)

    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in self.nlp_model.pipe_names if pipe not in pipe_exceptions
    ]
    with self.nlp_model.disable_pipes(*other_pipes):  # only train parser
        optimizer = self.nlp_model.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                self.nlp_model.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # save model to output directory
    if self.output_dir is not None:
        self.output_dir_path = Path(self.output_dir)
        if not self.output_dir_path.exists():
            self.output_dir_path.mkdir()
        self.nlp_model.to_disk(self.output_dir_path)
        print("Saved model to", self.output_dir_path)
def test_issue4030():
    """Test whether textcat works fine with empty doc"""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
def train_textcat(nlp, n_texts, n_iter=10):
    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
def train(subject, n_sent, n_iter):
    # Creates or updates the NER model for a subject
    train_data = []
    for doc in traindb['sentence'].find({
            'subject': subject,
            'train': {'$exists': True}
    }).limit(n_sent):
        train_data.append((doc['text'], {'entities': doc['train']['tags']}))

    path = base.joinpath(subject)
    if path.exists():
        nlp = spacy.load(path)
        ner = nlp.get_pipe('ner')
    else:
        nlp = Portuguese()
        ner = nlp.create_pipe('ner')
        ner.add_label('CARDINAL')
        ner.add_label('QUANTITY')
        ner.add_label('PRODUCT')
        nlp.add_pipe(ner, last=True)
        nlp.begin_training()

    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.5,
                losses=losses,
            )
        print("Losses", losses)

    nlp.to_disk(path)
def train_spacy(data, iterations):
    TRAIN_DATA = data
    nlp = spacy.load('en_core_web_sm')  # load the pretrained English model
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        nlp.begin_training()
        for itn in range(iterations):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    return nlp
def train(train_data, output_dir, n_iter, model=None):
    """Train the NER model for n_iter iterations."""
    # Load or create NLP model
    if model is not None:
        nlp = spacy.load(output_dir)
    else:
        nlp = spacy.blank("en")

    # Pipeline components
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Labels
    for _, annotations in train_data:
        for entity in annotations.get("entities"):
            ner.add_label(entity[2])

    # Disable other pipelines (only train NER)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()
        # Train NER
        for itn in tqdm.tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)

    save_model(output_dir, nlp, "st_ner")
async def train(self, sources: Sources):
    train_examples = await self._preprocess_data(sources)
    for _, entities in train_examples:
        for ent in entities.get("entities"):
            self.ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with self.nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module="spacy")
        if self.parent.config.model_name_or_path is None:
            self.nlp.begin_training()
        for itn in range(self.parent.config.n_iter):
            random.shuffle(train_examples)
            losses = {}
            batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                examples = []
                for doc, gold_dict in batch:
                    doc = self.nlp.make_doc(doc)
                    examples.append(Example.from_dict(doc, gold_dict))
                self.nlp.update(
                    examples,
                    drop=self.parent.config.dropout,
                    losses=losses,
                )
            self.logger.debug(f"Losses: {losses}")

    if self.parent.config.directory is not None:
        if not self.parent.config.directory.exists():
            self.parent.config.directory.mkdir(parents=True)
        self.nlp.to_disk(self.parent.config.directory)
        self.logger.debug(f"Saved model to {self.parent.config.directory.name}")
def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
    # raw_texts is used later to stop iteration.
    texts, raw_texts = itertools.tee(texts)
    # for sending texts to worker
    texts_q = [mp.Queue() for _ in range(n_process)]
    # for receiving byte-encoded docs from worker
    bytedocs_recv_ch, bytedocs_send_ch = zip(
        *[mp.Pipe(False) for _ in range(n_process)]
    )

    batch_texts = minibatch(texts, batch_size)
    # Sender sends texts to the workers.
    # This is necessary to properly handle infinite length of texts.
    # (In this case, all data cannot be sent to the workers at once)
    sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
    # send twice to make process busy
    sender.send()
    sender.send()

    procs = [
        mp.Process(
            target=_apply_pipes,
            args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
        )
        for rch, sch in zip(texts_q, bytedocs_send_ch)
    ]
    for proc in procs:
        proc.start()

    # Cycle channels not to break the order of docs.
    # The received object is a batch of byte-encoded docs, so flatten them with
    # chain.from_iterable.
    byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
    docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
    try:
        for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
            yield doc
            if i % batch_size == 0:
                # tell `sender` that one batch was consumed.
                sender.step()
    finally:
        for proc in procs:
            proc.terminate()
def train_spacy_ner(model, data, iterations):
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('vi')  # create blank Language class
        print("Created blank 'vi' model")

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in data:
        for entity in annotations.get('entities'):
            ner.add_label(entity[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            # optimizer = nlp.resume_training()
            optimizer = nlp.entity.create_optimizer()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(data)
            split = int(len(data) * 5 / 6)
            training_data = data[:split]
            test_data = data[split + 1:]
            losses = {}
            batches = minibatch(training_data, size=compounding(2000, 5000., 1.5))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses:', losses)
            print('Score:', evaluate(nlp, test_data))
    return nlp
def ner_model(TRAIN_DATA, n_iter=500):
    """Load the model, set up the pipeline and train the entity recognizer."""
    print('Training started...')
    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        nlp.begin_training()
        for itn in tqdm_notebook(range(n_iter)):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    print('Training completed.')
    return nlp
def train_model(data: list, folder_to_save_model: str, n_iter: int, batch_size: int,
                dropout_rate: float):
    """
    Train a NER model using spaCy
    :param data: list of tuples [(text, offset)]
    :param folder_to_save_model: where to save the learned model. None to skip. Will be overridden with the new model
    :param n_iter: number of iterations of the CNN
    :param batch_size: more = less precise / less time to learn
    :param dropout_rate: more = learn less / better generalization
    """
    nlp = get_empty_model(load_labels_for_training=True)
    nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()

    with tqdm(total=n_iter * ceil(len(data) / batch_size), unit=" paragraphs",
              desc="Learn NER model") as pbar:
        for itn in range(n_iter):
            pbar.set_description(f"Learn NER model - iteration {itn + 1}")
            losses = {}
            random.shuffle(data)
            batches = util.minibatch(data, batch_size)
            for current_batch_item in batches:
                case_id, texts, annotations = zip(*current_batch_item)
                docs = [nlp.make_doc(text) for text in texts]
                gold_with_unknown_bilou = convert_unknown_bilou_bulk(
                    docs=docs, offsets=annotations)
                nlp.update(
                    docs,  # batch of texts
                    gold_with_unknown_bilou,  # batch of annotations
                    drop=dropout_rate,  # dropout - make it harder to memorise rules
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                pbar.postfix = "loss: " + str(losses['ner'])
                pbar.update()

    # save model to output directory
    if folder_to_save_model is not None:
        folder_to_save_model = Path(folder_to_save_model)
        nlp.to_disk(folder_to_save_model)
def train_model(training_data: list, test_data: list, iterations: int = 20) -> None:
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")
    textcat.add_label("pos")
    textcat.add_label("neg")

    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        print("Training...")
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in trange(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(tokenizer=nlp.tokenizer,
                                                    textcat=textcat,
                                                    test_data=test_data)
                print(f"{loss['textcat']}\t{evaluation_results['precision']}"
                      f"\t{evaluation_results['recall']}"
                      f"\t{evaluation_results['f-score']}")

    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")
def train_spacy_ner(path):
    import spacy
    TRAIN_DATA = label_studio_to_spacy(path)
    nlp = spacy.load('en_core_web_sm')
    ner = nlp.get_pipe("ner")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    unaffected_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    import random
    from spacy.util import minibatch, compounding
    from pathlib import Path

    # TRAINING THE MODEL
    with nlp.disable_pipes(*unaffected_pipes):
        # Training for 50 iterations
        for iteration in range(50):
            # shuffling examples before every iteration
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    output_dir = Path(r'E:\Nitin\RVCE\Projects\PDF-OCR\code\models')
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
def ner_trainig():
    with open(r'C:\Users\Akash\Desktop\be_proj\Data\NER_training_data.pickle', 'rb') as fp:
        TRAIN_DATA = pickle.load(fp)
    LABEL = ['BRANCH']
    nlp = spacy.blank('en')  # create a blank model
    ner = nlp.create_pipe('ner')  # create the NER pipeline component
    nlp.add_pipe(ner)  # add NER to the spaCy pipeline
    for l in LABEL:
        ner.add_label(l)  # add labels for the component
    optimizer = nlp.begin_training()
    n_iter = 10
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  # shuffle the dataset
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print('Losses', losses)

    # test_text = "how many electronics students are there in IT branch ?"
    # doc = nlp(test_text)
    # print("Entities in '%s'" % test_text)
    # for ent in doc.ents:
    #     print(ent.label_, ent.text)

    nlp.meta["name"] = "NER_mod"  # rename model
    nlp.to_disk(r"D:\projects\col_chatbot - Copy\college_bot\NERdata")
def train_model(n_iter: int = 100) -> None:
    # You can load other languages if specified in Dockerfile
    nlp = spacy.load("en_core_web_sm")
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with nlp.disable_pipes(*other_pipes):
        nlp.begin_training()
        for step_n in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses: ", losses)
            tracking.log_metrics(step=step_n, **losses)

    nlp.to_disk("custom_spacy_model")
def start_training(model=None, output=None, epoch=30):
    train_data = TRAIN_DATA2 + TRAIN_DATA
    # Load an existing model or create an empty one.
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'." % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank model to train.")

    # Create a fresh instance of parser.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    for text, tags in train_data:
        for dep in tags.get('deps', []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for _ in range(epoch):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=4)
            for batch in batches:
                texts, labels = zip(*batch)
                nlp.update(texts, labels, sgd=optimizer, losses=losses)
            print('Losses', losses)

    if output is not None:
        output = Path(output)
        if not output.exists():
            output.mkdir()
        nlp.to_disk(output)
        print("Saved model to directory %s." % output)
    return nlp
def _train_categories(self, train_categories_data):
    # get names of other pipes to disable them during training
    other_pipes = [
        pipe for pipe in self._nlp.pipe_names if pipe != 'textcat'
    ]
    with self._nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = self._nlp.begin_training()
        # print("Training the model...")
        # print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for itn in range(self._iterations):
            losses = {}
            batches = minibatch(train_categories_data,
                                size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                self._nlp.update(texts, annotations, sgd=optimizer, losses=losses)
def _predict(self, texts: List[str], batch_size: Optional[int] = 1000) -> Iterable[str]:
    translated_texts = []
    with tqdm(total=len(texts)) as pbar:
        for batch in minibatch(texts, batch_size):
            json_body = [
                {
                    "q": text,
                    "source": self.source_lang,
                    "target": self.target_lang,
                    "format": "text",
                }
                for text in batch
            ]
            res = httpx.post(self._translate_url, params=self._default_params, json=json_body)
            data = res.json()
            for doc in data["translations"]:
                translated_texts.append(doc["text"])
            pbar.update(1)
    return translated_texts
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
    """Apply the pipe to a stream of documents. This usually happens under
    the hood when the nlp object is called on a text and all components are
    applied to the Doc.

    stream (Iterable[Doc]): A stream of documents.
    batch_size (int): The number of documents to buffer.
    YIELDS (Doc): Processed documents in order.

    DOCS: https://nightly.spacy.io/api/transformer#pipe
    """
    for outer_batch in minibatch(stream, batch_size):
        outer_batch = list(outer_batch)
        for indices in batch_by_length(outer_batch, self.cfg["max_batch_items"]):
            subbatch = [outer_batch[i] for i in indices]
            self.set_annotations(subbatch, self.predict(subbatch))
        yield from outer_batch
def main(args):
    merge_terms = load_merge_terms(args.merge_terms) if args.merge_terms else {}
    nlp = get_parser(disable=args.disable,
                     merge_terms=merge_terms,
                     max_sent_len=args.max_sent_len)
    identity_preprocess = lambda x: x
    corpus = dataloader(args.inputdir, preprocess=identity_preprocess)
    partitions = minibatch(corpus, size=args.batch_size)
    executor = Parallel(n_jobs=args.n_procs, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch, args.outputdir, args.disable, args.prefix, args.keep_whitespace)
             for i, batch in enumerate(partitions))
    executor(tasks)
def _train_ner(self, train_entities_data):
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'ner']
    with self._nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = self._nlp.begin_training()
        losses = {}
        for itn in range(self._iterations):
            random.shuffle(train_entities_data)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_entities_data,
                                size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                self._nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            self._logger.debug('Losses %s', losses)
def _predict(self, texts: List[str], batch_size: Optional[int] = 1000) -> Iterable[str]:
    translated_texts = []
    with tqdm(total=len(texts)) as pbar:
        for batch in minibatch(texts, batch_size):
            json_body = [{"text": text} for text in batch]
            res = httpx.post(
                self._translate_url,
                params=self._default_params,
                headers=self._default_headers,
                json=json_body,
            )
            data = res.json()
            for doc in data:
                translation = doc["translations"][0]
                translated_texts.append(translation["text"])
            pbar.update(1)
    return translated_texts
def train_textcat(nlp, n_texts, n_iter=10):
    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
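# The NER training functions in this file assume a module-level TRAIN_DATA of
# (text, annotations) pairs where "entities" holds (start_char, end_char, label)
# offsets, the same format as the test data further above. A minimal sketch
# (NER_TRAIN_DATA_EXAMPLE and its values are illustrative):
NER_TRAIN_DATA_EXAMPLE = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]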
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

n_iter = 100
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.55,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print('Losses', losses)

# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    # print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
ner.add_label(ent[2])

n_iter = 10
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        ts = time.time()
        st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        print(st)
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.3,
                sgd=optimizer,
                losses=losses)
        print('Losses', losses)

ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
print("CONCLUIDO")

# test the trained model
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30): """Set up the pipeline and entity recognizer, and train the new entity.""" random.seed(0) if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) else: nlp = spacy.blank("en") # create blank Language class print("Created blank 'en' model") # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner) # otherwise, get it, so we can add labels to it else: ner = nlp.get_pipe("ner") ner.add_label(LABEL) # add new entity label to entity recognizer # Adding extraneous labels shouldn't mess anything up ner.add_label("VEGETABLE") if model is None: optimizer = nlp.begin_training() else: optimizer = nlp.resume_training() move_names = list(ner.move_names) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER sizes = compounding(1.0, 4.0, 1.001) # batch up the examples using spaCy's minibatch for itn in range(n_iter): random.shuffle(TRAIN_DATA) batches = minibatch(TRAIN_DATA, size=sizes) losses = {} for batch in batches: texts, annotations = zip(*batch) nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses) print("Losses", losses) # test the trained model test_text = "Do you like horses?" doc = nlp(test_text) print("Entities in '%s'" % test_text) for ent in doc.ents: print(ent.label_, ent.text) # save model to output directory if output_dir is not None: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir() nlp.meta["name"] = new_model_name # rename model nlp.to_disk(output_dir) print("Saved model to", output_dir) # test the saved model print("Loading from", output_dir) nlp2 = spacy.load(output_dir) # Check the classes have loaded back consistently assert nlp2.get_pipe("ner").move_names == move_names doc2 = nlp2(test_text) for ent in doc2.ents: print(ent.label_, ent.text)
def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = read_data(
        nlp,
        paths.train.conllu.open(),
        paths.train.text.open(),
        max_doc_length=config.max_doc_length,
        limit=limit,
    )

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        docs, golds = read_data(
            nlp,
            paths.train.conllu.open(),
            paths.train.text.open(),
            max_doc_length=config.max_doc_length,
            limit=limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            if use_oracle_segments:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.conllu, paths.dev.conllu, out_path
                )
            else:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.text, paths.dev.conllu, out_path
                )
        print_progress(i, losses, scores)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
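# The textcat training functions above and below annotate each text with a
# "cats" dict mapping label -> boolean (or float). A minimal sketch of the
# shape load_data() is assumed to return; load_data_example, its texts and
# labels are illustrative stand-ins, not the real IMDB loader:
def load_data_example():
    train_texts = ["This movie was great", "This movie sucked"]
    train_cats = [
        {"POSITIVE": True, "NEGATIVE": False},
        {"POSITIVE": False, "NEGATIVE": True},
    ]
    dev_texts = ["Not my kind of film"]
    dev_cats = [{"POSITIVE": False, "NEGATIVE": True}]
    return (train_texts, train_cats), (dev_texts, dev_cats)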
def main(model=None, output_dir=None, n_iter=100):
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses", losses)

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("CONCLUIDO")

    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)