def create_model():
    """Fine-tune the BERT text classifier and save it to the ``models`` directory.

    Loads the training data via ``load_data()``, adds a ``pytt_textcat`` pipe
    with the labels POSITIVE and NEGATIVE to the ``en_pytt_bertbaseuncased_lg``
    model, trains for 10 iterations and writes the averaged parameters to disk.
    """
    set_gpu()
    # Only the training split is used here; the test split is loaded but unused.
    TRAIN_DATA, TEST_DATA = load_data()
    nlp = spacy.load("en_pytt_bertbaseuncased_lg")
    textcat = nlp.create_pipe("pytt_textcat", config={"exclusive_classes": True})
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.add_pipe(textcat)
    optimizer = nlp.resume_training()
    # decaying() yields an endless series of values; it must be advanced with
    # next() for every update, not handed to nlp.update() as an object.
    dropout = decaying(0.6, 0.2, 1e-4)
    print("Training the model...")
    for i in range(10):
        print("Iteration =>", i)
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in get_batches(TRAIN_DATA, "textcat"):
            texts, cats = zip(*batch)
            print(texts, cats)
            nlp.update(texts, cats, sgd=optimizer, losses=losses,
                       drop=next(dropout))
        print(i, losses)
    # Save the model with the optimizer's averaged parameters.
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("models")
def test_issue3447():
    """Check the first three values produced by ``decaying(10.0, 1.0, 0.5)``."""
    schedule = decaying(10.0, 1.0, 0.5)
    # Each step is expected to be exactly one decay amount below the previous.
    for expected in (10.0, 10.0 - 0.5, 10.0 - 0.5 - 0.5):
        size = next(schedule)
        assert size == expected
def test_decaying():
    """Verify that ``decaying`` starts at the start value and drops by the decay."""
    start, stop, decay = 10., 1., .5
    values = decaying(start, stop, decay)
    first = next(values)
    second = next(values)
    third = next(values)
    assert first == 10.
    assert second == 10. - 0.5
    assert third == 10. - 0.5 - 0.5
def train_entity(self, nlp, output_dir, train_data, n_iter, dropout):
    """Train the entity recognizer of ``nlp`` and optionally save the model.

    Keyword arguments:
    nlp -- pipeline to train; every pipe except "ner", "trf_wordpiecer" and
           "trf_tok2vec" is disabled while training
    output_dir -- path where the model is saved; nothing is saved when None
    train_data -- training examples; each item unpacks into
                  (text, annotations, extra) and is shuffled in place
    n_iter -- number of passes over train_data
    dropout -- NOTE(review): currently ignored, the schedule below always
               replaces it with decaying(0.6, 0.2, 1e-4) -- confirm intent

    Side effects: appends [precision, recall, f-score] to self.entity_score
    and the loss dict to self.losses_ner after every iteration.

    Returns:
    output_dir -- path to the saved model (a Path), or None if not saved
    """
    dropout = decaying(0.6, 0.2, 1e-4)
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    disabled = nlp.disable_pipes(*other_pipes)

    logging.info("Started training entities...")
    optimizer = nlp.begin_training()
    for iteration in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations, _ = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=next(dropout),  # dropout - make it harder to memorise data
                sgd=optimizer,
                losses=losses,
            )
        # Score the model after every pass so progress can be tracked.
        p, r, f = self.evaluate_entity(nlp)
        self.entity_score.append([p, r, f])
        logging.info("Finished %s iteration for NER with %s losses",
                     iteration, losses)
        self.losses_ner.append(losses)

    logging.info("Finished training entities...")
    disabled.restore()

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: also works for nested paths and avoids the
        # check-then-create race of exists() followed by mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        logging.info("Saved entity model to %s", output_dir)
    return output_dir
def __init__(self):
    """Set the training defaults and compile the address and date patterns."""
    self.model_file = 'custommodel'
    self.train_data_file = 'data/train-data.txt'
    self.dropout = decaying(0.1, 0.0, 1e-4)
    self.iters = 20
    self.batch_size = 2

    # Address: leading digits, a street-type keyword, then a postal code of
    # the form letter-digit-letter digit-letter-digit, optionally followed by
    # "canada". Raw strings keep the regex escapes (\d, \w, \s) from being
    # read as invalid string escapes.
    self.address_label = "GPE"
    address_pattern = r'\d+[\w\s]+(?:avenue|ave|road|rd|boulevard|blvd|street|st|drive|dr|court|ct|highway|hwy|square|sq|park|parkway|pkwy|circle|cir|trail|trl)[,*\w\s]+([a-z][0-9][a-z]\s*[0-9][a-z][0-9](,*\s*canada)?)'
    self.address_pattern_object = re.compile(address_pattern, re.IGNORECASE)

    # Date: digits, a month name or abbreviation, digits -- with no separator
    # between them.
    # NOTE(review): whitespace-separated dates do not match; confirm the
    # expected input format.
    self.date_label = "DATE"
    date_pattern = r'\d+(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\d+'
    self.date_pattern_object = re.compile(date_pattern, re.IGNORECASE)
def train(new_model_name='persons', output_dir=None):
    """Train the NER pipe of the module-level ``nlp`` and save the result.

    Uses the module-level ``nlp``, ``to_train_ents``, ``best_acc`` and
    ``preprocess``. Runs 5 passes over ``to_train_ents`` with only the "ner"
    pipe enabled, saves the model with averaged parameters, then reloads it
    and prints the entities found in a sample sentence.

    new_model_name -- value stored in nlp.meta['name']
    output_dir -- directory to save to; defaults to "./model1" when None
    """
    optimizer = nlp.begin_training()
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    # Create the schedule once: calling decaying() again for every batch
    # restarts it, so the rate would stay at its 0.35 start value forever.
    dropout = decaying(0.35, 0.25, 1e-4)
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(5):
            random.shuffle(to_train_ents)
            batches = minibatch(to_train_ents,
                                size=compounding(4., 32., 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            print(losses)

    if output_dir is None:
        output_dir = "./model1"
    # parents/exist_ok: works for nested paths and tolerates an existing dir.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    nlp.meta['accuracy'] = {'ner': best_acc}
    nlp.meta['name'] = new_model_name
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)

    random.shuffle(to_train_ents)

    # quick test the saved model
    test_text = 'Gina Haspel, President Donald Trump’s controversial pick to be the next CIA director, has officially been confirmed by the Senate in a 54-45 vote.'
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(preprocess(nlp2(test_text)))
    print("Entities in '%s'" % doc2)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
def train_ner(train_data, validation_data):
    """Train a blank English NER model and validate it after every epoch.

    train_data -- list of (text, annotations) pairs; the labels are taken
                  from the third element of each item in
                  annotations["entities"]. The list is shuffled in place.
    validation_data -- passed unchanged to ``validate`` after each epoch

    Returns the trained ``nlp`` object.
    """
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every entity label that occurs in the training data.
    for _raw_text, annotations in train_data:
        # "or ()" tolerates examples without an "entities" entry.
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        dropout = decaying(0.6, 0.2, 0.03)
        for itn in range(10):
            random.shuffle(train_data)
            losses = {}
            # One update per example; the dropout rate advances per update.
            for text, annotations in train_data:
                nlp.update([text], [annotations], drop=next(dropout),
                           sgd=optimizer, losses=losses)
            print("Losses: {}".format(losses))
            validate(nlp, validation_data, itn + 1)
            print('Epoch {} complete.\n'.format(itn + 1))
    return nlp
def main(model='en', new_model_name='en-animals', output_dir=animal_model,
         use_gpu=-1, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity.

    When a model already exists in ``output_dir`` it is only tested and no
    training takes place.

    model -- name of the spaCy model to load; a blank 'en' model when None
    new_model_name -- value stored in nlp.meta['name'] before saving
    output_dir -- directory to save the model to; nothing is saved when None
    use_gpu -- GPU device id, or a negative value to stay on the CPU
    n_iter -- number of training iterations
    """
    if model_exists(output_dir):
        print('model exists.')
        test_model(output_dir, use_gpu)
        return

    if model is not None:
        print("Loading model '%s' ... " % model)
        if use_gpu >= 0:
            # Use the requested device instead of always device 0.
            spacy.util.use_gpu(use_gpu)
        nlp = spacy.load(model)  # load existing spaCy model
    else:
        print("Creating blank 'en' model ... ")
        nlp = spacy.blank('en')  # create blank Language class

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer

    print('begin training... ')
    if model is None:
        optimizer = nlp.begin_training(device=use_gpu)
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts small and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = decaying(env_opt('dropout_from', 0.6),
                             env_opt('dropout_to', 0.2),
                             env_opt('dropout_decay', 1e-4))
    batch_sizes = compounding(env_opt('batch_from', 15),
                              env_opt('batch_to', 30),
                              env_opt('batch_compound', 1.005))

    # disable other pipes during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        n_train_words = count_train()
        for i in range(n_iter):
            losses = {}
            random.shuffle(TRAIN_DATA)
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(count_tokens(texts))
            print('{}/{} loss: {}'.format(i + 1, n_iter, losses))

    # Show what the trained model finds in the test texts.
    for text in test_texts:
        doc = nlp(text)
        print("Entities in '%s'" % text)
        for ent in doc.ents:
            print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: works for nested paths and tolerates an existing dir.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        test_model(output_dir, use_gpu)
nlp.add_pipe(ner, last=True) ner.add_label("KATZ") # add all new labels n_iter = 100 # number of iterations other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner" ] # get names of other pipes to disable them during training annotations = [] # ******************************** # start training # ******************************** with nlp.disable_pipes(*other_pipes): # only train NER optimizer = nlp.begin_training() dropouts = decaying(0.3, 0.1, 1e-4) batch_size = compounding( 4.0, 32.0, 1.001) # https://spacy.io/usage/training#tips-batch-size for itn in range(n_iter): shuffle(TRAIN_DATA) losses = {} batches = minibatch( TRAIN_DATA, size=batch_size) # batch up the examples using spaCy's minibatch dropout = next(dropouts) print(itn) print("Dropout", dropout) for batch in batches:
def _set_params(self, kwargs): """ Set input parameters based on the request. : :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools """ # Set default values which will be used if execution arguments are not passed # Default parameters: self.debug = False self.model = 'en_core_web_sm' self.custom = False self.base_model = 'en_core_web_sm' self.blank = False self.epochs = 100 self.batch_size = compounding(4.0, 32.0, 1.001) self.drop = 0.25 self.test = 0 # Extract the model path if required try: # Get the model name from the first row in the request_df self.model = self.request_df.loc[0, 'model_name'] # Remove the model_name column from the request_df self.request_df = self.request_df.drop(['model_name'], axis=1) except KeyError: pass # If key word arguments were included in the request, get the parameters and values if len(kwargs) > 0: # Transform the string of arguments into a dictionary self.kwargs = utils.get_kwargs(kwargs) # Set the debug option for generating execution logs # Valid values are: true, false if 'debug' in self.kwargs: self.debug = 'true' == self.kwargs['debug'].lower() # Additional information is printed to the terminal and logs if the paramater debug = true if self.debug: # Increment log counter for the class. Each instance of the class generates a new log. self.__class__.log_no += 1 # Create a log file for the instance # Logs will be stored in ..\logs\SpaCy Log <n>.txt self.logfile = os.path.join( os.getcwd(), 'logs', 'SpaCy Log {}.txt'.format(self.log_no)) self._print_log(1) # Set whether the model (if getting named entites) or base model (if retraining) is a custom model # i.e. not one of the pre-trained models provided by spaCy if 'custom' in self.kwargs: self.custom = 'true' == self.kwargs['custom'].lower() # Set the base model, i.e an existing spaCy model to be retrained. 
if 'base_model' in self.kwargs: self.base_model = self.kwargs['base_model'].lower() # Set the retraining to be done on a blank Language class if 'blank' in self.kwargs: self.blank = 'true' == self.kwargs['blank'].lower() # Set the epochs for training the model. # This is the the number times that the learning algorithm will work through the entire training dataset. # Valid values are an integer e.g. 200 if 'epochs' in self.kwargs: self.epochs = utils.atoi(self.kwargs['epochs']) # Set the batch size to be used during model training. # The model's internal parameters will be updated at the end of each batch. # Valid values are a single integer or compounding or decaying parameters. if 'batch_size' in self.kwargs: # The batch size may be a single integer try: self.batch_size = utils.atoi(self.kwargs['batch_size']) # Or a list of floats except ValueError: sizes = utils.get_kwargs_by_type(self.kwargs['batch_size']) # If the start < end, batch sizes will be compounded if sizes[0] < sizes[1]: self.batch_size = compounding(sizes[0], sizes[1], sizes[2]) # else bath sizes will decay during training else: self.batch_size = decaying(sizes[0], sizes[1], sizes[2]) # Set the dropout rate for retraining the model # This determines the likelihood that a feature or internal representation in the model will be dropped, # making it harder for the model to memorize the training data. # Valid values are a float lesser than 1.0 e.g. 0.35 if 'drop' in self.kwargs: self.drop = utils.atof(self.kwargs['drop']) # Set the ratio of data to be used for testing. # This data will be held out from training and just used to provide evaluation metrics. # Valid values are a float >= zero and < 1.0 e.g. 0.3 if 'test' in self.kwargs: self.test = utils.atof(self.kwargs['test']) # Debug information is printed to the terminal and logs if the paramater debug = true if self.debug: self._print_log(2) # Remove the kwargs column from the request_df self.request_df = self.request_df.drop(['kwargs'], axis=1)
def determine_dropout(self):
    """
    Build the decaying dropout schedule for training.

    On small datasets it helps to begin with a high dropout rate and let it
    decay towards a more reasonable value: the network is kept from
    overfitting immediately, yet is still encouraged to learn the more
    interesting things in the data.

    The schedule is created from self.dropout_start, self.dropout_end and
    self.interval.
    """
    return decaying(self.dropout_start, self.dropout_end, self.interval)
def train_textcat(nlp,
                  train_data,
                  init_tok2vec=None,
                  continue_training=False,
                  epochs=10,
                  dropout_rates=(0.6, 0.2, 1e-4),
                  minibatch_sizes=(1.0, 64.0, 1.001),
                  valid_docs=None,
                  valid_labels=None,
                  output_dir=None,
                  use_tqdm=False):
    """Train, evaluate, and store TextCategorizer model.

    nlp -- pipeline that must already contain a "textcat" pipe
    train_data -- list of (text, annotations) pairs; shuffled in place
    init_tok2vec -- optional path object (it is opened with ``.open("rb")``)
        holding pretrained tok2vec weights
    continue_training -- resume training of an existing model instead of
        calling begin_training()
    epochs -- number of passes over train_data
    dropout_rates -- (start, end, decay) passed to ``decaying``
    minibatch_sizes -- (start, end, compound) passed to ``compounding``
    valid_docs, valid_labels -- validation set; evaluation only runs when
        both are given
    output_dir -- passed to ``store_model``
    use_tqdm -- wrap train_data in a tqdm progress bar for each epoch

    Returns ``nlp``. Raises NameError when "textcat" is not in the pipeline.
    """
    if "textcat" in nlp.pipe_names:
        train_eval_time = time.time()
        # textcat is only needed to load weights or to evaluate
        if valid_docs is not None or init_tok2vec is not None:
            textcat = nlp.get_pipe("textcat")

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
        with nlp.disable_pipes(*other_pipes):  # only train textcat
            # if base
            if continue_training:
                # Start with an existing model, use default optimizer
                optimizer = nlp.resume_training()
            else:
                optimizer = nlp.begin_training()

            # load pretrained LMAO weights
            if init_tok2vec is not None:
                with init_tok2vec.open("rb") as file_:
                    print("Loading LMAO weights...")
                    textcat.model.tok2vec.from_bytes(file_.read())

            print("Training the model...")
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

            # create batch sizes
            min_batch_size, max_batch_size, update_by = minibatch_sizes
            batch_sizes = compounding(min_batch_size, max_batch_size,
                                      update_by)

            # create decaying dropout
            starting_dropout, ending_dropout, decay_rate = dropout_rates
            dropouts = decaying(starting_dropout, ending_dropout, decay_rate)

            best_avg_f1 = 0
            for i in range(epochs):
                print("Epoch:", i)
                losses = {}

                # batch up the examples using spaCy's minibatch
                random.shuffle(train_data)
                if use_tqdm:
                    train_data = tqdm(train_data, leave=False)
                batches = minibatch(train_data, size=batch_sizes)
                # zip pairs every batch with the next dropout value
                for batch, dropout in zip(batches, dropouts):
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=dropout,
                               losses=losses)

                # evaluate model on validation set
                if valid_docs is not None and valid_labels is not None:
                    with textcat.model.use_params(optimizer.averages):
                        scores, valid_label_set = evaluate(
                            textcat, valid_docs, valid_labels)

                    print("{0:.3f}\t{1:}\t{2:}\t{3:}".format(
                        losses["textcat"], "_____", "_____", "_____"))
                    avg_f1 = 0
                    for vc in valid_label_set:
                        print("{0:}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                              format(  # print as a table
                                  vc,
                                  scores[vc]["precision"],
                                  scores[vc]["recall"],
                                  scores[vc]["f1-score"],
                              ))
                        avg_f1 += scores[vc]["f1-score"]
                    print("Accuracy:", scores["accuracy"])
                    print("_____________________________")

                    # assign best model, score, and epoch
                    avg_f1 = avg_f1 / len(valid_label_set)
                    if avg_f1 > best_avg_f1:
                        best_avg_f1 = avg_f1
                        # overwrite the weak with the strong
                        store_model(output_dir, nlp, optimizer)
                else:
                    # no validation set: only the loss column is printed
                    print("{0:.3f}\t{1:}\t{2:}\t{3:}".format(
                        losses["textcat"], "_____", "_____", "_____"))

                if use_tqdm:
                    # train_data was put into tqdm object and won't shuffle properly due to indexing
                    # put train_data back to its original type
                    train_data = train_data.iterable

            # store final model if no evaluation performed
            if valid_docs is None:
                store_model(output_dir, nlp, optimizer)

            print("Finished after: {0:.2f} minutes".format(
                (time.time() - train_eval_time) / 60))
    else:
        raise NameError(
            "Pipe 'textcat' is not in the nlp pipeline. Be sure to run mk_model() before training."
        )
    return nlp
def train_classifier_model(self, task, proj):
    """Train and store one text-classification model per configured language.

    For every (language, model) pair in self.cfg['languages'] the training
    data is generated for ``proj``, a "textcat" pipe is trained, the model is
    written to the project's model file and the reloaded model is cached in
    self.loaded_models under "<project id>/<language>".

    task -- not used by this method
    proj -- project dict; its 'id' is read and its 'index_query' is
        overwritten for each language
    """
    # TODO: fetch these variables from the task, project or self.config parameters
    n_iter = 20
    n_texts = 2000
    init_tok2vec = None

    languages = self.cfg.get('languages', {})
    self.logging.debug(f"languages: {languages}")

    for lang, model in languages.items():
        # data sample query selection
        proj['index_query'] = util.createTrainDataQuery(proj, lang)
        # load correct dataset from index
        (train_texts, train_cats), (
            dev_texts,
            dev_cats), categories = self.generate_classifier_data(proj)

        if len(categories) == 0 or len(train_texts) == 0:
            # only load the model if there is data to train
            self.logging.debug(
                f"No new training data found in index for language: {lang}"
            )
            continue

        # check if the target model file already exists; if so load it and update this existing model
        model_file, _, _ = util.getDataFilename(self.cfg,
                                                f"{proj['id']}/{lang}", None,
                                                None)
        _nlp = None
        if os.path.exists(model_file):
            self.logging.info(f"Loading project model '{model_file}'")
            _nlp = spacy.load(model_file)
        else:
            # fallback to language core model
            self.logging.info(f"Loading default lang model '{model}'")
            _nlp = spacy.load(model)

        # add the text classifier to the pipeline if it doesn't exist
        if "textcat" not in _nlp.pipe_names:
            textcat = _nlp.create_pipe("textcat",
                                       config={
                                           "exclusive_classes": True,
                                           "architecture": "simple_cnn"
                                       })
            _nlp.add_pipe(textcat, last=True)
        # otherwise, get it, so we can add labels to it
        else:
            textcat = _nlp.get_pipe("textcat")

        # add label to text classifier
        self.logging.info(f"Categories: {categories}")
        for cat in categories:
            textcat.add_label(cat)

        # cap the number of training examples
        train_texts = train_texts[:n_texts]
        train_cats = train_cats[:n_texts]

        self.logging.info(
            f"Using {n_texts} examples ({len(train_texts)} training, {len(dev_texts)} evaluation)"
        )
        train_data = list(
            zip(train_texts, [{
                "cats": cats
            } for cats in train_cats]))

        # get names of other pipes to disable them during training
        pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in _nlp.pipe_names if pipe not in pipe_exceptions
        ]
        with _nlp.disable_pipes(*other_pipes):  # only train textcat
            # NOTE(review): begin_training() is also called when an existing
            # project model was loaded above -- confirm that re-initialising
            # it is intended.
            optimizer = _nlp.begin_training()
            # init_tok2vec is always None here, so this branch never runs
            if init_tok2vec is not None:
                with init_tok2vec.open("rb") as file_:
                    textcat.model.tok2vec.from_bytes(file_.read())
            self.logging.info("Training the model...")
            self.logging.debug("{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
                "LOSS", "P", "R", "F"))
            batch_sizes = compounding(4.0, 32.0, 1.001)
            dropout = decaying(0.6, 0.2, 1e-4)
            for _i in range(n_iter):
                losses = {}
                # batch up the examples using spaCy's minibatch
                random.shuffle(train_data)
                batches = minibatch(train_data, size=batch_sizes)
                for batch in batches:
                    texts, annotations = zip(*batch)
                    _nlp.update(texts,
                                annotations,
                                sgd=optimizer,
                                drop=next(dropout),
                                losses=losses)
                    #_nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
                with textcat.model.use_params(optimizer.averages):
                    # evaluate on the dev data split off in load_data()
                    scores = self.evaluate(_nlp.tokenizer, textcat, dev_texts,
                                           dev_cats)
                self.logging.debug("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                                   format(  # print a simple table
                                       losses["textcat"],
                                       scores["textcat_p"],
                                       scores["textcat_r"],
                                       scores["textcat_f"],
                                   ))

        # save the model
        with _nlp.use_params(optimizer.averages):
            _nlp.to_disk(model_file)
        self.logging.info(f"Saved model to {model_file}")

        # Load the saved model
        self.loaded_models[f"{proj['id']}/{lang}"] = spacy.load(model_file)
        self.logging.debug(self.loaded_models)
def modelSpacy(model=None,
               new_model_name="Product",
               output_dir=None,
               n_iter=40,
               training_data=None,
               validation_data=None,
               validation_plot=False,
               dropout=(0.35, 0.35, 1),
               batch=(1., 32., 1.001),
               verbose=1):
    """Train a spaCy NER model for the "PRODUCTS" label.

    model -- name/path of a model to resume from; a blank "en" model when None
    new_model_name -- value stored in nlp.meta["name"] before saving
    output_dir -- directory to save the model to; nothing is saved when None
    n_iter -- the training loop runs for n_iter + 1 iterations (0..n_iter)
    training_data -- list of (text, annotations) pairs; shuffled in place
    validation_data -- optional examples scored with spacyEvaluate after
        every iteration; shuffled in place
    validation_plot -- not used by this function
    dropout -- (start, end, decay) passed to ``decaying``
    batch -- (start, end, compound) passed to ``compounding``
    verbose -- progress is printed when >= 1

    Returns (nlp, lossesList, metricsLocal). lossesList starts with
    len(training_data) followed by the integer NER loss of each iteration;
    metricsLocal maps "precision", "recall" and "f1score" to per-iteration
    lists that start with 0.
    """
    # IMPORT LIBRARIES
    from bloo.mlMaster import spacyEvaluate
    from spacy.util import decaying

    # DECLARE VARIABLES
    lossesList = [len(training_data)]
    metricsLocal = {"precision": [0], "recall": [0], "f1score": [0]}

    # We want to reproduce the same random situation in each test
    random.seed(0)

    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank("en")
        if verbose >= 1:
            print("Created a blank 'en' model")

    # Now we add the NER recognizer to the model
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # Add the new entity label to entity recognizer
    ner.add_label("PRODUCTS")

    # Decide whether to start or to resume training
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Get names of other pipes to disable them during training.
    # "trf_wordpiecer" was misspelled before, which disabled that pipe.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    # Only train the given NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        sizes = compounding(batch[0], batch[1], batch[2])
        dropout_schedule = decaying(dropout[0], dropout[1], dropout[2])
        # Remember the rate last used so that reporting it does not consume
        # (and thereby skip) a value of the schedule.
        current_drop = dropout[0]

        for itn in range(n_iter + 1):
            random.shuffle(training_data)
            # batch up the examples using spacy's mini batch
            batches = minibatch(training_data, size=sizes)
            losses = {}
            for examples in batches:
                texts, annotations = zip(*examples)
                current_drop = next(dropout_schedule)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=current_drop,
                           losses=losses)

            # Print information about the current iteration after it is
            # finished so we can better visualize the progress
            if verbose >= 1:
                print("Losses after iteration %i: %s" % (itn, str(losses)))
                print("Current dropout rate: %.2f" % (current_drop))

            # Add the current loss to the list of losses, so we can plot it later
            lossesList.append(int(losses['ner']))

            # After each iteration, validate the data and record the results
            if validation_data is not None:
                random.shuffle(validation_data)
                metrics = spacyEvaluate(ner_model=nlp,
                                        examples=validation_data)
                print(
                    "Metrics after iteration %i: PRECISION: %.2f%% | RECALL: %.2f%% | F1SCORE: %.2f%% |"
                    % (itn, metrics['precision'], metrics['recall'],
                       metrics['f1score']))
                for key, history in metricsLocal.items():
                    history.append(metrics[key])

    # SAVE THE MODEL
    if output_dir is not None:
        print("Spacy model: Saving the model in the output directory: \"%s\"" %
              (str(output_dir)))
        output_dir = Path(output_dir)
        # parents/exist_ok: works for nested paths and tolerates an existing dir.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Spacy model: saved successfully.")

    return nlp, lossesList, metricsLocal
def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer.

    The model is trained with a whitespace tokenizer. After every epoch it is
    saved (with its original tokenizer) to ``<output_dir>/<epoch>`` and
    evaluated on dev_data; the epoch with the best "f1-measure-overall" is
    copied to ``<output_dir>/best`` and evaluated on dev_data and test_data,
    writing dev_metrics.json and test_metrics.json into output_dir.

    model -- name/path of the spaCy model to load; blank 'en' when None
    train_data -- list of (doc, gold) pairs whose annotations expose
        ``.get('entities')``; shuffled in place
    dev_data, test_data -- passed to ``evaluate_ner``
    output_dir -- directory that receives the per-epoch and best models
    n_iter -- number of training epochs
    meta_overrides -- optional path to a JSON file merged into nlp.meta
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        # Context manager so the file handle is closed after reading.
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        if "parser" in nlp.pipe_names:
            nlp.add_pipe(ner, after="parser")
        elif "tagger" in nlp.pipe_names:
            nlp.add_pipe(ner, after="tagger")
        else:
            nlp.add_pipe(ner, last=True)
    else:
        # otherwise, get it so we can add labels
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

    output_root = Path(output_dir)
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)
        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer, losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    # report the running loss every 100 batches
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = output_root / str(i)
        output_dir_path.mkdir(parents=True, exist_ok=True)
        # The model is saved with its original tokenizer, but training must
        # continue with the whitespace tokenizer, so it is swapped back in
        # once the save is done (previously it stayed replaced after epoch 0).
        training_tokenizer = nlp.tokenizer
        nlp.tokenizer = original_tokenizer
        try:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(output_dir_path)
        finally:
            nlp.tokenizer = training_tokenizer
        print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # save the best epoch's model to <output_dir>/best
    best_model_path = output_root / "best"
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)),
                    best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2, dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
Created on Sat Sep 1 11:26:50 2018 @author: Gurunath """ from __future__ import unicode_literals, print_function import plac import random import pandas as pd from pathlib import Path import thinc.extra.datasets import spacy from spacy.util import minibatch, compounding from spacy.util import decaying dropout = decaying(0.6, 0.2, 1e-4) tweet_df = pd.read_csv( r'F:\E\Learning_DL_fastai\competition\NLP_data\train_2kmZucJ.csv') def get_batches(train_data, model_type): max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64} max_batch_size = max_batch_sizes[model_type] if len(train_data) < 1000: max_batch_size /= 2 if len(train_data) < 500: max_batch_size /= 2 batch_size = compounding(1, max_batch_size, 1.001) batches = minibatch(train_data, size=batch_size) return batches
def train(model=None, output_dir=None, n_iter=20, n_texts=2000, categories=[],
          train_texts=[], train_cats=[], dev_texts=[], dev_cats=[]):
    """Train a text classifier on the given texts and optionally save it.

    model -- name/path of the spaCy model to load; blank 'en' when None
    output_dir -- directory to save the model to; nothing is saved when None
    n_iter -- number of training iterations
    n_texts -- only reported in the progress message
    categories -- labels added to the text classifier
    train_texts, train_cats -- training texts and their category dicts
    dev_texts, dev_cats -- held-out data passed to ``evaluate``

    The list defaults are only read, never modified, by this function.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    for category in categories:
        textcat.add_label(category)

    print("Loading categorisation data...")
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('Iter #', 'LOSS', 'P',
                                                         'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            # Training uses a fixed dropout of 0.2; the decaying schedule that
            # used to be rebuilt here every epoch was never read.
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents/exist_ok: works for nested paths and tolerates an existing dir.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
textcat = nlp_model.create_pipe('textcat') nlp_model.add_pipe(textcat, last=True) # add label to text classifier for custom_label in [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ]: # Enter custom labels here <--------------------- textcat.add_label(custom_label) # otherwise, get it, so we can add labels to it else: print('model already contains textcat!') textcat = nlp_model.get_pipe('textcat') optimizer = textcat.begin_training() dropout = decaying(drop_max, drop_min, drop_step) # store for evaluation precisions = list() recalls = list() f_scores = list() loss_vals = list() data_pds = pd.read_csv(nrows=n_texts, filepath_or_buffer=data_filepath, chunksize=chunk_size, header=0, names=[ "id", "comment_text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ])
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    """Train a three-label (Neutral/Bullish/Bearish) text classifier on tweets.

    Args:
        model: Name or path of an existing spaCy model to load. When ``None``
            a blank English pipeline is created.
        output_dir: Directory to save the trained pipeline to. Falls back to
            ``tweetsClassifier/spacyTrainingModel`` when ``None``.
        n_iter: Number of passes over the training data.
        n_texts: Limit forwarded to ``load_data_2``.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    else:
        # otherwise, get it, so we can add labels to it
        textcat = nlp.get_pipe('textcat')

    # add labels to the text classifier
    textcat.add_label('Neutral')
    textcat.add_label('Bullish')
    textcat.add_label('Bearish')

    print("Loading tweets data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data_2(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts * 2, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    # One generator for the whole run, so dropout keeps decaying across epochs.
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(2., 8., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # Evaluation is best-effort: a failure is reported and
                # training carries on with the next epoch.
                try:
                    scores = evaluate(nlp.tokenizer, textcat, dev_texts,
                                      dev_cats)
                    print(
                        '{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                        .format(losses['textcat'], scores['textcat_p'],
                                scores['textcat_r'], scores['textcat_f']))
                except Exception as e:
                    print(e)

    # test the trained model
    test_text = "#aapl buy for 250m the market!!!"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is None:
        output_dir = 'tweetsClassifier/spacyTrainingModel'
    # Always coerce to Path: a caller-supplied string previously reached
    # ``.exists()`` unconverted and raised AttributeError.
    output_dir = Path(output_dir)
    # The default path is nested, so missing parents must be created as well.
    output_dir.mkdir(parents=True, exist_ok=True)
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    test_text = "long #aapl for 250m the market!!!"
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    print(test_text, doc2.cats)
def train_spacy(data, iterations):
    """Train or resume a spaCy NER model and log the per-iteration loss.

    Reads the module-level names ``model`` (model to resume from, or ``None``
    for a blank "id" pipeline), ``outlog_txt`` (loss log file, appended to)
    and ``output_dir`` (string prefix for the checkpoints written every 100
    iterations).

    Args:
        data: List of ``(text, annotations)`` pairs; each ``annotations``
            dict provides an ``'entities'`` list whose items carry the label
            at index 2. The list is shuffled in place on every iteration.
        iterations: Number of passes over the data.

    Returns:
        The trained ``nlp`` pipeline.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("id")  # create blank Language class
        print("Created blank 'indo' model")

    TRAIN_DATA = data

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        print("resuming")
        optimizer = nlp.resume_training()
        print(optimizer.learn_rate)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    # One batch-size schedule for the whole run, so it keeps compounding
    # across iterations.
    sizes = compounding(4., 32., 1.001)

    with nlp.disable_pipes(*other_pipes):  # only train NER
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        optimizer.learn_rate = 0.001
        for itn in range(iterations):
            start = time.time()  # iteration time

            if itn % 100 == 0 and itn != 0:
                # checkpoint the model every 100 iterations
                modelfile = output_dir + "training_model" + str(itn)
                nlp.to_disk(modelfile)

            # Reduce the learning rate at fixed iterations
            if itn == 100:
                optimizer.learn_rate = 0.0005
            if itn == 150:
                optimizer.learn_rate = 0.0001

            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}

            # train in mini-batches with a fixed dropout of 0.3
            batches = minibatch(TRAIN_DATA[:int(len(TRAIN_DATA) * 1)],
                                size=sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.3,
                           losses=losses)

            print("Losses", losses)
            # The log is opened only for the write and inside a context
            # manager, so the handle is not leaked if training raises.
            with open(outlog_txt, 'a') as log_file:
                log_file.write(str(itn) + "," + str(losses['ner']) + "\n")
            print("time for iteration:", time.time() - start)
    return nlp
def train_textcat_model(
    load_data_func: Callable[
        [],
        Tuple[List[Tuple[Any, Dict[str, Dict[str, bool]]]],
              List[Tuple[Any, Dict[str, Dict[str, bool]]]]]
    ],
    n_iter: int = 20,
    max_texts: int = 2000,
    model: Optional[str] = None,
    output_dir: str = '/tmp/model',
    labels: Optional[Iterable[str]] = None,
    test_text: Optional[str] = None
) -> None:
    """Train a spaCy text classifier and save it to ``output_dir``.

    Args:
        load_data_func: Zero-argument callable returning
            ``(train_data, eval_data)``.
        n_iter: Number of passes over the training data.
        max_texts: Only used in the summary line printed before training.
        model: Existing spaCy model to start from; a blank English pipeline
            is created when falsy.
        output_dir: Directory the trained pipeline is written to.
        labels: Labels to register on the text classifier.
        test_text: When given, the saved model is passed to
            ``test_textcat_model`` together with this text.

    Raises:
        ValueError: If training data, evaluation data, labels or the output
            directory are missing.
    """
    # Load data and verify there is some. ValueError subclasses Exception,
    # so callers that caught the former bare Exception keep working.
    train_data, eval_data = load_data_func()
    if not train_data:
        raise ValueError('There is no data provided to train')
    if not eval_data:
        raise ValueError('There is no data provided to evaluate the trained model')
    if not labels:
        raise ValueError('No labels were provided to train')
    if not output_dir:
        raise ValueError('Output dir must be specified')

    if model:
        nlp = spacy.load(model)
        print(f'Loaded model "{model}"')
    else:
        nlp = spacy.blank("en")
        print('Created blank "en" model')

    # Add the text classifier to the pipeline if it doesn't exist
    if 'textcat' not in nlp.pipe_names:
        # nlp.create_pipe works for built-ins that are registered with spaCy
        textcat = nlp.create_pipe(
            'textcat',
            config={'exclusive_classes': True, 'architecture': 'simple_cnn'}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        # Otherwise, get it, so we can add labels to it
        textcat = nlp.get_pipe('textcat')

    # Add labels to text classifier
    for label in labels:
        textcat.add_label(label)

    print(
        "Using max {} examples ({} training, {} evaluation)".format(
            max_texts, len(train_data), len(eval_data)
        )
    )

    # We mainly have small data sets, so it's recommended to use a high
    # dropout rate at first
    # From https://spacy.io/usage/training#tips-dropout
    dropout = decaying(0.6, 0.2, 1e-4)

    # Get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        print('Training the model...')
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out data with averaged parameters
                scores = _evaluate(nlp.tokenizer, textcat, eval_data)
            # Print a simple table
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # Create the output dir, including missing parents; an existing
    # directory is fine.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Use the averages when writing out the model
    # From https://spacy.io/usage/training#tips-param-avg
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print(f'Saved model to {output_dir}')

    # test the saved model
    if test_text:
        print(f'Loading saved model from {output_dir}')
        test_textcat_model(output_dir, test_text)
def custom_train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    When no ``base_model`` is given, the language class's default tag map is
    replaced with ``custom_tag_map`` before the blank pipeline is created.
    After every epoch the model is written to ``output_path/model<i>``
    together with ``accuracy.json`` and ``meta.json``; on exit (including on
    error) ``output_path/model-final`` is written and a best model is
    collated.

    Notable arguments:
        pipeline: Comma-separated component names to train.
        eval_beam_widths: Comma-separated beam widths to evaluate with;
            width 1 is always included.
        n_early_stopping: Stop after this many consecutive epochs whose
            score is below the best score seen so far (``None`` disables).
        use_gpu: A negative value means evaluation speed is measured on CPU
            only.
    """
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Only sub-directories count as "not empty" here; plain files are ignored.
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    # NOTE(review): the fallbacks passed to env_opt below are 0.2 -> 0.2 with
    # decay 0.0 for dropout and 100 -> 1000 for batch size, which does not
    # match the description above -- confirm which one is intended.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    # Parse the beam widths; width 1 is always evaluated.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    # An existing textcat must have been built with the same
                    # options that were requested for this run.
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        ### Here are our modifications:
        # The tag map is swapped on the class defaults before instantiation.
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    # Switch the component to multilabel mode and stop
                    # scanning at the first such instance.
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        # Report which textcat score will be shown in the results table.
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    # The progress bar is only advanced when LOG_FRIENDLY is
                    # unset or 0.
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                # Save this epoch's model, then reload it from disk so that
                # evaluation runs on the serialized model.
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Reload and evaluate a second time on CPU to obtain
                        # the CPU words-per-second figure.
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                ))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs,
                                                         verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    # setdefault: a name/version already present in meta
                    # (e.g. loaded from meta_path) is kept.
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score, current_score))
                        break
    finally:
        # Runs even if training raised or was interrupted, so a final model
        # is always written.
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)
def update(self):
    """Train (or update) a spaCy v2.1 dependency parser and save it.

    Reads ``self.gpu``, ``self.train_path``, ``self.model``, ``self.lang``,
    ``self.n_iter`` and ``self.output_dir``. When ``self.output_dir`` is set
    it is replaced by the corresponding ``Path`` and the model is written
    there.

    Side effects: reseeds the global ``random`` module with 777.
    """
    dropout = decaying(0.5, 0.2, 1e-4)
    self.require_gpu(self.gpu)

    # getting data in spacy required format
    data = self.get_data(self.train_path)
    random.seed(777)
    random.shuffle(data)

    if self.model is not None:
        nlp = spacy.load(self.model)  # load existing spaCy model
        print("Loaded model '%s'" % self.model)
    else:
        nlp = spacy.blank(self.lang)  # create blank Language class
        print("Created blank '%s' model" % self.lang)

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    else:
        # otherwise, get it, so we can add labels to it
        parser = nlp.get_pipe("parser")

    # Turn the pre-tokenized samples into (Doc, GoldParse) pairs. The Doc is
    # built once per sample; previously it was constructed three times.
    new_data = []
    for dat in data:
        doc = Doc(nlp.vocab, words=dat[0])
        assert (len(doc) == len(dat[1]['deps']))
        assert (len(doc) == len(dat[1]['heads']))
        new_data.append(
            (doc, GoldParse(doc, heads=dat[1]['heads'], deps=dat[1]['deps'])))

    # add labels to the parser
    for _, annotations in data:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    pretrain_weights_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'pretrained_weights', 'pretrained_weights.bin')
    if os.path.exists(pretrain_weights_path):
        # loading pretrained weights
        with open(pretrain_weights_path, "rb") as file_:
            nlp.from_bytes(file_.read())
        print('LOADED from PRETRAIN %s' % pretrain_weights_path)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(self.n_iter):
            random.shuffle(new_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(new_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                print(type(texts[0]))
                print(annotations[0])
                nlp.update(texts, annotations, sgd=optimizer, losses=losses,
                           drop=next(dropout))
            print("Losses", losses)

    # save model to output directory
    if self.output_dir is not None:
        self.output_dir = Path(self.output_dir)
        # Create missing parents too and tolerate an existing directory.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = "Custom-launguage-model"  # rename model
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(self.output_dir)
        print("Saved model to", self.output_dir)
def train_intent(self, nlp, output_dir, train_data, n_iter, dropout):
    """Train the text classifier (intent) component and save the model.

    Appends the losses of every iteration to ``self.losses_cat`` and the
    ``[p, r, f]`` result of ``self.evaluate_intent`` to ``self.cat_scores``.

    Keyword arguments:
    nlp -- pipeline to train; resumed via ``nlp.resume_training()``
    output_dir -- path where model is saved at (nothing is saved if None)
    train_data -- list of 3-tuples; the first item is the text and the third
        the annotations, the second is ignored here. Shuffled in place.
    n_iter -- amount of times data is trained with
    dropout -- ignored: it is immediately replaced by a decaying schedule
        (0.6 -> 0.2, step 1e-4)

    Returns:
    output_dir -- path to model (``None`` if nothing was saved)
    """
    # NOTE(review): the caller-supplied ``dropout`` is overwritten here, as
    # in the original implementation -- confirm whether it should be honoured.
    dropout = decaying(0.6, 0.2, 1e-4)
    pipe_exceptions = ["trf_textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    disabled = nlp.disable_pipes(*other_pipes)
    try:
        logging.info("Started training intents...")
        optimizer = nlp.resume_training()
        optimizer.alpha = 0.001
        optimizer.trf_weight_decay = 0.005
        optimizer.L2 = 0.0
        learn_rate = 2e-5
        batch_size = 8
        learn_rates = cyclic_triangular_rate(learn_rate / 3, learn_rate * 3,
                                             2 * len(train_data) // batch_size)
        for iteration in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data,
                                size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                optimizer.trf_lr = next(learn_rates)
                texts, _, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,
                    drop=next(dropout),  # dropout - make it harder to memorise data
                    losses=losses)
            self.losses_cat.append(losses)
            p, r, f = self.evaluate_intent(nlp)
            self.cat_scores.append([p, r, f])
            logging.info(
                "Finished %s iteration for text classification with %s losses",
                iteration, losses)
        logging.info("Finished training intents...")
    finally:
        # Re-enable the other pipes even if training raised; otherwise the
        # caller's pipeline would be left partially disabled.
        disabled.restore()

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parents too and tolerate an existing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        logging.info("Saved model to %s", output_dir)
    return output_dir
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model, with the default, en_core_web_sm vocab.
    Training setup is mostly copied from the spacy cli train command.

    Runs a fixed 20 epochs, saves every epoch to ``model_output_dir/model<i>``,
    copies the epoch with the highest dev UAS to ``model_output_dir/best`` and
    writes the test scores to ``genia_test.json`` (and ``ontonotes_test.json``
    when ontonotes is used).

    @param train_json_path: path to the conll formatted training data
    @param dev_json_path: path to the conll formatted dev data
    @param test_json_path: path to the conll formatted test data
    @param model_output_dir: path to the output directory for the trained models
    @param model_path: path to the model to load
    @param ontonotes_path: path to the directory containing ontonotes in spacy format (optional)
    @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    # Both corpora share the training file; the second path is what
    # ``dev_docs`` reads (dev set resp. test set).
    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    # Start and stop are both 0.2 with zero decay, i.e. a constant dropout.
    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(train_corpus.train_tuples, onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)
        nlp._optimizer = None

    train_docs = train_corpus.train_docs(nlp)
    train_docs = list(train_docs)

    # ``train_mixture`` is the same list object as ``train_docs``: the ``+=``
    # below extends it in place, and the per-epoch shuffle of
    # ``train_mixture`` is what reorders the ``train_docs`` that are batched.
    train_mixture = train_docs
    if ontonotes_path:
        onto_train_docs = onto_train_corpus.train_docs(nlp)
        onto_train_docs = list(onto_train_docs)
        num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
        randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
        train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model"+str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)

            # Written after to_disk, so this replaces the meta.json in the
            # epoch directory with ``meta``.
            with open(os.path.join(model_output_dir, "model"+str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            # Evaluate the model as reloaded from disk.
            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords/(end_time-start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)

        # Track the epoch with the best dev UAS; ties keep the earlier epoch.
        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i
        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            # The ontonotes row reuses the speed figures measured on the
            # main dev set.
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    # The timer starts before the test docs are loaded, so the reported
    # words-per-second includes loading time.
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords/(end_time-start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)
    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
def train_NER(model=None, new_model_name="skill",
              output_dir='./models/train_textrank_labels', n_iter=5):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Trains the ``ner`` component on the module-level ``TRAIN_DATA`` (which is
    shuffled in place), plots the per-iteration NER loss and optionally saves
    the resulting pipeline.

    Args:
        model: Name or path of a spaCy model to start from (for example
            ``'en_core_web_sm'``). ``None`` starts from a blank English model.
        new_model_name: Value written to ``nlp.meta["name"]`` before saving.
        output_dir: Directory the trained pipeline is written to. ``None``
            skips saving. Missing parent directories are created.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline does not have one yet;
    # otherwise fetch the existing one so the new label can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label('SKILL')  # add new entity label to entity recognizer

    # A blank model needs freshly initialised weights; an existing model
    # keeps its weights and only gets an optimizer.
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    loss_dict = {}
    # Disable every other pipe so that only the NER weights are updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # The batch-size schedule is shared across iterations, so batches
        # keep growing instead of restarting at 1.0 every epoch.
        sizes = compounding(1.0, 4.0, 1.001)
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in minibatch(TRAIN_DATA, size=sizes):
                texts, annotations = zip(*batch)
                # Fixed dropout; a decaying schedule such as
                # decaying(0.2, 0, 0.02) could be used instead.
                nlp.update(texts, annotations, sgd=optimizer, drop=0.05,
                           losses=losses)
            # No batch was processed when TRAIN_DATA is empty, in which case
            # the 'ner' key is never created.
            loss_dict[itn] = losses.get('ner', 0.0)
            print("Losses", losses)

    # Nothing to plot when n_iter is 0 (unpacking an empty zip would fail).
    if loss_dict:
        x, y = zip(*sorted(loss_dict.items()))
        plt.plot(x, y)
        plt.show()

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # The default path is nested, so parents must be created as well.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
def training(train_texts, train_cats, dev_texts, dev_cats, test_texts,
             test_cats, L2, learn_rate, n_iter, output_dir=None):
    """Train the citation-needed text classifier and optionally save it.

    Modified version of the spaCy text classification example. A ``textcat``
    component with the exclusive labels ``POSITIVE`` and ``NEGATIVE`` is added
    to ``en_core_web_sm`` and trained while all other pipes are disabled.

    Parameters:
        train_texts : list of str - training texts
        train_cats  : list - per-text category annotation, passed to spaCy
                      as ``{'cats': cats}``
        dev_texts   : list of str - validation texts
        dev_cats    : list - category annotations for the validation texts
        test_texts  : list of str - test texts
        test_cats   : list - category annotations for the test texts
        L2          : float - L2 regularization set on the optimizer
        learn_rate  : float - learning rate set on the optimizer
        n_iter      : int - number of training epochs
        output_dir  : str or None - directory to save the model to;
                      ``None`` skips saving

    Returns:
        (train_results, dev_results, test_results): three lists holding the
        score dict returned by ``evaluate`` for each epoch on the training,
        validation and test data respectively.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    nlp = spacy.load('en_core_web_sm')

    # Adding the built-in textcat component to the pipeline.
    textcat = nlp.create_pipe("textcat", config={
        "exclusive_classes": True,
        "architecture": "simple_cnn"
    })
    nlp.add_pipe(textcat, last=True)

    # Adding the labels to textcat
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # Disabling other components so that only textcat is trained
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        optimizer.L2 = L2
        optimizer.learn_rate = learn_rate

        # Dropout is a probability and must stay below 1.0. The previous
        # schedule decaying(10.0, 1.0, 0.001) never produced a value below
        # 1.0, so the schedule from the spaCy example is used instead.
        dec = decaying(0.6, 0.2, 1e-4)

        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format(
            'LOSS', 'A_train', 'A_dev', 'A_test', 'P', 'R', 'F'))

        train_results = []
        dev_results = []
        test_results = []

        # Performing training
        for i in range(n_iter):
            losses = {}
            train_data = list(
                zip(train_texts, [{'cats': cats} for cats in train_cats]))
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dec),
                           losses=losses)

            # Training scores are computed with the current weights.
            scores1 = evaluate(nlp.tokenizer, textcat, train_texts,
                               train_cats)
            train_results.append(scores1)

            # Validation and test scores use the averaged parameters.
            with textcat.model.use_params(optimizer.averages):
                scores2 = evaluate(nlp.tokenizer, textcat, dev_texts,
                                   dev_cats)
                scores3 = evaluate(nlp.tokenizer, textcat, test_texts,
                                   test_cats)
            dev_results.append(scores2)
            test_results.append(scores3)

            # P, R and F in the printed row refer to the training data.
            print(
                '{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}\t{5:.4f}\t{6:.4f}'
                .format(losses['textcat'], scores1['textcat_a'],
                        scores2['textcat_a'], scores3['textcat_a'],
                        scores1['textcat_p'], scores1['textcat_r'],
                        scores1['textcat_f']))

    if output_dir is not None:
        # Save the averaged parameters rather than the last raw update.
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return train_results, dev_results, test_results
def train(pretrained, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          parser_multitasks='', entity_multitasks='', use_gpu=-1,
          no_tagger=False, no_parser=False, no_entities=False,
          gold_preproc=False, version="0.0.0", meta_path=None, verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON format.
    This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.

    After every epoch the pipeline is written to ``<output_dir>/model<i>``
    together with ``accuracy.json`` and ``meta.json``, reloaded from disk and
    evaluated on the dev data. The final pipeline is always written to
    ``<output_dir>/model-final``, even when training is interrupted.

    Parameters:
        pretrained: model passed to ``load`` to obtain the starting pipeline.
        output_dir: directory that receives the per-epoch and final models.
        train_data: path to the training data.
        dev_data: path to the development data.
        n_iter: number of training epochs.
        n_sents: limit passed to ``GoldCorpus``; 0 is converted to ``None``.
        parser_multitasks: comma-separated objectives added to the parser.
        entity_multitasks: comma-separated objectives added to the entity
            recognizer.
        use_gpu: device id passed to ``begin_training``; negative values are
            treated as CPU when measuring speed.
        no_tagger, no_parser, no_entities: remove the respective component
            from the loaded pipeline before training.
        gold_preproc: forwarded to ``train_docs`` and ``dev_docs``.
        version: default ``version`` entry for the written meta.
        meta_path: optional path to a JSON file with meta to start from.
        verbose: forwarded to ``evaluate`` for the first dev evaluation.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)
    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    # Validate all input locations up front; prints(..., exits=1) presumably
    # terminates the process - verify against the helper.
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052, exits=1)

    # Take dropout and batch size as generators of values. With the defaults
    # used here the dropout rate is constant (from 0.2 to 0.2 with a decay of
    # 0.0), while the batch size starts at 1 and compounds up to 16 so that
    # updates are made quickly at the beginning of training. The values are
    # looked up through util.env_opt (presumably overridable via the
    # environment - confirm).
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')
    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                   device=use_gpu)
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)

    # Header of the progress table printed once per epoch.
    print(
        "Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
    )
    try:
        train_docs = corpus.train_docs(nlp, projectivize=True,
                                       noise_level=0.0,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    # Drop documents that reach max_doc_len; skip the batch
                    # entirely if nothing is left.
                    batch = [(d, g) for (d, g) in batch
                             if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            # Save and evaluate this epoch using the averaged parameters.
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                # Evaluate the model as reloaded from disk, not the
                # in-memory pipeline.
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    # The first run measured GPU speed; reload and evaluate
                    # again on the CPU to obtain the CPU words-per-second.
                    # The scorer from this second run is the one reported.
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(
                            epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded,
                                            gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {
                    'nwords': nwords,
                    'cpu': cpu_wps,
                    'gpu': gpu_wps
                }
                meta['vectors'] = {
                    'width': nlp.vocab.vectors_length,
                    'vectors': len(nlp.vocab.vectors),
                    'keys': nlp.vocab.vectors.n_keys
                }
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                # 'name' was already defaulted to 'unnamed' above, so this
                # setdefault never changes it.
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)

                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        # Always persist the latest state, also after an error or interrupt.
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)
def retrain_model(project, model=None, n_iter=30):
    """Load the model, set up the pipeline and train the entity recognizer.

    The project keeps two model folders, ``model_1`` and ``model_2``, next to
    its texts file. The retrained model is written to whichever of the two is
    not the input ``model``.

    Args:
        project: Primary key of the ``al_project`` to retrain.
        model: Folder name of the current model inside the project
            directory. It is joined into a path, so the default ``None`` is
            not usable and a folder name must be passed.
        n_iter: Number of training iterations.

    Returns:
        dict with the keys ``folder``, ``retrained`` and ``project``. When
        retraining was performed it also contains ``f1`` and ``folder``
        points at the new model; otherwise ``folder`` points at the input
        model and ``retrained`` is ``False``.
    """
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    # Alternate between the two model folders.
    output_model = 'model_2' if model == 'model_1' else 'model_1'

    al_project = ContentType.objects.get(
        app_label="spacyal", model="al_project").model_class()
    project = al_project.objects.get(pk=project)
    base_d = os.path.dirname(project.texts.path)
    output_dir = os.path.join(base_d, output_model)

    # Not enough new training data yet: keep using the current model.
    if project.len_training_data() < project.num_retrain:
        return {'folder': os.path.join(base_d, model),
                'retrained': False,
                'project': project.pk}

    TRAIN_DATA, eval_data, hist_object = project.get_training_data(
        include_all=True, include_negative=True)
    nlp = spacy.load(os.path.join(base_d, model))  # load existing spaCy model

    # With exactly one history entry, store an additional entry holding the
    # scores of the model before retraining.
    if project.project_history_set.all().count() == 1:
        project_history = ContentType.objects.get(
            app_label="spacyal", model="project_history").model_class()
        ev = test_model(eval_data, nlp)
        f1 = ev.compute_f1()
        hist2 = project_history.objects.create(
            project=project, eval_f1=f1['fbeta'],
            eval_precission=f1['precission'], eval_recall=f1['recall'])
        hist2.cases_training.add(*list(hist_object.cases_training.all()))
        hist2.cases_evaluation.add(*list(hist_object.cases_evaluation.all()))

    TRAIN_DATA = mix_train_data(nlp, TRAIN_DATA)
    with open(os.path.join(base_d, 'training_data.json'), 'w') as outp:
        json.dump(TRAIN_DATA, outp)

    # Fails early if the loaded model has no NER component.
    nlp.get_pipe('ner')
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Report progress roughly every 10% of the iterations. The step is at
    # least 1 so that n_iter < 10 does not produce a zero step.
    progress_step = max(1, n_iter // 10)
    with nlp.disable_pipes(*other_pipes):
        # NOTE(review): depending on the spaCy version, begin_training() may
        # re-initialise weights of the loaded model - confirm this is the
        # intended way to continue training here.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            if itn % progress_step == 0:
                current_task.update_state(
                    state='PROGRESS',
                    meta={'progress': itn * 100 // n_iter,
                          'model': output_model,
                          'project': project.pk})
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in util.minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=next(dropout_rates),  # make it harder to memorise
                    sgd=optimizer,  # callable to update weights
                    losses=losses)

    Path(output_dir).mkdir(exist_ok=True)
    nlp.to_disk(output_dir)

    print(eval_data)
    ev = test_model(eval_data, nlp)
    f1 = ev.compute_f1()
    hist_object.eval_f1 = f1['fbeta']
    hist_object.eval_precission = f1['precission']
    hist_object.eval_recall = f1['recall']
    hist_object.model_path = output_dir
    hist_object.save()
    return {'folder': output_dir,
            'retrained': True,
            'project': project.pk,
            'f1': f1}
def main(model=None, new_model_name='DCC_ent', input_dir=input_dir,
         saved_model_dir=model_dir, output_dir=output_dir, test_dir=test_dir,
         n_iter=n_iter):
    """Train an entity recognizer on Brat annotations, then test and save it.

    Args:
        model: Name or path of an existing spaCy model to extend. ``None``
            starts from a blank English model.
        new_model_name: Value written to ``nlp.meta['name']`` before saving.
        input_dir: Directory passed to ``create_training_data``.
        saved_model_dir: Directory the trained model is written to. ``None``
            skips saving. Missing parent directories are created.
        output_dir: Output location passed to ``test_ner_model``.
        test_dir: Directory with text data for ``test_ner_model``. ``None``
            skips that test.
        n_iter: Number of training iterations.
    """
    random.seed(1234)

    # create the training from annotated data produced by using Brat
    data_reading_start_time = time.time()
    training_data = create_training_data(input_dir)
    data_reading_end_time = time.time()
    data_reading_time = data_reading_end_time - data_reading_start_time
    print("--->data reading time: ", data_reading_time)

    # check if the user provides an existing language model
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded existing model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("No model provided, created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline;
    # otherwise get it, so we can add labels to it.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    # add all new entities to the recognizer
    for label in new_entities_list:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero
        # out existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # start the training of the recognizer (and the time)
    training_start_time = time.time()
    for itn in range(n_iter):
        iter_start_time = time.time()
        random.shuffle(training_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(training_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # Skip the whole batch if any example has fewer than two
            # elements, since it cannot be unpacked into text and annotation.
            if any(len(example) < 2 for example in batch):
                continue
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                       losses=losses)
        iter_end_time = time.time()
        iter_elapsed_time = iter_end_time - iter_start_time
        print(' iter:', itn)
        print(' Losses', losses)
        print(' iter elapsed time:', iter_elapsed_time)
    training_end_time = time.time()
    print("training time: ", training_end_time - training_start_time)

    # test the ner model on a set of text data taken from papers
    # (if the user does not provide text data, no testing will be performed)
    if test_dir is not None:
        test_ner_model(nlp, test_dir, output_dir,
                       out_tag='_ents_from_existing_model')

    # model evaluation
    # define a set of examples that will be used as ground truth
    examples = [
        ('Deep learning is applied in many every day application with great success in object recognition.',
         [(0, 13, 'Method'), (77, 95, 'Task')]),
        ('Recurrent neural networks are used for forecasting and natural language processing.',
         [(0, 25, 'Method'), (39, 50, 'Task'), (55, 82, 'Task')]),
        ('Convolutional neural networks are frequently used in object recognition and medical image processing.',
         [(0, 29, 'Method'), (53, 72, 'Task'), (84, 101, 'Task')])
    ]
    res = ner_eval(nlp, examples)
    print("\nModel evaluation results:")
    print(res)

    # save trained model
    # (if the user does not provide a directory, the trained model will not
    # be saved)
    if saved_model_dir is not None:
        saved_model_dir = Path(saved_model_dir)
        saved_model_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(saved_model_dir)
        print("The model was saved to the directory: ", saved_model_dir)