def train_model(self, local_model, x_train, y_train, x_valid=None, y_valid=None, max_epoch=50):
    """Train `local_model` on (x_train, y_train) and return it.

    When early stopping is enabled, a validation generator is built from
    (x_valid, y_valid) and passed to the callbacks; otherwise any provided
    validation data is folded back into the training set.

    Args:
        local_model: the Keras model to fit.
        x_train, y_train: training sequences and labels.
        x_valid, y_valid: optional validation sequences and labels.
        max_epoch: maximum number of training epochs.

    Returns:
        The trained model.
    """
    # todo: if valid set is None and early_stop is enabled, create it as a
    # random segment of the shuffled train set
    if self.training_config.early_stop:
        training_generator = DataGenerator(
            x_train, y_train,
            batch_size=self.training_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=True)

        validation_generator = DataGenerator(
            x_valid, y_valid,
            batch_size=self.training_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=False)

        # 'eary_stopping' is the (misspelled) keyword expected by get_callbacks;
        # kept as-is for compatibility with that helper's signature
        callbacks = get_callbacks(log_dir=self.checkpoint_path,
                                  eary_stopping=True,
                                  valid=(validation_generator, self.preprocessor))
    else:
        # fold validation data back into the training set; guard against the
        # default None values — np.concatenate would raise on None
        if x_valid is not None and y_valid is not None:
            x_train = np.concatenate((x_train, x_valid), axis=0)
            y_train = np.concatenate((y_train, y_valid), axis=0)

        training_generator = DataGenerator(
            x_train, y_train,
            batch_size=self.training_config.batch_size,
            preprocessor=self.preprocessor,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=True)

        callbacks = get_callbacks(log_dir=self.checkpoint_path,
                                  eary_stopping=False)

    # multiple workers will not work with ELMo due to GPU memory limit;
    # 0 workers means training data preparation runs in the main thread
    nb_workers = 6
    use_multiprocessing = True
    if self.embeddings.use_ELMo:
        nb_workers = 0
        use_multiprocessing = False

    local_model.fit_generator(generator=training_generator,
                              epochs=max_epoch,
                              use_multiprocessing=use_multiprocessing,
                              workers=nb_workers,
                              callbacks=callbacks)

    return local_model
def tag(self, texts, output_format):
    """Tag a list of texts and return the predictions.

    Args:
        texts: list of raw strings to tokenize and tag.
        output_format: 'json' for a structured response dict, anything else
            for a list of (token, tag) pair lists.

    Returns:
        A dict with metadata and per-text entities when output_format is
        'json', otherwise a list of lists of (token, tag) tuples.
    """
    assert isinstance(texts, list)

    # string comparison must use '==' — 'is' checks object identity and only
    # worked by accident through CPython small-string interning
    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model.config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    predict_generator = DataGenerator(
        texts, None,
        batch_size=self.model_config.batch_size,
        preprocessor=self.preprocessor,
        char_embed_size=self.model_config.char_embedding_size,
        embeddings=self.embeddings,
        tokenize=True,
        shuffle=False)

    # multiple workers will not work with ELMo due to GPU memory limit
    # (with GTX 1080Ti 11GB); 0 workers runs prediction in the main thread
    nb_workers = 6
    multiprocessing = True
    if self.embeddings.use_ELMo:
        nb_workers = 0
        multiprocessing = False

    preds = self.model.predict_generator(
        generator=predict_generator,
        use_multiprocessing=multiprocessing,
        workers=nb_workers)

    for i in range(len(preds)):
        pred = [preds[i]]
        text = texts[i]
        tokens, offsets = tokenizeAndFilter(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)

        if output_format == 'json':
            piece = {}
            piece["text"] = text
            piece["entities"] = self._build_json_response(
                tokens, tags, prob, offsets)["entities"]
            res["texts"].append(piece)
        else:
            list_of_tags.append(list(zip(tokens, tags)))

    if output_format == 'json':
        return res
    else:
        return list_of_tags
def eval_nfold(self, x_test, y_test):
    """Evaluate every fold model on the test set, print per-fold and macro
    scores, and keep the best-scoring fold model as self.model."""
    if self.models is None:
        return

    fold_count = self.model_config.fold_number
    sum_f1 = 0
    sum_precision = 0
    sum_recall = 0
    best_f1, best_index = 0, 0
    worst_f1, worst_index = 1, 0
    reports = []

    for fold in range(fold_count):
        print(f'\n------------------------ fold {fold}'
              '--------------------------------------')

        # Prepare test data (steps, generator)
        fold_generator = DataGenerator(
            x_test, y_test,
            batch_size=self.training_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            embeddings=self.embeddings,
            shuffle=False)

        # Build the evaluator and score this fold's model
        scorer = Scorer(fold_generator, self.p, evaluation=True)
        scorer.model = self.models[fold]
        scorer.on_epoch_end(epoch=-1)

        reports.append(scorer.report)

        if scorer.f1 > best_f1:
            best_f1, best_index = scorer.f1, fold
        if scorer.f1 < worst_f1:
            worst_f1, worst_index = scorer.f1, fold

        sum_f1 += scorer.f1
        sum_precision += scorer.precision
        sum_recall += scorer.recall

    print("\naverage over", fold_count, "folds")
    print("\tmacro f1 =", sum_f1 / fold_count)
    print("\tmacro precision =", sum_precision / fold_count)
    print("\tmacro recall =", sum_recall / fold_count, "\n")

    print("\n** Worst ** model scores - \n")
    print(reports[worst_index])

    # promote the best fold model for subsequent use
    self.model = self.models[best_index]
    print("\n** Best ** model scores - \n")
    print(reports[best_index])
def tag(self, texts, output_format):
    """Tag a list of texts and return the predictions.

    Args:
        texts: list of raw strings to tokenize and tag.
        output_format: 'json' for a structured response dict, anything else
            for a list of (token, tag) pair lists.

    Returns:
        A dict with metadata and per-text entities when output_format is
        'json', otherwise a list of lists of (token, tag) tuples.
    """
    assert isinstance(texts, list)

    # string comparison must use '==' — 'is' checks object identity and only
    # worked by accident through CPython small-string interning
    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model.config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    predict_generator = DataGenerator(
        texts, None,
        batch_size=self.model_config.batch_size,
        preprocessor=self.preprocessor,
        char_embed_size=self.model_config.char_embedding_size,
        embeddings=self.embeddings,
        tokenize=True,
        shuffle=False)

    # multiple workers will not work with ELMo due to GPU memory limit;
    # with 0 workers, multiprocessing must be disabled too (consistent with
    # the sibling tag() implementations — the original left it True)
    nb_workers = 6
    multiprocessing = True
    if self.embeddings.use_ELMo:
        nb_workers = 0
        multiprocessing = False

    preds = self.model.predict_generator(
        generator=predict_generator,
        use_multiprocessing=multiprocessing,
        workers=nb_workers)

    for i in range(len(preds)):
        pred = [preds[i]]
        text = texts[i]
        tokens, offsets = tokenizeAndFilter(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)

        if output_format == 'json':
            piece = {}
            piece["text"] = text
            piece["entities"] = self._build_json_response(
                tokens, tags, prob, offsets)["entities"]
            res["texts"].append(piece)
        else:
            list_of_tags.append(list(zip(tokens, tags)))

    if output_format == 'json':
        return res
    else:
        return list_of_tags
def eval_single(self, x_test, y_test):
    """Evaluate self.model on the test set via a Scorer; raises OSError when
    no model has been loaded or trained."""
    # guard clause: fail fast when there is no model to evaluate
    if not self.model:
        raise (OSError('Could not find a model.'))

    # Prepare test data (steps, generator)
    evaluation_generator = DataGenerator(
        x_test, y_test,
        batch_size=self.training_config.batch_size,
        preprocessor=self.p,
        char_embed_size=self.model_config.char_embedding_size,
        embeddings=self.embeddings,
        shuffle=False)

    # Build the evaluator and run it over the model once
    scorer = Scorer(evaluation_generator, self.p, evaluation=True)
    scorer.model = self.model
    scorer.on_epoch_end(epoch=-1)
def tag(self, texts, output_format, features=None):
    """Tag a list of texts (raw strings or pre-tokenized token lists).

    Args:
        texts: list of raw strings, or list of token lists (already
            tokenized input).
        output_format: 'json' for a structured response dict, anything else
            for a list of (token, tag) pair lists. Note that json output is
            only possible for raw-string input, since pre-tokenized input
            carries no character offsets.
        features: optional additional features forwarded to the generator.

    Returns:
        A dict with metadata and per-text entities when output_format is
        'json', otherwise a list of lists of (token, tag) tuples.
    """
    assert isinstance(texts, list)

    # string comparison must use '==' — 'is' checks object identity and only
    # worked by accident through CPython small-string interning
    if output_format == 'json':
        res = {
            "software": "DeLFT",
            "date": datetime.datetime.now().isoformat(),
            "model": self.model.config.model_name,
            "texts": []
        }
    else:
        list_of_tags = []

    # tokenize only when the input consists of raw strings; a list of token
    # lists is assumed to be already tokenized
    to_tokenize = len(texts) > 0 and isinstance(texts[0], str)

    # BUG FIX: the original passed features=None, silently dropping the
    # caller-supplied features argument
    predict_generator = DataGenerator(
        texts, None,
        batch_size=self.model_config.batch_size,
        preprocessor=self.preprocessor,
        char_embed_size=self.model_config.char_embedding_size,
        embeddings=self.embeddings,
        tokenize=to_tokenize,
        shuffle=False,
        features=features)

    # iterate the generator manually, one batch at a time, so predictions can
    # be matched back to their source texts by batch offset
    steps = len(predict_generator)
    steps_done = 0
    for generator_output in predict_generator:
        if steps_done == steps:
            break

        preds = self.model.predict_on_batch(generator_output[0])

        for i in range(len(preds)):
            pred = [preds[i]]
            text = texts[i + (steps_done * self.model_config.batch_size)]

            if isinstance(text, str):
                tokens, offsets = tokenizeAndFilter(text)
            else:
                # already tokenized: offsets are not available, so json
                # output is impossible for this input
                tokens = text
                offsets = []

            tags = self._get_tags(pred)
            prob = self._get_prob(pred)

            if output_format == 'json':
                piece = {}
                piece["text"] = text
                piece["entities"] = self._build_json_response(
                    tokens, tags, prob, offsets)["entities"]
                res["texts"].append(piece)
            else:
                list_of_tags.append(list(zip(tokens, tags)))

        steps_done += 1

    if output_format == 'json':
        return res
    else:
        return list_of_tags