def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, callbacks=None):
    # TBD: if valid is None, segment train to get one if early_stop is True

    # we concatenate all the training+validation data to create the model vocabulary
    if x_valid is not None:
        x_all = np.concatenate((x_train, x_valid), axis=0)
    else:
        x_all = x_train

    if y_valid is not None:
        y_all = np.concatenate((y_train, y_valid), axis=0)
    else:
        y_all = y_train

    features_all = concatenate_or_none((f_train, f_valid), axis=0)

    self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)

    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)

    self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)

    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    # uncomment to plot graph
    #plot_model(self.model,
    #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p,
                      transformer_preprocessor=self.model.transformer_preprocessor)
    trainer.train(x_train, y_train, x_valid, y_valid,
                  features_train=f_train, features_valid=f_valid, callbacks=callbacks)

    if self.embeddings and self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
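
# The feature arrays f_train / f_valid above may be None. The sketch below is an illustration of
# the behaviour assumed for the concatenate_or_none helper (a hypothetical re-implementation, not
# the library's own code): return None when all inputs are None, otherwise concatenate the
# non-None arrays along the given axis.
def _concatenate_or_none_sketch(arrays, axis=0):
    # keep only the arrays that were actually provided
    non_empty = [a for a in arrays if a is not None]
    if len(non_empty) == 0:
        return None
    return np.concatenate(non_empty, axis=axis)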
def train(self, x_train, y_train, vocab_init=None, callbacks=None):
    self.model = getModel(self.model_config, self.training_config)
    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    bert_data = False
    if self.transformer_name is not None:
        bert_data = True

    if self.training_config.early_stop:
        # create validation set
        xtr, val_x, y, val_y = train_test_split(x_train, y_train, test_size=0.1)

        training_generator = DataGenerator(xtr, y, batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=True, bert_data=bert_data,
            transformer_tokenizer=self.model.transformer_tokenizer)
        validation_generator = DataGenerator(val_x, None, batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=False, bert_data=bert_data,
            transformer_tokenizer=self.model.transformer_tokenizer)
    else:
        val_y = y_train
        training_generator = DataGenerator(x_train, y_train, batch_size=self.training_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=True, bert_data=bert_data,
            transformer_tokenizer=self.model.transformer_tokenizer)
        validation_generator = None

    # uncomment to plot graph
    #plot_model(self.model,
    #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

    self.model.train_model(self.model_config.list_classes,
                           self.training_config.batch_size,
                           self.training_config.max_epoch,
                           self.training_config.use_roc_auc,
                           self.training_config.class_weights,
                           training_generator,
                           validation_generator,
                           val_y,
                           patience=self.training_config.patience,
                           multiprocessing=self.training_config.multiprocessing,
                           callbacks=callbacks)
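
# When early_stop is enabled above, 10% of the training data is held out as a validation set via
# scikit-learn's train_test_split. Standalone illustration of that split (data and variable names
# are made up):
#
#     import numpy as np
#     from sklearn.model_selection import train_test_split
#
#     x = np.arange(100).reshape(100, 1)
#     y = np.random.randint(0, 2, size=(100, 2))
#     xtr, val_x, ytr, val_y = train_test_split(x, y, test_size=0.1)
#     # xtr/ytr -> 90 training samples, val_x/val_y -> 10 validation samples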
def load(self, dir_path='data/models/textClassification/'):
    model_path = os.path.join(dir_path, self.model_config.model_name)
    self.model_config = ModelConfig.load(os.path.join(model_path, self.config_file))

    if self.model_config.transformer_name is None:
        # load embeddings
        # Do not use cache in 'production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     resource_registry=self.registry, use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size
    else:
        self.transformer_name = self.model_config.transformer_name
        self.embeddings = None

    self.model = getModel(self.model_config, self.training_config,
                          load_pretrained_weights=False, local_path=model_path)

    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    if self.model_config.fold_number == 1:
        print("load weights from", os.path.join(model_path, self.weight_file))
        self.model.load(os.path.join(model_path, self.weight_file))
    else:
        self.models = []
        if self.model_config.transformer_name is None:
            for i in range(0, self.model_config.fold_number):
                local_model = getModel(self.model_config, self.training_config,
                                       load_pretrained_weights=False, local_path=model_path)
                local_model.load(os.path.join(model_path, "model{0}_weights.hdf5".format(i)))
                self.models.append(local_model)
        else:
            # only init the first fold model; the others will be initialized at prediction time,
            # and all weights will be loaded at prediction time
            local_model = getModel(self.model_config, self.training_config,
                                   load_pretrained_weights=False, local_path=model_path)
            self.models.append(local_model)
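
# For reference, the loading logic above implies a model directory layout roughly like the
# following (inferred from the code; the exact config and weight file names come from
# self.config_file and self.weight_file):
#
#     data/models/textClassification/<model_name>/
#         <config file>            # ModelConfig, loaded first
#         <weight file>            # single-model weights (fold_number == 1)
#         model0_weights.hdf5      # per-fold weights (fold_number > 1)
#         model1_weights.hdf5
#         ...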
def train_folds(X, y, model_config, training_config, embeddings, callbacks=None):
    fold_count = model_config.fold_number
    max_epoch = training_config.max_epoch
    architecture = model_config.architecture
    use_roc_auc = training_config.use_roc_auc
    class_weights = training_config.class_weights

    fold_size = len(X) // fold_count
    models = []
    scores = []

    bert_data = False
    if model_config.transformer_name is not None:
        bert_data = True

    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_count - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]

        foldModel = getModel(model_config, training_config)

        if fold_id == 0:
            print_parameters(model_config, training_config)
            foldModel.print_summary()

        print('\n------------------------ fold ' + str(fold_id) + ' --------------------------------------')

        training_generator = DataGenerator(train_x, train_y, batch_size=training_config.batch_size,
            maxlen=model_config.maxlen, list_classes=model_config.list_classes,
            embeddings=embeddings, bert_data=bert_data, shuffle=True,
            transformer_tokenizer=foldModel.transformer_tokenizer)

        validation_generator = None
        if training_config.early_stop:
            validation_generator = DataGenerator(val_x, val_y, batch_size=training_config.batch_size,
                maxlen=model_config.maxlen, list_classes=model_config.list_classes,
                embeddings=embeddings, bert_data=bert_data, shuffle=False,
                transformer_tokenizer=foldModel.transformer_tokenizer)

        foldModel.train_model(model_config.list_classes,
                              training_config.batch_size,
                              max_epoch,
                              use_roc_auc,
                              class_weights,
                              training_generator,
                              validation_generator,
                              val_y,
                              multiprocessing=training_config.multiprocessing,
                              patience=training_config.patience,
                              callbacks=callbacks)

        if model_config.transformer_name is None:
            models.append(foldModel)
        else:
            # if we are using a transformer layer in the architecture, we need to save the fold model on disk
            directory = os.path.join("data/models/textClassification/", model_config.model_name)
            if not os.path.exists(directory):
                os.makedirs(directory)

            if fold_id == 0:
                models.append(foldModel)
                # save transformer config and tokenizer
                if foldModel.transformer_config is not None:
                    foldModel.transformer_config.to_json_file(os.path.join(directory, TRANSFORMER_CONFIG_FILE_NAME))
                if foldModel.transformer_tokenizer is not None:
                    foldModel.transformer_tokenizer.save_pretrained(os.path.join(directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR))

            model_path = os.path.join(directory, "model{0}_weights.hdf5".format(fold_id))
            foldModel.save(model_path)
            if fold_id != 0:
                del foldModel

    return models
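
# Illustration of the fold boundary arithmetic used above (standalone sketch with made-up sizes):
# the last fold is extended to the end of the data so that the remainder left by the integer
# division is not silently dropped.
#
#     fold_count = 5
#     n = 103
#     fold_size = n // fold_count            # 20
#     for fold_id in range(fold_count):
#         fold_start = fold_size * fold_id
#         fold_end = fold_start + fold_size
#         if fold_id == fold_count - 1:      # last fold takes the remaining samples
#             fold_end = n
#         # folds: [0:20], [20:40], [40:60], [60:80], [80:103]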
def eval_nfold(self, x_test, y_test, features=None):
    if self.models is not None:
        total_f1 = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        reports_as_map = []
        total_precision = 0
        total_recall = 0
        for i in range(self.model_config.fold_number):
            if self.model_config.transformer_name is None:
                the_model = self.models[i]
                bert_preprocessor = None
            else:
                # the architecture model uses a transformer layer, it is large and needs to be loaded from disk
                dir_path = 'data/models/sequenceLabelling/'
                weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(i) + ".hdf5")
                self.model = get_model(self.model_config,
                                       self.p,
                                       ntags=len(self.p.vocab_tag),
                                       load_pretrained_weights=False,
                                       local_path=os.path.join(dir_path, self.model_config.model_name))
                self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, weight_file))
                the_model = self.model
                bert_preprocessor = self.model.transformer_preprocessor

            if i == 0:
                the_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(i) + ' --------------------------------------')

            # we can use a data generator for evaluation
            # Prepare test data (steps, generator)
            generator = the_model.get_generator()
            test_generator = generator(x_test, y_test,
                batch_size=self.model_config.batch_size,
                preprocessor=self.p,
                bert_preprocessor=bert_preprocessor,
                char_embed_size=self.model_config.char_embedding_size,
                max_sequence_length=self.model_config.max_sequence_length,
                embeddings=self.embeddings,
                shuffle=False,
                features=features,
                output_input_offsets=True,
                use_chain_crf=self.model_config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator, self.p, evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = the_model
            scorer.on_epoch_end(epoch=-1)

            f1 = scorer.f1
            precision = scorer.precision
            recall = scorer.recall
            reports.append(scorer.report)
            reports_as_map.append(scorer.report_as_map)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += precision
            total_recall += recall

        fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

        micro_f1 = total_f1 / self.model_config.fold_number
        micro_precision = total_precision / self.model_config.fold_number
        micro_recall = total_recall / self.model_config.fold_number

        micro_eval_block = {'f1': micro_f1, 'precision': micro_precision, 'recall': micro_recall}
        fold_average_evaluation['micro'] = micro_eval_block

        # field-level average over the n folds
        labels = []
        for label in sorted(self.p.vocab_tag):
            if label == 'O' or label == '<PAD>':
                continue
            if label.startswith("B-") or label.startswith("S-") or label.startswith("I-") or label.startswith("E-"):
                label = label[2:]

            if label in labels:
                continue
            labels.append(label)

            sum_p = 0
            sum_r = 0
            sum_f1 = 0
            sum_support = 0
            for j in range(0, self.model_config.fold_number):
                if label not in reports_as_map[j]['labels']:
                    continue
                report_as_map = reports_as_map[j]['labels'][label]
                sum_p += report_as_map["precision"]
                sum_r += report_as_map["recall"]
                sum_f1 += report_as_map["f1"]
                sum_support += report_as_map["support"]

            avg_p = sum_p / self.model_config.fold_number
            avg_r = sum_r / self.model_config.fold_number
            avg_f1 = sum_f1 / self.model_config.fold_number
            avg_support = sum_support / self.model_config.fold_number
            avg_support_dec = str(avg_support - int(avg_support))[1:]
            if avg_support_dec != '0':
                avg_support = math.floor(avg_support)

            block_label = {'precision': avg_p, 'recall': avg_r, 'support': avg_support, 'f1': avg_f1}
            fold_average_evaluation['labels'][label] = block_label

        print("----------------------------------------------------------------------")
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])

        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        fold_nb = self.model_config.fold_number
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is None:
            self.model = self.models[best_index]
        else:
            dir_path = 'data/models/sequenceLabelling/'
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(best_index) + ".hdf5")
            # the saved config file must be updated to a single fold
            self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, weight_file))

        print("----------------------------------------------------------------------")
        print("\nAverage over", str(int(fold_nb)), "folds")
        print(get_report(fold_average_evaluation, digits=4, include_avgs=['micro']))
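
# The fold_average_evaluation dictionary built above has the following shape, based on the keys
# assigned in the code (values here are illustrative only):
#
#     {
#         'labels': {'<field>': {'precision': 0.91, 'recall': 0.88, 'support': 120, 'f1': 0.89}, ...},
#         'micro':  {'f1': 0.90, 'precision': 0.91, 'recall': 0.89},
#         'macro':  {}
#     }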
def eval_single(self, x_test, y_test, features=None):
    if self.model is None:
        raise (OSError('Could not find a model.'))

    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    if self.model_config.transformer_name is None:
        # we can use a data generator for evaluation
        # Prepare test data (steps, generator)
        generator = self.model.get_generator()
        test_generator = generator(x_test, y_test,
            batch_size=self.model_config.batch_size,
            preprocessor=self.p,
            char_embed_size=self.model_config.char_embedding_size,
            max_sequence_length=self.model_config.max_sequence_length,
            embeddings=self.embeddings,
            shuffle=False,
            features=features,
            output_input_offsets=True,
            use_chain_crf=self.model_config.use_chain_crf)

        # Build the evaluator and evaluate the model
        scorer = Scorer(test_generator, self.p, evaluation=True,
                        use_crf=self.model_config.use_crf,
                        use_chain_crf=self.model_config.use_chain_crf)
        scorer.model = self.model
        scorer.on_epoch_end(epoch=-1)
    else:
        # the architecture model uses a transformer layer
        # note that we could also use the above test_generator, but as an alternative here we check the
        # test/prediction alignment of tokens and the validity of the maximum sequence input length
        # wrt the length of the test sequences
        tagger = Tagger(self.model,
                        self.model_config,
                        self.embeddings,
                        preprocessor=self.p,
                        transformer_preprocessor=self.model.transformer_preprocessor)
        y_pred_pairs = tagger.tag(x_test, output_format=None, features=features)

        # keep only the labels
        y_pred = []
        for result in y_pred_pairs:
            result_labels = []
            for pair in result:
                result_labels.append(pair[1])
            y_pred.append(result_labels)

        nb_alignment_issues = 0
        for i in range(len(y_test)):
            if len(y_test[i]) != len(y_pred[i]):
                #print("y_test:", y_test[i])
                #print("y_pred:", y_pred[i])
                nb_alignment_issues += 1
                # the BERT tokenizer can introduce additional tokens without a ## prefix,
                # but this is normally handled when predicting.
                # To be very conservative, the following ensures the number of tokens always
                # matches, but it should never be needed in practice.
                if len(y_test[i]) < len(y_pred[i]):
                    y_test[i] = y_test[i] + ["O"] * (len(y_pred[i]) - len(y_test[i]))
                if len(y_test[i]) > len(y_pred[i]):
                    y_pred[i] = y_pred[i] + ["O"] * (len(y_test[i]) - len(y_pred[i]))

        if nb_alignment_issues > 0:
            print("number of alignment issues with test set:", nb_alignment_issues)
            print("to solve them, consider increasing the maximum sequence input length of the model and retraining")

    report, report_as_map = classification_report(y_test, y_pred, digits=4)
    print(report)
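
# Standalone sketch of the conservative alignment fix applied above (illustration only, hypothetical
# helper name): when a predicted label sequence and its gold sequence differ in length, the shorter
# one is padded with the "O" (outside) label so token-level evaluation can still be computed.
def _pad_to_same_length_sketch(gold_labels, pred_labels, pad_label="O"):
    # pad whichever sequence is shorter so both have the same number of tokens
    if len(gold_labels) < len(pred_labels):
        gold_labels = gold_labels + [pad_label] * (len(pred_labels) - len(gold_labels))
    elif len(gold_labels) > len(pred_labels):
        pred_labels = pred_labels + [pad_label] * (len(gold_labels) - len(pred_labels))
    return gold_labels, pred_labels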
def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, callbacks=None):
    """
    n-fold training for the instance model

    for RNN models:
    -> the n models are stored in self.models, and self.model is left unset at this stage;
       the fold number is available with self.model_config.fold_number

    for models with a transformer layer:
    -> fold models are saved on disk (because they are too large) and self.models is not used;
       we identify the usage of folds with self.model_config.fold_number
    """
    fold_count = self.model_config.fold_number
    fold_size = len(x_train) // fold_count

    dir_path = 'data/models/sequenceLabelling/'
    output_directory = os.path.join(dir_path, self.model_config.model_name)
    print("Output directory:", output_directory)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    if self.model_config.transformer_name is not None:
        # save the config, preprocessor and transformer layer config on disk
        self.model_config.save(os.path.join(output_directory, CONFIG_FILE_NAME))
        self.preprocessor.save(os.path.join(output_directory, PROCESSOR_FILE_NAME))

    for fold_id in range(0, fold_count):
        if x_valid is None:
            # segment train and valid
            fold_start = fold_size * fold_id
            fold_end = fold_start + fold_size

            if fold_id == fold_count - 1:
                fold_end = len(x_train)

            train_x = np.concatenate([x_train[:fold_start], x_train[fold_end:]])
            train_y = np.concatenate([y_train[:fold_start], y_train[fold_end:]])
            train_f = np.concatenate([f_train[:fold_start], f_train[fold_end:]])

            val_x = x_train[fold_start:fold_end]
            val_y = y_train[fold_start:fold_end]
            val_f = f_train[fold_start:fold_end]
        else:
            # reuse the given segmentation
            train_x = x_train
            train_y = y_train
            train_f = f_train

            val_x = x_valid
            val_y = y_valid
            val_f = f_valid

        foldModel = get_model(self.model_config,
                              self.preprocessor,
                              ntags=len(self.preprocessor.vocab_tag),
                              load_pretrained_weights=True)

        if fold_id == 0:
            print_parameters(self.model_config, self.training_config)
            foldModel.print_summary()

        print('\n------------------------ fold ' + str(fold_id) + ' --------------------------------------')

        self.transformer_preprocessor = foldModel.transformer_preprocessor

        foldModel = self.compile_model(foldModel, len(train_x))
        foldModel = self.train_model(foldModel,
                                     train_x,
                                     train_y,
                                     x_valid=val_x,
                                     y_valid=val_y,
                                     f_train=train_f,
                                     f_valid=val_f,
                                     max_epoch=self.training_config.max_epoch,
                                     callbacks=callbacks)

        if self.model_config.transformer_name is None:
            self.models.append(foldModel)
        else:
            # save the model with transformer layer on disk
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(fold_id) + ".hdf5")
            foldModel.save(os.path.join(output_directory, weight_file))
            if fold_id == 0:
                foldModel.transformer_config.to_json_file(os.path.join(output_directory, TRANSFORMER_CONFIG_FILE_NAME))

    if self.model_config.transformer_name is not None:
        transformer_preprocessor = foldModel.transformer_preprocessor
        transformer_preprocessor.tokenizer.save_pretrained(os.path.join(output_directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR))
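
# For transformer-based models, the n-fold training above writes everything needed for later
# loading into a single output directory; based on the calls above, the layout is roughly the
# following (placeholders stand for the actual constant values, which are not shown here):
#
#     data/models/sequenceLabelling/<model_name>/
#         <CONFIG_FILE_NAME>                     # model config
#         <PROCESSOR_FILE_NAME>                  # preprocessor
#         <TRANSFORMER_CONFIG_FILE_NAME>         # transformer layer config (saved once, at fold 0)
#         <DEFAULT_TRANSFORMER_TOKENIZER_DIR>/   # tokenizer files (saved once, after the last fold)
#         <weight file with fold index inserted before ".hdf5">   # one weight file per fold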
def eval(self, x_test, y_test, use_main_thread_only=False):
    print_parameters(self.model_config, self.training_config)

    bert_data = False
    if self.transformer_name is not None:
        bert_data = True

    if self.model_config.fold_number == 1:
        if self.model is not None:
            self.model.print_summary()
            test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size,
                maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
                embeddings=self.embeddings, shuffle=False, bert_data=bert_data,
                transformer_tokenizer=self.model.transformer_tokenizer)

            result = self.model.predict(test_generator, use_main_thread_only=use_main_thread_only)
        else:
            raise (OSError('Could not find a model.'))
    else:
        if self.models is None:
            raise (OSError('Could not find nfolds models.'))
        self.models[0].print_summary()

        # just a warning: n classifiers using a BERT layer for prediction might be heavy in terms of model size
        test_generator = DataGenerator(x_test, None, batch_size=self.model_config.batch_size,
            maxlen=self.model_config.maxlen, list_classes=self.model_config.list_classes,
            embeddings=self.embeddings, shuffle=False, bert_data=bert_data,
            transformer_tokenizer=self.models[0].transformer_tokenizer)

        result = predict_folds(self.models, test_generator, self.model_config, self.training_config,
                               use_main_thread_only=use_main_thread_only)

    print("-----------------------------------------------")
    print("\nEvaluation on", x_test.shape[0], "instances:")

    total_accuracy = 0.0
    total_f1 = 0.0
    total_loss = 0.0
    total_roc_auc = 0.0

    '''
    def normer(t):
        if t < 0.5:
            return 0
        else:
            return 1
    vfunc = np.vectorize(normer)
    result_binary = vfunc(result)
    '''

    result_intermediate = np.asarray([np.argmax(line) for line in result])

    def vectorize(index, size):
        result = np.zeros(size)
        if index < size:
            result[index] = 1
        return result

    result_binary = np.array([vectorize(xi, len(self.model_config.list_classes)) for xi in result_intermediate])

    precision, recall, fscore, support = precision_recall_fscore_support(y_test, result_binary, average=None)
    print('{:>14} {:>12} {:>12} {:>12} {:>12}'.format(" ", "precision", "recall", "f-score", "support"))
    p = 0
    for the_class in self.model_config.list_classes:
        the_class = the_class[:14]
        print('{:>14} {:>12} {:>12} {:>12} {:>12}'.format(the_class, "{:10.4f}".format(precision[p]),
            "{:10.4f}".format(recall[p]), "{:10.4f}".format(fscore[p]), support[p]))
        p += 1

    # macro-average (average of class scores)
    # we distinguish 1-class and multiclass problems
    if len(self.model_config.list_classes) == 1:
        total_accuracy = accuracy_score(y_test, result_binary)
        total_f1 = f1_score(y_test, result_binary)
        # sklearn will complain if log(0)
        total_loss = log_loss(y_test, result, labels=[0, 1])
        if len(np.unique(y_test)) == 1:
            # the sklearn roc_auc_score implementation does not work in this case, it needs more balanced batches
            # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
            total_roc_auc = r2_score(y_test, result)
            if total_roc_auc < 0:
                total_roc_auc = 0
        else:
            total_roc_auc = roc_auc_score(y_test, result)
    else:
        for j in range(0, len(self.model_config.list_classes)):
            accuracy = accuracy_score(y_test[:, j], result_binary[:, j])
            total_accuracy += accuracy
            f1 = f1_score(y_test[:, j], result_binary[:, j], average='micro')
            total_f1 += f1
            loss = log_loss(y_test[:, j], result[:, j], labels=[0, 1])
            total_loss += loss
            if len(np.unique(y_test[:, j])) == 1:
                # the sklearn roc_auc_score implementation does not work in this case, it needs more balanced batches
                # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                roc_auc = r2_score(y_test[:, j], result[:, j])
                if roc_auc < 0:
                    roc_auc = 0
            else:
                roc_auc = roc_auc_score(y_test[:, j], result[:, j], labels=[0, 1])
            total_roc_auc += roc_auc

            '''
            print("\nClass:", self.model_config.list_classes[j])
            print("\taccuracy at 0.5 =", accuracy)
            print("\tf-1 at 0.5 =", f1)
            print("\tlog-loss =", loss)
            print("\troc auc =", roc_auc)
            '''

    total_accuracy /= len(self.model_config.list_classes)
    total_f1 /= len(self.model_config.list_classes)
    total_loss /= len(self.model_config.list_classes)
    total_roc_auc /= len(self.model_config.list_classes)

    '''
    if len(self.model_config.list_classes) != 1:
        print("\nMacro-average:")
        print("\taverage accuracy at 0.5 =", "{:10.4f}".format(total_accuracy))
        print("\taverage f-1 at 0.5 =", "{:10.4f}".format(total_f1))
        print("\taverage log-loss =", "{:10.4f}".format(total_loss))
        print("\taverage roc auc =", "{:10.4f}".format(total_roc_auc))
    '''

    # micro-average (average of scores for each instance)
    # this only makes sense if we have more than 1 class, otherwise it is the same as the macro-average
    if len(self.model_config.list_classes) != 1:
        total_accuracy = 0.0
        total_f1 = 0.0
        total_loss = 0.0
        total_roc_auc = 0.0

        for i in range(0, result.shape[0]):
            accuracy = accuracy_score(y_test[i, :], result_binary[i, :])
            total_accuracy += accuracy
            f1 = f1_score(y_test[i, :], result_binary[i, :], average='micro')
            total_f1 += f1
            loss = log_loss(y_test[i, :], result[i, :], labels=[0.0, 1.0])
            total_loss += loss
            if len(np.unique(y_test[i, :])) == 1:
                # the sklearn roc_auc_score implementation does not work in this case, it needs more balanced batches
                # a simple fix is to return the r2_score instead in this case (which is a regression score and not a loss)
                roc_auc = r2_score(y_test[i, :], result[i, :])
                if roc_auc < 0:
                    roc_auc = 0
            else:
                roc_auc = roc_auc_score(y_test[i, :], result[i, :], labels=[0.0, 1.0])
            total_roc_auc += roc_auc

        total_accuracy /= result.shape[0]
        total_f1 /= result.shape[0]
        total_loss /= result.shape[0]
        total_roc_auc /= result.shape[0]

    '''