def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train: np.array = None,
                f_valid: np.array = None, fold_number=10, callbacks=None):
    x_all = np.concatenate((x_train, x_valid), axis=0) if x_valid is not None else x_train
    y_all = np.concatenate((y_train, y_valid), axis=0) if y_valid is not None else y_train
    features_all = concatenate_or_none((f_train, f_valid), axis=0)

    self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)
    self.p.return_lengths = True

    if 'bert' in self.model_config.model_type.lower():
        self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))

    self.models = []
    for k in range(0, fold_number):
        model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        self.models.append(model)

    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    trainer.train_nfold(x_train, y_train, x_valid, y_valid, f_train=f_train, f_valid=f_valid, callbacks=callbacks)

    if self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
    if self.embeddings.use_BERT:
        self.embeddings.clean_BERT_cache()
    if 'bert' in self.model_config.model_type.lower():
        self.save()
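# Hedged usage sketch for the train_nfold() above, not from the source: `Sequence`
# stands in for the enclosing wrapper class, and the model name and toy data are
# illustrative assumptions.
import numpy as np

x = np.asarray([['John', 'lives', 'in', 'Paris', '.'],
                ['Mary', 'works', 'at', 'CERN', '.']], dtype=object)
y = np.asarray([['B-PER', 'O', 'O', 'B-LOC', 'O'],
                ['B-PER', 'O', 'O', 'B-ORG', 'O']], dtype=object)

tagger = Sequence('toy-ner')                 # hypothetical instantiation
tagger.train_nfold(x, y, fold_number=2)      # fold models end up in tagger.models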
def load(self, dir_path='data/models/sequenceLabelling/'):
    self.model_config = ModelConfig.load(
        os.path.join(dir_path, self.model_config.model_name, self.config_file))
    self.p = WordPreprocessor.load(
        os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))

    if self.model_config.model_type.lower().find("bert") != -1:
        self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag), dir_path=dir_path)
        self.model.load_model()
        return

    # load embeddings
    # Do not use cache in 'production' mode
    self.embeddings = Embeddings(self.model_config.embeddings_name,
                                 use_ELMo=self.model_config.use_ELMo,
                                 use_BERT=self.model_config.use_BERT,
                                 use_cache=False)
    self.model_config.word_embedding_size = self.embeddings.embed_size

    self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
    self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))
def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, fold_number=10):
    if x_valid is not None and y_valid is not None:
        x_all = np.concatenate((x_train, x_valid), axis=0)
        y_all = np.concatenate((y_train, y_valid), axis=0)
        self.p = prepare_preprocessor(x_all, y_all, self.model_config)
    else:
        self.p = prepare_preprocessor(x_train, y_train, self.model_config)

    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)
    self.p.return_lengths = True

    #self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
    self.models = []
    for k in range(0, fold_number):
        model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
        self.models.append(model)

    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    trainer.train_nfold(x_train, y_train, x_valid, y_valid)

    if self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
def train(self, x_train, y_train, x_valid=None, y_valid=None):
    # TBD if valid is None, segment train to get one
    x_all = np.concatenate((x_train, x_valid), axis=0)
    y_all = np.concatenate((y_train, y_valid), axis=0)
    self.p = prepare_preprocessor(x_all, y_all, self.model_config)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)
    """
    if self.embeddings.use_ELMo:
        # dump token context independent data for the train set, done once for the training
        x_train_local = x_train
        if not self.training_config.early_stop:
            # in case we want to train with the validation set too, we dump also
            # the ELMo embeddings for the token of the valid set
            x_train_local = np.concatenate((x_train, x_valid), axis=0)
        self.embeddings.dump_ELMo_token_embeddings(x_train_local)
    """
    self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    if self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
def load(self, dir_path='data/models/sequenceLabelling/', weight_file=DEFAULT_WEIGHT_FILE_NAME):
    model_path = os.path.join(dir_path, self.model_config.model_name)
    self.model_config = ModelConfig.load(os.path.join(model_path, CONFIG_FILE_NAME))

    if self.model_config.embeddings_name is not None:
        # load embeddings
        # Do not use cache in 'prediction/production' mode
        self.embeddings = Embeddings(self.model_config.embeddings_name,
                                     resource_registry=self.registry,
                                     use_ELMo=self.model_config.use_ELMo,
                                     use_cache=False)
        self.model_config.word_embedding_size = self.embeddings.embed_size
    else:
        self.embeddings = None
        self.model_config.word_embedding_size = 0

    self.p = Preprocessor.load(os.path.join(dir_path, self.model_config.model_name, PROCESSOR_FILE_NAME))

    self.model = get_model(self.model_config,
                           self.p,
                           ntags=len(self.p.vocab_tag),
                           load_pretrained_weights=False,
                           local_path=os.path.join(dir_path, self.model_config.model_name))
    print("load weights from", os.path.join(dir_path, self.model_config.model_name, weight_file))
    self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, weight_file))
    self.model.print_summary()
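# Hedged sketch of reloading a saved model with the load() above; `Sequence` is a
# stand-in for the enclosing wrapper class and the model name is an assumption.
# The directory layout follows the defaults visible in the code.
tagger = Sequence('toy-ner')
tagger.load(dir_path='data/models/sequenceLabelling/')
# after load(), self.p holds the fitted preprocessor (self.p.vocab_tag drives the
# output layer size) and self.model holds the restored weights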
def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, callbacks=None):
    # TBD if valid is None, segment train to get one if early_stop is True

    # we concatenate all the training+validation data to create the model vocabulary
    if x_valid is not None:
        x_all = np.concatenate((x_train, x_valid), axis=0)
    else:
        x_all = x_train

    if y_valid is not None:
        y_all = np.concatenate((y_train, y_valid), axis=0)
    else:
        y_all = y_train

    features_all = concatenate_or_none((f_train, f_valid), axis=0)

    self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)

    self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag), load_pretrained_weights=True)

    print_parameters(self.model_config, self.training_config)
    self.model.print_summary()

    # uncomment to plot graph
    #plot_model(self.model,
    #    to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png')

    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p,
                      transformer_preprocessor=self.model.transformer_preprocessor)
    trainer.train(x_train, y_train, x_valid, y_valid, features_train=f_train, features_valid=f_valid, callbacks=callbacks)

    if self.embeddings and self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
def train(self, x_train, y_train, f_train: np.array = None, x_valid=None, y_valid=None,
          f_valid: np.array = None, callbacks=None):
    # TBD if valid is None, segment train to get one
    x_all = np.concatenate((x_train, x_valid), axis=0) if x_valid is not None else x_train
    y_all = np.concatenate((y_train, y_valid), axis=0) if y_valid is not None else y_train
    features_all = concatenate_or_none((f_train, f_valid), axis=0)

    self.p = prepare_preprocessor(x_all, y_all, features=features_all, model_config=self.model_config)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)

    self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))

    if self.p.return_features is not False:
        print('x_train.shape: ', x_train.shape)
        print('features_train.shape: ', f_train.shape)
        sample_transformed_features = self.p.transform_features(f_train)
        self.model_config.max_feature_size = np.asarray(sample_transformed_features).shape[-1]
        print('max_feature_size: ', self.model_config.max_feature_size)

    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    trainer.train(x_train, y_train, x_valid, y_valid, features_train=f_train, features_valid=f_valid, callbacks=callbacks)

    if self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
    if self.embeddings.use_BERT:
        self.embeddings.clean_BERT_cache()
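# Hedged sketch of the feature-aware train() above, not from the source: f_train
# must align token-for-token with x_train; names, shapes, and feature values are
# illustrative assumptions only.
x = np.asarray([['Figure', '1', ':']], dtype=object)
y = np.asarray([['B-FIG', 'I-FIG', 'O']], dtype=object)
f = np.asarray([[['TITLECASE'], ['DIGIT'], ['PUNCT']]], dtype=object)  # one feature vector per token

tagger = Sequence('toy-features')   # hypothetical instantiation
tagger.train(x, y, f_train=f)       # max_feature_size is derived inside train()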
def load(self, dir_path='data/models/sequenceLabelling/'):
    self.p = WordPreprocessor.load(os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))
    self.model_config = ModelConfig.load(os.path.join(dir_path, self.model_config.model_name, self.config_file))

    # load embeddings
    self.embeddings = Embeddings(self.model_config.embeddings_name, use_ELMo=self.model_config.use_ELMo)
    self.model_config.word_embedding_size = self.embeddings.embed_size

    self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
    self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, self.weight_file))
def train(self, x_train, y_train, x_valid=None, y_valid=None):
    # TBD if valid is None, segment train to get one
    x_all = np.concatenate((x_train, x_valid), axis=0)
    y_all = np.concatenate((y_train, y_valid), axis=0)
    self.p = prepare_preprocessor(x_all, y_all, self.model_config)
    self.model_config.char_vocab_size = len(self.p.vocab_char)
    self.model_config.case_vocab_size = len(self.p.vocab_case)

    self.model = get_model(self.model_config, self.p, len(self.p.vocab_tag))
    trainer = Trainer(self.model,
                      self.models,
                      self.embeddings,
                      self.model_config,
                      self.training_config,
                      checkpoint_path=self.log_dir,
                      preprocessor=self.p)
    trainer.train(x_train, y_train, x_valid, y_valid)

    if self.embeddings.use_ELMo:
        self.embeddings.clean_ELMo_cache()
    if self.embeddings.use_BERT:
        self.embeddings.clean_BERT_cache()
def eval_nfold(self, x_test, y_test, features=None):
    if self.models is not None:
        total_f1 = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        reports_as_map = []
        total_precision = 0
        total_recall = 0
        for i in range(self.model_config.fold_number):
            print('\n------------------------ fold ' + str(i) + ' --------------------------------------')

            if 'bert' not in self.model_config.model_type.lower():
                # Prepare test data (steps, generator)
                test_generator = DataGenerator(x_test, y_test,
                                               batch_size=self.model_config.batch_size,
                                               preprocessor=self.p,
                                               char_embed_size=self.model_config.char_embedding_size,
                                               max_sequence_length=self.model_config.max_sequence_length,
                                               embeddings=self.embeddings,
                                               shuffle=False,
                                               features=features)

                # Build the evaluator and evaluate the model
                scorer = Scorer(test_generator, self.p, evaluation=True)
                scorer.model = self.models[i]
                scorer.on_epoch_end(epoch=-1)
                f1 = scorer.f1
                precision = scorer.precision
                recall = scorer.recall
                reports.append(scorer.report)
                reports_as_map.append(scorer.report_as_map)
            else:
                # BERT architecture model
                dir_path = 'data/models/sequenceLabelling/'
                self.model_config = ModelConfig.load(
                    os.path.join(dir_path, self.model_config.model_name, self.config_file))
                self.p = WordPreprocessor.load(
                    os.path.join(dir_path, self.model_config.model_name, self.preprocessor_file))
                self.model = get_model(self.model_config, self.p, ntags=len(self.p.vocab_tag))
                self.model.load_model(i)

                y_pred = self.model.predict(x_test, fold_id=i)

                nb_alignment_issues = 0
                for j in range(len(y_test)):
                    if len(y_test[j]) != len(y_pred[j]):
                        nb_alignment_issues += 1
                        # BERT tokenizer appears to introduce some additional tokens without ## prefix,
                        # but this is normally handled when predicting.
                        # To be very conservative, the following ensures the number of tokens always
                        # matches, but it should never be used in practice.
                        if len(y_test[j]) < len(y_pred[j]):
                            y_test[j] = y_test[j] + ["O"] * (len(y_pred[j]) - len(y_test[j]))
                        if len(y_test[j]) > len(y_pred[j]):
                            y_pred[j] = y_pred[j] + ["O"] * (len(y_test[j]) - len(y_pred[j]))

                if nb_alignment_issues > 0:
                    print("number of alignment issues with test set:", nb_alignment_issues)

                f1 = f1_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)
                print("\tf1: {:04.2f}".format(f1 * 100))
                print("\tprecision: {:04.2f}".format(precision * 100))
                print("\trecall: {:04.2f}".format(recall * 100))
                report, report_as_map = classification_report(y_test, y_pred, digits=4)
                reports.append(report)
                reports_as_map.append(report_as_map)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += precision
            total_recall += recall

        fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

        micro_f1 = total_f1 / self.model_config.fold_number
        micro_precision = total_precision / self.model_config.fold_number
        micro_recall = total_recall / self.model_config.fold_number

        micro_eval_block = {'f1': micro_f1, 'precision': micro_precision, 'recall': micro_recall}
        fold_average_evaluation['micro'] = micro_eval_block

        # field-level average over the n folds
        labels = []
        for label in sorted(self.p.vocab_tag):
            if label == 'O' or label == '<PAD>':
                continue
            if label.startswith("B-") or label.startswith("S-") or label.startswith("I-") or label.startswith("E-"):
                label = label[2:]
            if label in labels:
                continue
            labels.append(label)

            sum_p = 0
            sum_r = 0
            sum_f1 = 0
            sum_support = 0
            for j in range(0, self.model_config.fold_number):
                if label not in reports_as_map[j]['labels']:
                    continue
                report_as_map = reports_as_map[j]['labels'][label]
                sum_p += report_as_map["precision"]
                sum_r += report_as_map["recall"]
                sum_f1 += report_as_map["f1"]
                sum_support += report_as_map["support"]

            avg_p = sum_p / self.model_config.fold_number
            avg_r = sum_r / self.model_config.fold_number
            avg_f1 = sum_f1 / self.model_config.fold_number
            avg_support = sum_support / self.model_config.fold_number
            avg_support_dec = str(avg_support - int(avg_support))[1:]
            if avg_support_dec != '0':
                avg_support = math.floor(avg_support)

            block_label = {'precision': avg_p, 'recall': avg_r, 'support': avg_support, 'f1': avg_f1}
            fold_average_evaluation['labels'][label] = block_label

        print("----------------------------------------------------------------------")
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])
        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        if 'bert' not in self.model_config.model_type.lower():
            self.model = self.models[best_index]
        else:
            # copy best BERT model fold_number
            best_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name + str(best_index)
            new_model_dir = 'data/models/sequenceLabelling/' + self.model_config.model_name
            # update new_model_dir if it already exists, keep its existing config content
            merge_folders(best_model_dir, new_model_dir)
            # clean the other fold directories
            for i in range(self.model_config.fold_number):
                shutil.rmtree('data/models/sequenceLabelling/' + self.model_config.model_name + str(i))

        print("----------------------------------------------------------------------")
        print("\nAverage over", self.model_config.fold_number, "folds")
        print(get_report(fold_average_evaluation, digits=4, include_avgs=['micro']))
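# Hedged illustration (not from the source) of the conservative length alignment
# above: the shorter of the gold/predicted label sequences is padded with "O" so
# that sequence-level metrics can still be computed on equal-length sequences.
y_gold = ['B-PER', 'O']
y_hat = ['B-PER', 'O', 'O']
if len(y_gold) < len(y_hat):
    y_gold = y_gold + ['O'] * (len(y_hat) - len(y_gold))
if len(y_gold) > len(y_hat):
    y_hat = y_hat + ['O'] * (len(y_gold) - len(y_hat))
assert len(y_gold) == len(y_hat)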
def eval_nfold(self, x_test, y_test, features=None):
    if self.models is not None:
        total_f1 = 0
        best_f1 = 0
        best_index = 0
        worst_f1 = 1
        worst_index = 0
        reports = []
        reports_as_map = []
        total_precision = 0
        total_recall = 0
        for i in range(self.model_config.fold_number):
            if self.model_config.transformer_name is None:
                the_model = self.models[i]
                bert_preprocessor = None
            else:
                # the architecture model uses a transformer layer, it is large and needs to be loaded from disk
                dir_path = 'data/models/sequenceLabelling/'
                weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(i) + ".hdf5")
                self.model = get_model(self.model_config,
                                       self.p,
                                       ntags=len(self.p.vocab_tag),
                                       load_pretrained_weights=False,
                                       local_path=os.path.join(dir_path, self.model_config.model_name))
                self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, weight_file))
                the_model = self.model
                bert_preprocessor = self.model.transformer_preprocessor

            if i == 0:
                the_model.print_summary()
                print_parameters(self.model_config, self.training_config)

            print('\n------------------------ fold ' + str(i) + ' --------------------------------------')

            # we can use a data generator for evaluation
            # Prepare test data (steps, generator)
            generator = the_model.get_generator()
            test_generator = generator(x_test, y_test,
                                       batch_size=self.model_config.batch_size,
                                       preprocessor=self.p,
                                       bert_preprocessor=bert_preprocessor,
                                       char_embed_size=self.model_config.char_embedding_size,
                                       max_sequence_length=self.model_config.max_sequence_length,
                                       embeddings=self.embeddings,
                                       shuffle=False,
                                       features=features,
                                       output_input_offsets=True,
                                       use_chain_crf=self.model_config.use_chain_crf)

            # Build the evaluator and evaluate the model
            scorer = Scorer(test_generator,
                            self.p,
                            evaluation=True,
                            use_crf=self.model_config.use_crf,
                            use_chain_crf=self.model_config.use_chain_crf)
            scorer.model = the_model
            scorer.on_epoch_end(epoch=-1)
            f1 = scorer.f1
            precision = scorer.precision
            recall = scorer.recall
            reports.append(scorer.report)
            reports_as_map.append(scorer.report_as_map)

            if best_f1 < f1:
                best_f1 = f1
                best_index = i
            if worst_f1 > f1:
                worst_f1 = f1
                worst_index = i
            total_f1 += f1
            total_precision += precision
            total_recall += recall

        fold_average_evaluation = {'labels': {}, 'micro': {}, 'macro': {}}

        micro_f1 = total_f1 / self.model_config.fold_number
        micro_precision = total_precision / self.model_config.fold_number
        micro_recall = total_recall / self.model_config.fold_number

        micro_eval_block = {'f1': micro_f1, 'precision': micro_precision, 'recall': micro_recall}
        fold_average_evaluation['micro'] = micro_eval_block

        # field-level average over the n folds
        labels = []
        for label in sorted(self.p.vocab_tag):
            if label == 'O' or label == '<PAD>':
                continue
            if label.startswith("B-") or label.startswith("S-") or label.startswith("I-") or label.startswith("E-"):
                label = label[2:]
            if label in labels:
                continue
            labels.append(label)

            sum_p = 0
            sum_r = 0
            sum_f1 = 0
            sum_support = 0
            for j in range(0, self.model_config.fold_number):
                if label not in reports_as_map[j]['labels']:
                    continue
                report_as_map = reports_as_map[j]['labels'][label]
                sum_p += report_as_map["precision"]
                sum_r += report_as_map["recall"]
                sum_f1 += report_as_map["f1"]
                sum_support += report_as_map["support"]

            avg_p = sum_p / self.model_config.fold_number
            avg_r = sum_r / self.model_config.fold_number
            avg_f1 = sum_f1 / self.model_config.fold_number
            avg_support = sum_support / self.model_config.fold_number
            avg_support_dec = str(avg_support - int(avg_support))[1:]
            if avg_support_dec != '0':
                avg_support = math.floor(avg_support)

            block_label = {'precision': avg_p, 'recall': avg_r, 'support': avg_support, 'f1': avg_f1}
            fold_average_evaluation['labels'][label] = block_label

        print("----------------------------------------------------------------------")
        print("\n** Worst ** model scores - run", str(worst_index))
        print(reports[worst_index])
        print("\n** Best ** model scores - run", str(best_index))
        print(reports[best_index])

        fold_nb = self.model_config.fold_number
        self.model_config.fold_number = 1
        if self.model_config.transformer_name is None:
            self.model = self.models[best_index]
        else:
            dir_path = 'data/models/sequenceLabelling/'
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(best_index) + ".hdf5")
            # saved config file must be updated to single fold
            self.model.load(filepath=os.path.join(dir_path, self.model_config.model_name, weight_file))

        print("----------------------------------------------------------------------")
        print("\nAverage over", str(int(fold_nb)), "folds")
        print(get_report(fold_average_evaluation, digits=4, include_avgs=['micro']))
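# Hedged usage sketch (not from the source): after n-fold training, eval_nfold()
# scores each fold model on the held-out set, prints worst/best/average reports,
# and leaves the best fold as self.model; data names are assumptions.
tagger.eval_nfold(x_test, y_test)
tagger.save()   # persist the retained best fold, assuming a save() method as used in train_nfold above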
def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, callbacks=None):
    """
    n-fold training for the instance model

    for RNN models:
    -> the n models are stored in self.models, and self.model left unset at this stage
       fold number is available with self.model_config.fold_number

    for models with transformer layer:
    -> fold models are saved on disk (because too large) and self.models is not used,
       we identify the usage of folds with self.model_config.fold_number
    """
    fold_count = self.model_config.fold_number
    fold_size = len(x_train) // fold_count

    dir_path = 'data/models/sequenceLabelling/'
    output_directory = os.path.join(dir_path, self.model_config.model_name)
    print("Output directory:", output_directory)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    if self.model_config.transformer_name is not None:
        # save the config, preprocessor and transformer layer config on disk
        self.model_config.save(os.path.join(output_directory, CONFIG_FILE_NAME))
        self.preprocessor.save(os.path.join(output_directory, PROCESSOR_FILE_NAME))

    for fold_id in range(0, fold_count):
        if x_valid is None:
            # segment train and valid
            fold_start = fold_size * fold_id
            fold_end = fold_start + fold_size

            if fold_id == fold_count - 1:
                # the last fold takes the remainder of the training data
                fold_end = len(x_train)

            train_x = np.concatenate([x_train[:fold_start], x_train[fold_end:]])
            train_y = np.concatenate([y_train[:fold_start], y_train[fold_end:]])
            train_f = np.concatenate([f_train[:fold_start], f_train[fold_end:]]) if f_train is not None else None

            val_x = x_train[fold_start:fold_end]
            val_y = y_train[fold_start:fold_end]
            val_f = f_train[fold_start:fold_end] if f_train is not None else None
        else:
            # reuse given segmentation
            train_x = x_train
            train_y = y_train
            train_f = f_train
            val_x = x_valid
            val_y = y_valid
            val_f = f_valid

        foldModel = get_model(self.model_config,
                              self.preprocessor,
                              ntags=len(self.preprocessor.vocab_tag),
                              load_pretrained_weights=True)
        if fold_id == 0:
            print_parameters(self.model_config, self.training_config)
            foldModel.print_summary()

        print('\n------------------------ fold ' + str(fold_id) + ' --------------------------------------')

        self.transformer_preprocessor = foldModel.transformer_preprocessor
        foldModel = self.compile_model(foldModel, len(train_x))
        foldModel = self.train_model(foldModel,
                                     train_x,
                                     train_y,
                                     x_valid=val_x,
                                     y_valid=val_y,
                                     f_train=train_f,
                                     f_valid=val_f,
                                     max_epoch=self.training_config.max_epoch,
                                     callbacks=callbacks)

        if self.model_config.transformer_name is None:
            self.models.append(foldModel)
        else:
            # save the model with transformer layer on disk
            weight_file = DEFAULT_WEIGHT_FILE_NAME.replace(".hdf5", str(fold_id) + ".hdf5")
            foldModel.save(os.path.join(output_directory, weight_file))
            if fold_id == 0:
                foldModel.transformer_config.to_json_file(
                    os.path.join(output_directory, TRANSFORMER_CONFIG_FILE_NAME))

    if self.model_config.transformer_name is not None:
        transformer_preprocessor = foldModel.transformer_preprocessor
        transformer_preprocessor.tokenizer.save_pretrained(
            os.path.join(output_directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR))
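# Hedged sketch (not from the source) of the fold segmentation used above when
# x_valid is None: fold k holds out [fold_start, fold_end) for validation and
# trains on the rest; the last fold absorbs the division remainder. The sizes
# below are illustrative assumptions.
fold_count = 5
n = 103                          # illustrative dataset size
fold_size = n // fold_count      # 20
for fold_id in range(fold_count):
    fold_start = fold_size * fold_id
    fold_end = n if fold_id == fold_count - 1 else fold_start + fold_size
    print(fold_id, fold_start, fold_end)   # fold 4 covers [80, 103)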