def __init__(self, **kwargs):
    """Build the tagger: load parameters and dataset, create the TensorFlow
    session, and initialize (or restore) the EntityLSTM model.

    Args:
        **kwargs: parameter overrides forwarded to ``load_parameters``.

    Side effects: creates ``self.sess`` (a live tf.Session) and populates
    ``self.parameters``, ``self.modeldata``, ``self.model`` and
    ``self.transition_params_trained``.
    """
    # Set parameters.
    # FIX: removed stray debug leftover `print('hello')`.
    self.parameters, self.conf_parameters = load_parameters(**kwargs)
    self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(
        self.parameters)
    self._check_param_compatibility(self.parameters, self.dataset_filepaths)

    # Load dataset
    self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'],
                                     debug=self.parameters['debug'])
    token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths,
                                                  self.parameters)

    # Launch session. Automatically choose a device
    # if the specified one doesn't exist (allow_soft_placement).
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        device_count={'CPU': 1, 'GPU': self.parameters['number_of_gpus']},
        allow_soft_placement=True,
        log_device_placement=False)
    self.sess = tf.Session(config=session_conf)

    with self.sess.as_default():
        # Initialize or load pretrained model
        self.model = EntityLSTM(self.modeldata, self.parameters)
        self.sess.run(tf.global_variables_initializer())
        if self.parameters['use_pretrained_model']:
            self.transition_params_trained = self.model.restore_from_pretrained_model(
                self.parameters, self.modeldata, self.sess,
                token_to_vector=token_to_vector)
        else:
            self.model.load_pretrained_token_embeddings(
                self.sess, self.modeldata, self.parameters, token_to_vector)
            # No trained CRF transitions yet: start from random values.
            # The "+ 2" presumably accounts for special start/end CRF states
            # — TODO confirm against EntityLSTM.
            self.transition_params_trained = np.random.rand(
                len(self.modeldata.unique_labels) + 2,
                len(self.modeldata.unique_labels) + 2)
def trim_model_checkpoint(parameters_filepath, dataset_filepath,
                          input_checkpoint_filepath, output_checkpoint_filepath):
    '''
    Remove all token embeddings except UNK.

    Loads the pickled dataset and the checkpoint, shrinks the token embedding
    matrix to a single row holding the UNK vector, saves the trimmed
    checkpoint to `output_checkpoint_filepath`, and rewrites the pickled
    dataset with `vocabulary_size` set to 1.
    '''
    parameters, _ = neuromodel.load_parameters(parameters_filepath=parameters_filepath)
    # FIX: use context managers so the pickle file handles are always closed
    # (original used bare `open(...)` inside pickle.load/pickle.dump).
    with open(dataset_filepath, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    model = EntityLSTM(dataset, parameters)
    with tf.Session() as sess:
        model_saver = tf.train.Saver()  # defaults to saving all variables
        # Restore the pretrained model.
        # Works only when the dimensions of tensor variables are matched.
        model_saver.restore(sess, input_checkpoint_filepath)
        # Get pretrained embeddings
        token_embedding_weights = sess.run(model.token_embedding_weights)
        # Resize the token embedding variable down to a single row
        utils_tf.resize_tensor_variable(
            sess, model.token_embedding_weights,
            [1, parameters['token_embedding_dimension']])
        initial_weights = sess.run(model.token_embedding_weights)
        # Keep only the UNK vector.
        # NOTE(review): after the resize the matrix has one row, so this
        # assumes dataset.UNK_TOKEN_INDEX == 0 — TODO confirm.
        initial_weights[dataset.UNK_TOKEN_INDEX] = token_embedding_weights[dataset.UNK_TOKEN_INDEX]
        sess.run(tf.assign(model.token_embedding_weights, initial_weights,
                           validate_shape=False))
        token_embedding_weights = sess.run(model.token_embedding_weights)
        print("token_embedding_weights: {0}".format(token_embedding_weights))
        model_saver.save(sess, output_checkpoint_filepath)
    # Record the trimmed vocabulary size and persist the updated dataset.
    dataset.__dict__['vocabulary_size'] = 1
    with open(dataset_filepath, 'wb') as dataset_file:
        pickle.dump(dataset, dataset_file)
    pprint(dataset.__dict__)
def __init__(self,
             parameters_filepath=argument_default_value,
             pretrained_model_folder=argument_default_value,
             dataset_text_folder=argument_default_value,
             character_embedding_dimension=argument_default_value,
             character_lstm_hidden_state_dimension=argument_default_value,
             check_for_digits_replaced_with_zeros=argument_default_value,
             check_for_lowercase=argument_default_value,
             debug=argument_default_value,
             dropout_rate=argument_default_value,
             experiment_name=argument_default_value,
             freeze_token_embeddings=argument_default_value,
             gradient_clipping_value=argument_default_value,
             learning_rate=argument_default_value,
             load_only_pretrained_token_embeddings=argument_default_value,
             load_all_pretrained_token_embeddings=argument_default_value,
             main_evaluation_mode=argument_default_value,
             maximum_number_of_epochs=argument_default_value,
             number_of_cpu_threads=argument_default_value,
             number_of_gpus=argument_default_value,
             optimizer=argument_default_value,
             output_folder=argument_default_value,
             patience=argument_default_value,
             plot_format=argument_default_value,
             reload_character_embeddings=argument_default_value,
             reload_character_lstm=argument_default_value,
             reload_crf=argument_default_value,
             reload_feedforward=argument_default_value,
             reload_token_embeddings=argument_default_value,
             reload_token_lstm=argument_default_value,
             remap_unknown_tokens_to_unk=argument_default_value,
             spacylanguage=argument_default_value,
             tagging_format=argument_default_value,
             token_embedding_dimension=argument_default_value,
             token_lstm_hidden_state_dimension=argument_default_value,
             token_pretrained_embedding_filepath=argument_default_value,
             tokenizer=argument_default_value,
             train_model=argument_default_value,
             use_character_lstm=argument_default_value,
             use_crf=argument_default_value,
             use_pretrained_model=argument_default_value,
             verbose=argument_default_value,
             argument_default_value=argument_default_value,
             # new arguments
             num_layers=argument_default_value,
             use_deep_lstm=argument_default_value):
    """Construct the tagger from keyword parameters.

    Every parameter defaults to the sentinel ``argument_default_value``;
    only arguments that the caller actually sets override the values read
    from the parameters file by ``self._load_parameters``.

    Side effects: builds the dataset, the EntityLSTM model and a live
    tf.Session, all stored on ``self``.
    """
    # Parse arguments: capture every explicitly named parameter (as strings),
    # excluding `self`. NOTE: this relies on locals() being called before any
    # other local variable is created.
    arguments = dict((k, str(v)) for k, v in locals().items() if k != 'self')

    # Initialize parameters: file values overridden by passed arguments
    parameters, conf_parameters = self._load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(parameters)
    self._check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

    # Launch session
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
        device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
        allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
        log_device_placement=False
    )
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Create model and initialize or load pretrained model
        ### Instantiate the model
        model = EntityLSTM(dataset, parameters)
        ### Initialize the model and restore from pretrained model if needed
        sess.run(tf.global_variables_initializer())
        if not parameters['use_pretrained_model']:
            model.load_pretrained_token_embeddings(sess, dataset, parameters, token_to_vector)
            # No pretrained CRF: start transitions from random values
            # (+2 presumably for special start/end CRF states — TODO confirm).
            self.transition_params_trained = np.random.rand(
                len(dataset.unique_labels) + 2, len(dataset.unique_labels) + 2)
        else:
            self.transition_params_trained = model.restore_from_pretrained_model(
                parameters, dataset, sess, token_to_vector=token_to_vector)
    # Release the (potentially large) token->vector map now that the model
    # holds the embeddings.
    del token_to_vector

    self.dataset = dataset
    self.dataset_brat_folders = dataset_brat_folders
    self.dataset_filepaths = dataset_filepaths
    self.model = model
    self.parameters = parameters
    self.conf_parameters = conf_parameters
    self.sess = sess
def main():
    """Train and evaluate the EntityLSTM sequence tagger.

    Loads parameters and the dataset, builds the TensorFlow graph/session and
    TensorBoard artifacts, then runs the train/evaluate loop with early
    stopping on the validation micro F1-score. Results and checkpoints are
    written under ``../output/<dataset>_<timestamp>/``.
    """
    parameters, conf_parameters = load_parameters()
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
        )
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                conf_parameters.write(parameters_file)

            tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in dataset_filepaths.keys():
                tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

            # FIX: close the pickle file handle (was `pickle.dump(dataset, open(...))`)
            with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
                pickle.dump(dataset, dataset_file)

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in dataset_filepaths.keys():
                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
            # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
            embedding_writer = tf.summary.FileWriter(model_folder)

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

            # Write metadata for TensorBoard embeddings.
            # FIX: context managers close the metadata files even on error.
            with codecs.open(token_list_file_path, 'w', 'UTF-8') as token_list_file:
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
            with codecs.open(character_list_file_path, 'w', 'UTF-8') as character_list_file:
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(len(dataset.unique_labels) + 2, len(dataset.unique_labels) + 2)
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    step = 0
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))
                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    elif epoch_number != 0:
                        # Train model: loop over all sequences of training set with shuffling
                        sequence_numbers = list(range(len(dataset.token_indices['train'])))
                        random.shuffle(sequence_numbers)
                        for sequence_number in sequence_numbers:
                            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                            step += 1
                            if step % 10 == 0:
                                print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    if parameters['use_pretrained_model'] and not parameters['train_model']:
                        # Evaluation-only run: emit brat output and stop after the first pass
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                        break

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    writers['train'].flush()
                    utils.copytree(writers['train'].get_logdir(), model_folder)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    else:
                        bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number >= parameters['maximum_number_of_epochs']:
                        break
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            # FIX: removed leftover debug prints ('ok1'..'ok4')
            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
            # FIX: actually release the session's resources
            # (original had this commented out).
            sess.close()
def main(argv=sys.argv):
    """Two-step training entry point with an optional label-corrector stage.

    Loads parameters/dataset, builds the graph on ``/gpu:0``, trains the
    EntityLSTM with early-stopping bookkeeping, optionally runs a
    "corrector" second step (step-1 predictions are one-hot encoded and fed
    back as corrector inputs), optionally refines with a CRF at the end,
    and saves results, checkpoints and an F1-summary plot.

    Args:
        argv: command-line arguments; ``argv[1:]`` is parsed for overrides.
    """
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,
                log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Record execution details for the results report
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details']['time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'], model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # Saving the parameter setting to the output model dir,
                # so training can be resumed later with identical settings.
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model before creating FileWriters,
                # otherwise the graph will not appear in TensorBoard.
                model = EntityLSTM(dataset, parameters)

                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type], graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise
                # TensorBoard won't be able to view embeddings.
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for the TensorBoard embedding projector.
                # NOTE(review): 'latin-1' here differs from the UTF-8 used by the
                # sibling main() — confirm this encoding is intended.
                token_list_file = codecs.open(token_list_file_path, 'w', 'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path, 'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(sess, dataset, parameters)

                patience_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2, len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])  #, reshape= True) # defaults to saving all variables
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()

                        if parameters['use_pretrained_model'] and epoch_number == 0:
                            if parameters['use_corrector']:
                                # Step 1: temporarily disable the corrector, restore
                                # the base model and collect its predictions.
                                parameters['use_corrector'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model, model_saver)
                                print('Getting the 3-label predictions from the step1 model.')
                                all_pred_labels, y_pred_for_corrector, y_true_for_corrector, \
                                    output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset, epoch_number,
                                        stats_graph_folder, dataset_filepaths,
                                        for_corrector=True)
                                # Map predicted label strings to corrector label indices
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(len(all_pred_labels[dataset_type])):
                                        indices = [
                                            dataset.label_corrector_to_index[label]
                                            for label in all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(indices)
                                # One-hot encode the predicted corrector label indices
                                label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer()
                                label_binarizer_corrector.fit(
                                    range(max(dataset.index_to_label_corrector.keys()) + 1))
                                predicted_label_corrector_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_corrector_vector_indices[dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[dataset_type]:
                                        predicted_label_corrector_vector_indices[dataset_type].append(
                                            label_binarizer_corrector.transform(label_indices_sequence))
                                parameters['use_corrector'] = True
                            # Restore (and possibly rebuild) the model for step-2 training
                            transition_params_trained, model, glo_step = \
                                train.restore_model_parameters_from_pretrained_model(
                                    parameters, dataset, sess, model, model_saver)
                            # Recreate the writers so they track the restored graph
                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type], graph=sess.graph)
                            embedding_writer = tf.summary.FileWriter(model_folder)
                            # Only the freshly created global-step variable needs
                            # initialization (everything else was restored).
                            init_new_vars_op = tf.initialize_variables([glo_step])
                            sess.run(init_new_vars_op)
                        elif epoch_number != 0:
                            # Train: loop over all training sequences, shuffled
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1

                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time), flush=False)

                        if parameters['use_corrector']:
                            # Evaluate with the corrector inputs swapped to the step-1
                            # predictions, then restore the original inputs afterwards.
                            original_label_corrector_vector_indices = dataset.label_corrector_vector_indices
                            dataset.label_corrector_vector_indices = predicted_label_corrector_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained, parameters,
                                dataset, epoch_number, stats_graph_folder,
                                dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                    stats_graph_folder, epoch_number,
                                                    epoch_start_time, output_filepaths,
                                                    parameters)
                            dataset.label_corrector_vector_indices = original_label_corrector_vector_indices
                        else:
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained, parameters,
                                dataset, epoch_number, stats_graph_folder,
                                dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                    stats_graph_folder, epoch_number,
                                                    epoch_start_time, output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stopping bookkeeping
                        train_f1_score = results['epoch'][epoch_number][0]['train']['f1_score']['micro']
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        test_f1_score = results['epoch'][epoch_number][0]['test']['f1_score']['micro']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)
                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(sess, os.path.join(model_folder, 'best_model.ckpt'))
                            print('updated model to current epoch : epoch {:d}'.format(epoch_number))
                            print('the model is saved in: {:s}'.format(model_folder))
                            ### newly deleted
                        else:
                            patience_counter += 1
                            print("In epoch {:d}, the valid F1 is : {:f}".format(
                                epoch_number, valid_f1_score))
                            print("The last {0} epochs have not shown improvements on the validation set."
                                  .format(patience_counter))

                        # NOTE(review): unlike the sibling main(), running out of
                        # patience sets the early_stop flag but does NOT break the
                        # loop — confirm this is intended.
                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True

                        if epoch_number >= parameters['maximum_number_of_epochs'] and parameters['refine_with_crf']:
                            # Optional final refinement: add a CRF on top and train
                            # for a few additional epochs.
                            model = train.refine_with_crf(parameters, sess, model, model_saver)
                            print('refine model with CRF ...')
                            for additional_epoch in range(parameters['additional_epochs_with_crf']):
                                print('Additional {:d}th epoch'.format(additional_epoch))
                                sequence_numbers = list(range(len(dataset.token_indices['train'])))
                                random.shuffle(sequence_numbers)
                                for sequence_number in sequence_numbers:
                                    transition_params_trained, W_before_crf = train.train_step(
                                        sess, dataset, sequence_number, model,
                                        transition_params_trained, parameters)
                                    step += 1
                                epoch_elapsed_training_time = time.time() - epoch_start_time
                                print('Additional training completed in {0:.2f} seconds'
                                      .format(epoch_elapsed_training_time), flush=False)
                                y_pred, y_true, output_filepaths = train.predict_labels(
                                    sess, model, transition_params_trained, parameters,
                                    dataset, epoch_number, stats_graph_folder,
                                    dataset_filepaths)
                                evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                                        stats_graph_folder, epoch_number,
                                                        epoch_start_time, output_filepaths,
                                                        parameters)
                                summary = sess.run(model.summary_op, feed_dict=None)
                                writers['train'].add_summary(summary, epoch_number)
                                writers['train'].flush()
                                utils.copytree(writers['train'].get_logdir(), model_folder)
                        if epoch_number >= parameters['maximum_number_of_epochs'] and not parameters['refine_with_crf']:
                            break

                    # Plot the F1 summary for whichever training step this run was
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    for k, l in f1_scores.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details']['train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                # Release TensorBoard writers and the session
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()
                sess.close()
def main(argv=sys.argv):
    '''
    NeuroNER main method: parse CLI arguments, load the dataset, build the
    EntityLSTM graph, then run the training + evaluation loop (one iteration
    per epoch) with early stopping, TensorBoard logging, and checkpointing.

    Args:
        argv: command-line argument list; argv[1:] is parsed for
            parameters_filepath (path to the parameters file) and
            output_folder (path to the output folder), among others.
    '''
    arguments = parse_arguments(argv[1:])
    # CLI arguments override values from the parameters file.
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and record execution details for later reporting.
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                # Prepare the output folder layout:
                #   <output_folder>/<dataset_name>/        -> stats + graphs
                #   <output_folder>/<dataset_name>/model/  -> checkpoints
                #   <output_folder>/weights/               -> final weights
                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)

                # saving the parameter setting to the output model dir. For later resuming training
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                # One TensorBoard log folder per dataset split.
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])

                # Persist the dataset (vocabulary, indices) next to the model
                # so that it can be reloaded for prediction/resume.
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                # Configure the TensorBoard embedding projector for token and
                # character embeddings (metadata .tsv files written below).
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    # Pretrained-model weights are restored inside the epoch
                    # loop instead (epoch 0 branch below).
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                patience_counter = 0
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                f1_scores_conll = {
                    'train-F1': [],
                    'valid-F1': [],
                    'test-F1': []
                }
                # CRF transition parameters start random; train_step refines
                # them. Size is num_labels + 2 for the start/end states.
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()

                        # use pre-trained model and epoch_number = 0
                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:
                            if parameters['use_adapter']:
                                # Temporarily disable the adapter so the step1
                                # model can be restored and queried for its
                                # (coarse) 3-label predictions; re-enabled at
                                # the end of this branch.
                                parameters['use_adapter'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_adapter, y_true_for_adapter, \
                                    output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths, for_adapter=True)
                                # use the label2idx mapping (for adapter) in the dataset to transform all_pred_labels
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(
                                            len(all_pred_labels[dataset_type])
                                    ):
                                        indices = [
                                            dataset.
                                            label_adapter_to_index[label]
                                            for label in
                                            all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(
                                            indices)
                                # and use binarizer to transform to ndarray
                                label_binarizer_adapter = sklearn.preprocessing.LabelBinarizer(
                                )
                                label_binarizer_adapter.fit(
                                    range(
                                        max(dataset.index_to_label_adapter.
                                            keys()) + 1))
                                predicted_label_adapter_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_adapter_vector_indices[
                                        dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[
                                            dataset_type]:
                                        predicted_label_adapter_vector_indices[
                                            dataset_type].append(
                                                label_binarizer_adapter.
                                                transform(
                                                    label_indices_sequence))
                                parameters['use_adapter'] = True
                            if parameters['train_model'] and parameters[
                                    'add_class']:
                                # Restore only compatible weights (new output
                                # classes were added); the fresh global step
                                # variable must be initialized separately.
                                transition_params_trained, model, glo_step = \
                                    train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                                init_new_vars_op = tf.initialize_variables(
                                    [glo_step])
                                sess.run(init_new_vars_op)
                            else:
                                transition_params_trained = \
                                    train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)
                            # Recreate the writers after restoring, so the
                            # (possibly rebuilt) graph is logged.
                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                            # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                            embedding_writer = tf.summary.FileWriter(
                                model_folder)
                        # epoch_number != 0, no matter use or not use pre-trained model
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)

                        if parameters[
                                'use_adapter']:  # model evaluation, using adapter
                            # pass the pred_for_adapter as label_indices vector
                            # NOTE: dataset is mutated here and restored right
                            # after prediction/evaluation.
                            original_label_adapter_vector_indices = dataset.label_adapter_vector_indices
                            dataset.label_adapter_vector_indices = predicted_label_adapter_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true,
                                                    stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_adapter_vector_indices = original_label_adapter_vector_indices
                        else:  # model evaluation, not using adapter
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)
                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true,
                                                    stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['weighted']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['weighted']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['weighted']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        train_f1_score_conll = results['epoch'][epoch_number][
                            0]['train']['f1_conll']['micro']
                        valid_f1_score_conll = results['epoch'][epoch_number][
                            0]['valid']['f1_conll']['micro']
                        test_f1_score_conll = results['epoch'][epoch_number][
                            0]['test']['f1_conll']['micro']
                        f1_scores_conll['train-F1'].append(
                            train_f1_score_conll)
                        f1_scores_conll['valid-F1'].append(
                            valid_f1_score_conll)
                        f1_scores_conll['test-F1'].append(test_f1_score_conll)

                        # Patience is measured on the weighted valid F1 score.
                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print(
                                'updated model to current epoch : epoch {:d}'.
                                format(epoch_number))
                            print('the model is saved in: {:s}'.format(
                                model_folder))
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            # save last model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'last_model.ckpt'))
                            print('the last model is saved in: {:s}'.format(
                                model_folder))
                            break
                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break

                    # Summarize F1 scores once training ends normally.
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    print('Sklearn result:')
                    for k, l in f1_scores.items():
                        print(k, l)
                    print('Conll result:')
                    for k, l in f1_scores_conll.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')
                    # TODO: in step 1, for task a, add the best deploy data to step 2 train set, and call script
                    print('(sklearn micro) test F1:')
                    micro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['micro']) for ep in range(epoch_number + 1)
                    ])
                    print(micro_f1)
                    print('(sklearn macro) test F1:')
                    macro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['macro']) for ep in range(epoch_number + 1)
                    ])
                    print(macro_f1)
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                # Always persist results and release writers/session, even
                # after an interrupt.
                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()
                sess.close()  # release the session's resources
def main():
    '''
    Train and evaluate the EntityLSTM model: load parameters and dataset,
    build the TensorFlow graph, then loop over epochs (train, predict on
    train/valid/test, evaluate, checkpoint) with patience-based early stop.
    '''
    parameters, dataset_filepaths = load_parameters()

    # Load dataset
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            # Output layout: ../output/<dataset>_<timestamp>/{model,tensorboard_logs}
            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                  'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in ['train', 'valid', 'test']:
                tensorboard_log_folders[dataset_type] = os.path.join(
                    stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(
                    tensorboard_log_folders[dataset_type])
            # Persist the dataset object (vocabulary, index mappings) so it
            # can be reloaded for later prediction runs.
            pickle.dump(
                dataset,
                open(os.path.join(stats_graph_folder, 'dataset.pickle'),
                     'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in ['train', 'valid', 'test']:
                writers[dataset_type] = tf.summary.FileWriter(
                    tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(
                model_folder
            )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            # Configure the TensorBoard embedding projector for token and
            # character embeddings.
            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(
                token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(
                model_folder,
                'tensorboard_metadata_characters.tsv')  # 'metadata.tsv'
            tensorboard_character_embeddings.metadata_path = os.path.relpath(
                character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer,
                                           embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = open(token_list_file_path, 'w')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(
                    dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = open(character_list_file_path, 'w')
            print('len(dataset.character_to_index): {0}'.format(
                len(dataset.character_to_index)))
            print('len(dataset.index_to_character): {0}'.format(
                len(dataset.index_to_character)))
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(
                        dataset.index_to_character[character_index]))
            character_list_file.close()

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            step = 0
            bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            # CRF transition parameters start random; train_step refines them.
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs']
            )  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    #epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            # Progress dot every 100 steps.
                            print('.', end='', flush=True)
                        #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    y_pred = {}
                    y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type: {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        y_pred[dataset_type], y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
                    # model_options = None

                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print('epoch_elapsed_training_time: {0:.2f} seconds'.
                          format(epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                            stats_graph_folder, epoch_number,
                                            epoch_start_time,
                                            output_filepaths, parameters)

                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(
                            model_folder,
                            'model_{0:05d}.ckpt'.format(epoch_number))
                    )  #, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state)

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
                    # break # debugging
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            # Persist results whether training ended normally or by interrupt.
            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
            sess.close()  # release the session's resources
def main():
    '''
    Early training-script variant: read parameters.ini directly, build the
    EntityLSTM graph with an explicit optimizer/train_op, then loop over
    epochs (train, evaluate via sklearn and the conlleval perl script) with
    patience-based early stop.
    '''
    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(
        conf_parameters)
    # Flatten the sectioned config into a single dict.
    parameters = {}
    for k, v in nested_parameters.items():
        parameters.update(v)
    # configparser returns strings; coerce known keys to int/float/bool.
    for k, v in parameters.items():
        if k in ['remove_unknown_tokens', 'character_embedding_dimension',
                 'character_lstm_hidden_state_dimension',
                 'token_embedding_dimension',
                 'token_lstm_hidden_state_dimension', 'patience',
                 'maximum_number_of_epochs', 'maximum_training_time',
                 'number_of_cpu_threads', 'number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm', 'is_character_lstm_bidirect',
                 'is_token_lstm_bidirect', 'use_crf']:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(
        parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(
        parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(
        parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={'CPU': 1, 'GPU': 1},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
        )
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model = EntityLSTM(dataset, parameters)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            if parameters['optimizer'] == 'adam':
                optimizer = tf.train.AdamOptimizer(1e-3)
            elif parameters['optimizer'] == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(0.005)
            else:
                raise ValueError(
                    "The lr_method parameter must be either adam or sgd.")
            # https://github.com/google/prettytensor/issues/6
            # https://www.tensorflow.org/api_docs/python/framework/graph_collections
            #print('tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) ))
            # https://github.com/blei-lab/edward/issues/286#ref-pullrequest-181330211 : utility function to get all tensorflow variables a node depends on
            grads_and_vars = optimizer.compute_gradients(model.loss)
            # By defining a global_step variable and passing it to the optimizer we allow TensorFlow handle the counting of training steps for us.
            # The global step will be automatically incremented by one every time you execute train_op.
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pretrained token embeddings
            if not parameters['token_pretrained_embedding_filepath'] == '':
                load_token_embeddings(sess, model.W, dataset, parameters)

            estop = False  # early stop
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            # save/initialize execution details
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])
            output_folder = os.path.join('..', 'output')
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(output_folder)
            print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            # CRF transition parameters start random; train_step refines them.
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    # Epoch number derived from the global step count.
                    epoch_number = math.floor(
                        step / len(dataset.token_indices['train']))
                    print('epoch_number: {0}'.format(epoch_number))
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train_step(
                            sess, dataset, sequence_number, train_op,
                            global_step, model, transition_params_trained,
                            parameters)
                        step += 1
                        if sequence_number % 100 == 0:
                            print('.', end='', flush=True)
                            #break

                    # Evaluate model
                    print('step: {0}'.format(step))
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        print('dataset_type: {0}'.format(dataset_type))
                        all_predictions[dataset_type], all_y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = evaluate_model(
                                    sess, dataset, dataset_type, model,
                                    transition_params_trained, step,
                                    stats_graph_folder, epoch_number,
                                    parameters)
                    model_options = None

                    # Save and plot results
                    # TODO: remove uidx
                    uidx = 0
                    results['epoch'][epoch_number] = []
                    results['execution_details']['num_epochs'] = epoch_number
                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print('epoch_elapsed_training_time: {0:02f} seconds'.
                          format(epoch_elapsed_training_time))
                    assess_model.assess_and_save(results, dataset,
                                                 model_options,
                                                 all_predictions, all_y_true,
                                                 stats_graph_folder,
                                                 epoch_number, uidx,
                                                 epoch_start_time)
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder,
                                                  'f1_score')
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder,
                                                  'accuracy_score')

                    # CoNLL evaluation script
                    # Shells out to the standard conlleval perl script and
                    # parses its output back into the results dict.
                    for dataset_type in ['train', 'valid', 'test']:
                        conll_evaluation_script = os.path.join(
                            '.', 'conlleval')
                        conll_output_filepath = '{0}_conll_evaluation.txt'.format(
                            output_filepaths[dataset_type])
                        shell_command = 'perl {0} < {1} > {2}'.format(
                            conll_evaluation_script,
                            output_filepaths[dataset_type],
                            conll_output_filepath)
                        print('shell_command: {0}'.format(shell_command))
                        #subprocess.call([shell_command])
                        os.system(shell_command)
                        conll_parsed_output = utils_nlp.get_parsed_conll_output(
                            conll_output_filepath)
                        print('conll_parsed_output: {0}'.format(
                            conll_parsed_output))
                        results['epoch'][epoch_number][0][dataset_type][
                            'conll'] = conll_parsed_output
                        results['epoch'][epoch_number][0][dataset_type][
                            'f1_conll'] = {}
                        results['epoch'][epoch_number][0][dataset_type][
                            'f1_conll']['micro'] = results['epoch'][
                                epoch_number][0][dataset_type]['conll'][
                                    'all']['f1']
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder,
                                                  'f1_conll', from_json=False)

                    #end_time = time.time()
                    #results['execution_details']['train_duration'] = end_time - start_time
                    #results['execution_details']['train_end'] = end_time

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            # Persist results whether training ended normally or by interrupt.
            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            assess_model.save_results(results, stats_graph_folder)
            sess.close()  # release the session's resources
def main():
    """Train and evaluate the sequence-labelling model.

    Loads parameters and datasets, optionally builds k-fold cross-validation
    splits (parameters['cross_validation']), then for each fold creates a TF
    graph/session, trains the EntityLSTM with early stopping on validation
    micro-F1, and writes models, TensorBoard logs and results to ../output.
    """
    # Set parameters and resolve/validate dataset file paths.
    parameters, conf_parameters = load_parameters()
    pprint(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)
    # Number of cross-validation folds; 1 means a single ordinary run.
    cross_validation = parameters[
        'cross_validation'] if 'cross_validation' in parameters else 1
    valid_fscores = []
    valid_precisions = []
    valid_recalls = []
    for cv in range(0, cross_validation):
        # NOTE(review): CV splits are only assembled when the train path
        # contains "als" -- presumably a dataset-specific convention; confirm.
        if "als" in dataset_filepaths['train'] and cross_validation > 1:
            # Fold `cv` is held out; the remaining fold files are concatenated
            # into a temporary combined train file.
            train_files = list(range(0, cv)) + list(
                range(cv + 1, cross_validation))
            test_file = cv
            file_train = "tmp_combined.train"
            file_valid = "tmp_combined.test"
            output = []
            for i in train_files:
                with open(dataset_filepaths['train'] + "_" + str(i),
                          "r",
                          encoding="utf-8") as file:
                    output.append(file.read())
            with open(file_train, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            output = []
            with open(dataset_filepaths['train'] + "_" + str(test_file),
                      "r",
                      encoding="utf-8") as file:
                output.append(file.read())
            with open(file_valid, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            # Redirect this fold's train/valid paths to the temp files.
            dataset_filepaths['train'] = file_train
            dataset_filepaths['valid'] = file_valid

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=
                True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)
                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_train'])
                # Experiment name encodes language, dataset and timestamp;
                # "_small" marks runs restricted via 'data_to_use'.
                if 'data_to_use' in parameters:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name + "_small",
                        results['execution_details']['time_stamp'])
                else:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name,
                        results['execution_details']['time_stamp'])
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # Persist the exact configuration used for this run.
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                #del dataset.embeddings_matrix
                # Pickle the dataset so the saved model can be reloaded later
                # with the same vocabulary/label mappings.
                if not parameters['use_pretrained_model']:
                    pickle.dump(
                        dataset,
                        open(os.path.join(model_folder, 'dataset.pickle'),
                             'wb'))
                #dataset.load_pretrained_word_embeddings(parameters)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder
                )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                # Configure the TensorBoard embedding projector for token
                # (and optionally character) embeddings.
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')
                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                    )
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')
                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)
                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()
                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(
                        character_list_file_path, 'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(
                                dataset.index_to_character[character_index]))
                    character_list_file.close()
                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)
                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = 0
                    # CRF transitions start random; training replaces them.
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels), len(dataset.unique_labels)
                    )  #TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None
                    )  #parameters['maximum_number_of_epochs']) # defaults to saving all variables
                    epoch_number = 0
                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()
                        if parameters[
                                'use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training",
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                # Every ~20000 training sequences, run an
                                # intermediate evaluation pass; sub_id is a
                                # fractional epoch offset used in filenames
                                # and result keys.
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ",
                                          sub_id)
                                    #model_saver.save(sess,
                                    #                 os.path.join(model_folder, 'model_{0:05d}_{1}.ckpt'.format(epoch_number, len(sequence_numbers)/4/len(sequence_numbers))))
                                    epoch_elapsed_training_time = time.time(
                                    ) - epoch_start_time
                                    print(
                                        'Training completed in {0:.2f} seconds'
                                        .format(epoch_elapsed_training_time),
                                        flush=True)
                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset,
                                        epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)
                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(
                                            model_folder,
                                            'model_{0:07.3f}.ckpt'.format(
                                                epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op,
                                                       feed_dict=None)
                                    writers['train'].add_summary(
                                        summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(
                                        writers['train'].get_logdir(),
                                        model_folder)
                                    # Early stop
                                    valid_f1_score = results['epoch'][
                                        epoch_number][0]['valid']['f1_score'][
                                            'micro']
                                    # valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                                    # valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']
                                    # valid_fscores.append(valid_f1_score)
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                        # previous_best_valid_precision = valid_precision
                                        # previous_best_valid_recall = valid_recall
                                    else:
                                        bad_counter += 1
                                # One gradient step on the next mini-batch.
                                sequence_number = sequence_numbers[
                                    i:i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                        # End-of-epoch evaluation (also runs right after a
                        # pretrained model is restored on epoch 1).
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)
                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)
                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)
                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))
                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)
                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        #valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                        #valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']
                        #valid_fscores.append(valid_f1_score)
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            #previous_best_valid_precision = valid_precision
                            #previous_best_valid_recall = valid_recall
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))
                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break
                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details'][
                            'train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    # Log the traceback and offer to discard the experiment
                    # folder before re-prompting the user.
                    logging.exception("")
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                sess.close()  # release the session's resources
        # Collect this fold's best validation F1 for the CV summary.
        if 'cross_validation' in parameters and parameters[
                'cross_validation'] > 1:
            valid_fscores.append(previous_best_valid_f1_score)
            #valid_precisions.append(previous_best_valid_precision)
            #valid_recalls.append(previous_best_valid_recall)
    if 'cross_validation' in parameters and parameters['cross_validation'] > 1:
        print("mean f1score:", np.mean(valid_fscores))
        #print("mean precision:", np.mean(valid_precisions))
        #print("mean recall:", np.mean(valid_recalls))
        # Write per-fold and mean scores next to the last fold's graphs.
        with codecs.open(os.path.join(stats_graph_folder, "result_cv.txt"),
                         "w") as file:
            file.write("F1score " + ", ".join(map(str, valid_fscores)))
            # file.write("Precision " + valid_precisions)
            # file.write("Recall " + valid_recalls)
            file.write("Mean F1score " + str(np.mean(valid_fscores)))
class NeuroNER(object):
    """High-level NeuroNER interface.

    Loads configuration and datasets, builds the EntityLSTM graph in its own
    TensorFlow session, and exposes fit() for training and predict() for
    tagging raw text.

    Keyword arguments accepted by the constructor mirror the configuration
    keys handled by load_parameters (parameters.ini): data locations
    (pretrained_model_folder, dataset_text_folder, output_folder,
    token_pretrained_embedding_filepath), architecture sizes
    (character/token embedding and LSTM hidden-state dimensions,
    use_character_lstm, use_crf), training hyper-parameters (learning_rate,
    dropout_rate, gradient_clipping_value, optimizer, patience,
    maximum_number_of_epochs, batch-related settings), pretrained-model
    reload_* flags, and misc switches (verbose, debug, tagging_format,
    tokenizer, spacylanguage, plot_format, main_evaluation_mode, ...).
    """

    # Number of predict() calls made so far; used to name the temporary
    # deploy files.  NOTE(review): incremented via `self.prediction_count`,
    # which shadows this class attribute with an instance attribute on
    # first use.
    prediction_count = 0

    def __init__(self, **kwargs):
        """Load parameters and data, build the model, and initialize or
        restore its weights inside a new TensorFlow session."""
        # Set parameters
        print('hello')  # NOTE(review): leftover debug print
        self.parameters, self.conf_parameters = load_parameters(**kwargs)
        self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters)
        self._check_param_compatibility(self.parameters,
                                        self.dataset_filepaths)

        # Load dataset
        self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'],
                                         debug=self.parameters['debug'])
        token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths,
                                                      self.parameters)

        # Launch session. Automatically choose a device
        # if the specified one doesn't exist
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=self.
                                      parameters['number_of_cpu_threads'],
                                      inter_op_parallelism_threads=self.
                                      parameters['number_of_cpu_threads'],
                                      device_count={
                                          'CPU': 1,
                                          'GPU': self.
                                          parameters['number_of_gpus']
                                      },
                                      allow_soft_placement=True,
                                      log_device_placement=False)
        self.sess = tf.Session(config=session_conf)
        with self.sess.as_default():
            # Initialize or load pretrained model
            self.model = EntityLSTM(self.modeldata, self.parameters)
            self.sess.run(tf.global_variables_initializer())
            if self.parameters['use_pretrained_model']:
                self.transition_params_trained = self.model.restore_from_pretrained_model(
                    self.parameters,
                    self.modeldata,
                    self.sess,
                    token_to_vector=token_to_vector)
            else:
                self.model.load_pretrained_token_embeddings(
                    self.sess, self.modeldata, self.parameters,
                    token_to_vector)
                # Random CRF transition matrix until fit() trains it;
                # +2 presumably accounts for implicit start/end labels.
                self.transition_params_trained = np.random.rand(
                    len(self.modeldata.unique_labels) + 2,
                    len(self.modeldata.unique_labels) + 2)

    def _create_stats_graph_folder(self, parameters):
        """ Initialize stats_graph_folder.

        Args:
            parameters (dict): run configuration; uses 'dataset_text_folder'
                and 'output_folder'.

        Returns:
            tuple: (stats_graph_folder path, experiment timestamp string).
        """
        experiment_timestamp = utils.get_current_time_in_miliseconds()
        dataset_name = utils.get_basename_without_extension(
            parameters['dataset_text_folder'])
        model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
        utils.create_folder_if_not_exists(parameters['output_folder'])
        # Folder where to save graphs
        stats_graph_folder = os.path.join(parameters['output_folder'],
                                          model_name)
        utils.create_folder_if_not_exists(stats_graph_folder)
        return stats_graph_folder, experiment_timestamp

    def _get_valid_dataset_filepaths(self,
                                     parameters,
                                     dataset_types=[
                                         'train', 'valid', 'test', 'deploy'
                                     ]):
        """ Get paths for the datasets, converting between CoNLL and brat
        formats as needed so both representations exist and agree.

        NOTE(review): mutable default argument; it is only iterated here,
        but callers should not mutate it.

        Args:
            parameters (dict): run configuration; uses 'dataset_text_folder',
                'tokenizer', 'spacylanguage' and 'tagging_format'.
            dataset_types (list): dataset splits to resolve.

        Returns:
            tuple: (dataset_filepaths dict, dataset_brat_folders dict);
                splits with no data are dropped from both.
        """
        dataset_filepaths = {}
        dataset_brat_folders = {}
        for dataset_type in dataset_types:
            dataset_filepaths[dataset_type] = os.path.join(
                parameters['dataset_text_folder'],
                '{0}.txt'.format(dataset_type))
            # print('dataset_type, dataset_filepaths[dataset_type]: ',dataset_type ,dataset_filepaths[dataset_type])
            dataset_brat_folders[dataset_type] = os.path.join(
                parameters['dataset_text_folder'], dataset_type)
            # print('dataset_type, dataset_brat_folders[dataset_type]: ',dataset_type ,dataset_brat_folders[dataset_type] )
            dataset_compatible_with_brat_filepath = os.path.join(
                parameters['dataset_text_folder'],
                '{0}_compatible_with_brat.txt'.format(dataset_type))
            # print('dataset_type, dataset_compatible_with_brat_filepath: ',dataset_type ,dataset_compatible_with_brat_filepath)

            # Conll file exists
            if os.path.isfile(dataset_filepaths[dataset_type]) \
                    and os.path.getsize(dataset_filepaths[dataset_type]) > 0:
                # Brat text files exist
                if os.path.exists(dataset_brat_folders[dataset_type]) and \
                        len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:
                    # Check compatibility between conll and brat files
                    brat_to_conll.check_brat_annotation_and_text_compatibility(
                        dataset_brat_folders[dataset_type])
                    if os.path.exists(
                            dataset_compatible_with_brat_filepath):
                        dataset_filepaths[
                            dataset_type] = dataset_compatible_with_brat_filepath
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(
                        dataset_filepaths[dataset_type],
                        dataset_brat_folders[dataset_type])
                # Brat text files do not exist
                else:
                    # Populate brat text and annotation files based on conll file
                    conll_to_brat.conll_to_brat(
                        dataset_filepaths[dataset_type],
                        dataset_compatible_with_brat_filepath,
                        dataset_brat_folders[dataset_type],
                        dataset_brat_folders[dataset_type])
                    dataset_filepaths[
                        dataset_type] = dataset_compatible_with_brat_filepath
            # Conll file does not exist
            else:
                # Brat text files exist
                # print(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt')),len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))))
                if os.path.exists(dataset_brat_folders[dataset_type]) \
                        and len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:
                    # print('dataset_type, dataset_brat_folders[dataset_type]: ',dataset_type ,dataset_brat_folders[dataset_type])
                    dataset_filepath_for_tokenizer = os.path.join(
                        parameters['dataset_text_folder'],
                        '{0}_{1}.txt'.format(dataset_type,
                                             parameters['tokenizer']))
                    if os.path.exists(dataset_filepath_for_tokenizer):
                        conll_to_brat.check_compatibility_between_conll_and_brat_text(
                            dataset_filepath_for_tokenizer,
                            dataset_brat_folders[dataset_type])
                    else:
                        # Populate conll file based on brat files
                        brat_to_conll.brat_to_conll(
                            dataset_brat_folders[dataset_type],
                            dataset_filepath_for_tokenizer,
                            parameters['tokenizer'],
                            parameters['spacylanguage'])
                    dataset_filepaths[
                        dataset_type] = dataset_filepath_for_tokenizer
                # Brat text files do not exist
                else:
                    # Neither format is present: drop this split entirely.
                    del dataset_filepaths[dataset_type]
                    del dataset_brat_folders[dataset_type]
                    continue

            if parameters['tagging_format'] == 'bioes':
                # Generate conll file with BIOES format
                bioes_filepath = os.path.join(
                    parameters['dataset_text_folder'],
                    '{0}_bioes.txt'.format(
                        utils.get_basename_without_extension(
                            dataset_filepaths[dataset_type])))
                utils_nlp.convert_conll_from_bio_to_bioes(
                    dataset_filepaths[dataset_type], bioes_filepath)
                dataset_filepaths[dataset_type] = bioes_filepath
        # print('dataset_filepaths:',dataset_filepaths)
        return dataset_filepaths, dataset_brat_folders

    def _check_param_compatibility(self, parameters, dataset_filepaths):
        """ Check parameters are compatible.

        Args:
            parameters (dict): run configuration.
            dataset_filepaths (dict): resolved dataset paths per split.
        """
        # Delegates to the module-level checker.
        check_param_compatibility(parameters, dataset_filepaths)

    def fit(self):
        """ Fit the model.

        Trains with per-epoch evaluation and early stopping on validation
        micro-F1 (patience from parameters); saves checkpoints, TensorBoard
        logs, brat output and a results summary under a fresh stats folder.
        When use_pretrained_model is set and train_model is not, runs a
        single evaluation pass instead of training.
        """
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        modeldata = self.modeldata
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(
            parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        # Persist configuration and dataset mappings alongside checkpoints.
        with open(os.path.join(model_folder, 'parameters.ini'),
                  'w') as parameters_file:
            conf_parameters.write(parameters_file)
        pickle.dump(
            modeldata,
            open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

        tensorboard_log_folder = os.path.join(stats_graph_folder,
                                              'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(
                stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(
                tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(
                tensorboard_log_folders[dataset_type], graph=sess.graph)
        # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
        )
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(
            token_list_file_path, '.')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
        )
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(
            character_list_file_path, '.')

        projector.visualize_embeddings(embedding_writer,
                                       embeddings_projector_config)

        # Write metadata for TensorBoard embeddings
        token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
        for token_index in range(modeldata.vocabulary_size):
            token_list_file.write('{0}\n'.format(
                modeldata.index_to_token[token_index]))
        token_list_file.close()

        character_list_file = codecs.open(character_list_file_path, 'w',
                                          'UTF-8')
        for character_index in range(modeldata.alphabet_size):
            if character_index == modeldata.PADDING_CHARACTER_INDEX:
                character_list_file.write('PADDING\n')
            else:
                character_list_file.write('{0}\n'.format(
                    modeldata.index_to_character[character_index]))
        character_list_file.close()

        # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
        # number of epochs with no improvement on the validation test in terms of F1-score
        bad_counter = 0
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                # print('\nStarting epoch {0}'.format(epoch_number))
                epoch_start_time = time.time()

                # Epoch 0 is evaluation-only (no parameter updates).
                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(modeldata.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, modeldata, sequence_number, model,
                            parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(
                                step / len(sequence_numbers) * 100),
                                  end='\r',
                                  flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time
                # print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time),
                #       flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    modeldata, epoch_number, stats_graph_folder,
                    dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, modeldata, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                # Evaluation-only mode: emit brat output once and stop.
                if parameters['use_pretrained_model'] and not parameters[
                        'train_model']:
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder)
                    break

                # Save model
                model.saver.save(
                    sess,
                    os.path.join(model_folder,
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid'][
                    'f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    # New best: refresh brat output and keep these CRF
                    # transition parameters for later predict() calls.
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder,
                                              overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                # print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                if bad_counter >= parameters['patience']:
                    # print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break
        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True
            # print('Training interrupted')

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()

    def predict(self, text):
        """ Predict entities in raw text.

        Writes the text into a temporary 'deploy' brat/conll dataset, runs a
        prediction step with the current model, converts the output to brat,
        and parses the resulting annotations back out.

        Args:
            text (str): the raw text to tag.

        Returns:
            list: entities extracted from the brat annotation output.
        """
        self.prediction_count += 1

        # On the first call, point the data folder at a temp location and
        # create the output stats folder.
        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder, 'temp_{0}.txt'.format(
                str(self.prediction_count).zfill(5))
        )  #self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
        # print('over here: ',dataset_brat_deploy_filepath)
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)
        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Print and output result
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        # Round-tripping through brat must not alter the input text.
        assert (text == text2)
        return entities

    def get_params(self):
        """Return the run configuration dict."""
        return self.parameters

    def close(self):
        """Release the TensorFlow session (alias for __del__)."""
        self.__del__()

    def __del__(self):
        # Release the session's resources.
        self.sess.close()
    def __init__(self,
                 parameters_filepath=argument_default_value,
                 pretrained_model_folder=argument_default_value,
                 dataset_text_folder=argument_default_value,
                 character_embedding_dimension=argument_default_value,
                 character_lstm_hidden_state_dimension=argument_default_value,
                 check_for_digits_replaced_with_zeros=argument_default_value,
                 check_for_lowercase=argument_default_value,
                 debug=argument_default_value,
                 dropout_rate=argument_default_value,
                 experiment_name=argument_default_value,
                 freeze_token_embeddings=argument_default_value,
                 gradient_clipping_value=argument_default_value,
                 learning_rate=argument_default_value,
                 load_only_pretrained_token_embeddings=argument_default_value,
                 load_all_pretrained_token_embeddings=argument_default_value,
                 main_evaluation_mode=argument_default_value,
                 maximum_number_of_epochs=argument_default_value,
                 number_of_cpu_threads=argument_default_value,
                 number_of_gpus=argument_default_value,
                 optimizer=argument_default_value,
                 output_folder=argument_default_value,
                 patience=argument_default_value,
                 plot_format=argument_default_value,
                 reload_character_embeddings=argument_default_value,
                 reload_character_lstm=argument_default_value,
                 reload_crf=argument_default_value,
                 reload_feedforward=argument_default_value,
                 reload_token_embeddings=argument_default_value,
                 reload_token_lstm=argument_default_value,
                 remap_unknown_tokens_to_unk=argument_default_value,
                 spacylanguage=argument_default_value,
                 tagging_format=argument_default_value,
                 token_embedding_dimension=argument_default_value,
                 token_lstm_hidden_state_dimension=argument_default_value,
                 token_pretrained_embedding_filepath=argument_default_value,
                 tokenizer=argument_default_value,
                 train_model=argument_default_value,
                 use_character_lstm=argument_default_value,
                 use_crf=argument_default_value,
                 use_pretrained_model=argument_default_value,
                 verbose=argument_default_value,
                 argument_default_value=argument_default_value):
        """Build the model from parameters.ini merged with explicit keyword
        overrides, load the dataset, and initialize (or restore) the weights
        inside a new TensorFlow session.

        Every keyword defaults to the module-level sentinel
        argument_default_value, which lets _load_parameters distinguish
        explicitly-passed values from configuration-file defaults.
        """
        # Collect all arguments passed in (stringified, minus `self`).
        arguments = dict(
            (k, str(v)) for k, v in locals().items() if k != 'self')
        # Initialize parameters from the filepath and the passed-in
        # arguments (see parameters.ini).
        parameters, conf_parameters = self._load_parameters(
            arguments['parameters_filepath'], arguments=arguments)
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            parameters)
        # Check parameter compatibility.
        self._check_parameter_compatiblity(parameters, dataset_filepaths)

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)
        '''
        token_to_vector = {
            "token": <ndarray> - the word vector from glove.6B.100d.txt
        }
        '''
        # Launch session
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
        )
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Create model and initialize or load pretrained model
            ### Instantiate the model
            model = EntityLSTM(dataset, parameters)
            ### Initialize the model and restore from pretrained model if needed
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                '''
                Initialize token_embedding_weights from token_to_vector.
                '''
                model.load_pretrained_token_embeddings(sess, dataset,
                                                       parameters,
                                                       token_to_vector)
                '''
                unique_labels: the distinct entity labels present in the
                current (conll) dataset.
                '''
                self.transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
            else:
                self.transition_params_trained = model.restore_from_pretrained_model(
                    parameters, dataset, sess, token_to_vector=token_to_vector)
            # Free the (potentially large) embedding lookup table.
            del token_to_vector

        # TODO: clarify the exact meaning of transition_params_trained
        self.dataset = dataset  # the managed Dataset instance
        self.dataset_brat_folders = dataset_brat_folders  # brat-format data folders <dict>
        self.dataset_filepaths = dataset_filepaths  # conll data filepaths <dict>
        self.model = model  # the model being trained <EntityLSTM>
        self.parameters = parameters  # run configuration values
        self.conf_parameters = conf_parameters  # the config parser used for TensorFlow setup
        self.sess = sess  # the running TensorFlow session
def main(languages):
    """Run a grid of NER training experiments over ``languages``.

    For every combination of (language, embedding type, embedding language,
    character-LSTM flag) this builds a configuration, loads the dataset,
    trains an EntityLSTM with early stopping on validation micro-F1, and
    writes logs, checkpoints and TensorBoard summaries under ``../output``.

    NOTE(review): stdout is redirected to a per-experiment log file, and on a
    clean (non-interrupted) run results are only persisted through the
    KeyboardInterrupt handler's "keep" branch -- confirm this is intended.
    """
    #embeddings_type = ['polyglot', 'fasttext']
    #embeddings_type = ['fasttext', 'fasttext_noOOV']
    embeddings_type = ['fasttext_noOOV']
    character_lstm = [True]
    embedding_language = ['target', 'source']
    combination = product(languages, embeddings_type, embedding_language, character_lstm)
    create_folder_if_not_exists(os.path.join("..", "log"))
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    log_file = os.path.join("..", "log", "experiment-{}.log".format(experiment_timestamp))
    for language, emb_type, emb_language, char_lstm in combination:
        # Build the configuration for this particular experiment.
        conf_parameters = load_parameters()
        conf_parameters = set_datasets(conf_parameters, language)
        conf_parameters.set('ann', 'use_character_lstm', str(char_lstm))
        conf_parameters.set('ann', 'embedding_type', emb_type)
        conf_parameters.set('ann', 'embedding_language', emb_language)
        if emb_type == 'polyglot':
            conf_parameters.set('ann', 'embedding_dimension', str(64))
        elif 'fasttext' in emb_type:
            conf_parameters.set('ann', 'embedding_dimension', str(300))
        else:
            # FIX: the original ``raise("Uknown embedding type")`` raised a plain
            # str, which is itself a TypeError in Python 3 and masked the message.
            raise ValueError("Unknown embedding type")
        if emb_language == 'source':
            conf_parameters.set('dataset', 'language', constants.MAPPING_LANGUAGE[language])
        else:
            conf_parameters.set('dataset', 'language', language)
        parameters, conf_parameters = parse_parameters(conf_parameters)

        # Execution bookkeeping for this run.
        start_time = time.time()
        experiment_timestamp = utils.get_current_time_in_miliseconds()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)
        dataset_name = utils.get_basename_without_extension(parameters['dataset_train'])
        model_name = '{0}_{1}_{2}_{3}_{4}'.format(language, emb_type, char_lstm, emb_language,
                                                  results['execution_details']['time_stamp'])

        # Redirect stdout of this experiment to its own log file.
        sys.stdout = open(os.path.join("..", "log", model_name), "w")
        print(language, emb_type, char_lstm, emb_language)
        with open(log_file, "a") as file:
            file.write("Experiment: {}\n".format(model_name))
            file.write("Start time:{}\n".format(experiment_timestamp))
            file.write("-------------------------------------\n\n")
        pprint(parameters)
        dataset_filepaths = get_valid_dataset_filepaths(parameters)
        check_parameter_compatiblity(parameters, dataset_filepaths)
        previous_best_valid_epoch = -1

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False
            )
            session_conf.gpu_options.allow_growth = True
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Initialize and save execution details
                print(model_name)
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                # del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    # Persist the dataset so the trained model can be reloaded later.
                    pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))
                # dataset.load_pretrained_word_embeddings(parameters)

                # Instantiate the model.
                # Graph initialization should be before FileWriter, otherwise
                # the graph will not appear in TensorBoard.
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise
                # TensorBoard won't be able to view embeddings.
                embedding_writer = tf.summary.FileWriter(model_folder)
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')
                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')
                projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
                token_list_file.close()
                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path, 'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    # bad_counter: number of epochs with no improvement on the
                    # validation set in terms of F1-score.
                    bad_counter = 0
                    previous_best_valid_f1_score = -1
                    # NOTE(review): other code paths in this file size this matrix
                    # with +2 (CRF start/end states) -- confirm which shape
                    # EntityLSTM expects here.
                    transition_params_trained = np.random.rand(len(dataset.unique_labels), len(dataset.unique_labels))
                    model_saver = tf.train.Saver(max_to_keep=None)  # parameters['maximum_number_of_epochs']) # defaults to saving all variables
                    epoch_number = 0
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))
                        epoch_start_time = time.time()
                        if parameters['use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers), parameters['batch_size']), "Training epoch {}".format(epoch_number), mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    # Intermediate evaluation roughly every 20k training sequences.
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ", sub_id)
                                    epoch_elapsed_training_time = time.time() - epoch_start_time
                                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)
                                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number + sub_id, stats_graph_folder, dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)
                                    # Save model
                                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:07.3f}.ckpt'.format(epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op, feed_dict=None)
                                    writers['train'].add_summary(summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(writers['train'].get_logdir(), model_folder)
                                    # Early stop bookkeeping (no break mid-epoch)
                                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1
                                sequence_number = sequence_numbers[i: i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)
                        y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)
                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)
                        # Save model
                        model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))
                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)
                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            previous_best_valid_epoch = epoch_number
                        else:
                            bad_counter += 1
                            print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))
                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break
                        if epoch_number >= parameters['maximum_number_of_epochs']:
                            break
                    keep_only_best_model(model_folder, previous_best_valid_epoch, parameters['maximum_number_of_epochs'] + 1)
                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details']['train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                    sys.stdout.close()
                except Exception:
                    logging.exception("")
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    sys.stdout.close()
                sess.close()  # release the session's resources
        sys.stdout.close()
def main(args):
    """Batch-predict with pretrained models over a set of experiments.

    For every (trained_model, test_set) pair listed in
    ``experiments['experiments'][args.experiment_set]``: load the dataset and
    parameters pickled alongside the trained model, re-index the prediction
    vocabulary against the pretrained embeddings, restore the checkpoint,
    predict labels, and append the CoNLL evaluation output to one combined
    results file under ``../predictions``.
    """
    experiments = utils.load_experiments()
    #parameters, conf_parameters = load_parameters2()
    #if args.file:
    #    parameters['predict_text'] = args.file
    #parameters = process_input(parameters)
    #if not parameters["use_pretrained_model"]:
    #    raise IOError('Set use_pretrained_model parameter to True if you want to predict')
    #dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)
    pprint(experiments)
    time_stamp = utils.get_current_time_in_miliseconds()
    result_file = '{0}_{1}'.format(args.experiment_set + "_" + "results_", time_stamp + ".txt")
    print(result_file)
    with open(os.path.join("../predictions", result_file), "w", encoding="utf-8") as file:
        for elem in experiments['experiments'][args.experiment_set]:
            trained_model = elem[0]
            test = elem[1]
            print("======================")
            print("Train on {0}, test {1}".format(trained_model, test))
            print("======================")
            # Reload the dataset/parameters that were saved with the trained model.
            pretrained_model_folder = os.path.dirname(experiments['models'][trained_model])
            dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
            parameters, conf_parameters = load_parameters(os.path.join(pretrained_model_folder, 'parameters.ini'), verbose=False)
            # Force prediction-only mode against the selected test set.
            parameters['train_model'] = False
            parameters['use_pretrained_model'] = True
            parameters['dataset_predict'] = experiments['datasets'][test]
            parameters['pretrained_model_name'] = "{0}_on_{1}".format(trained_model, test)
            parameters['pretrained_model_checkpoint_filepath'] = experiments['models'][trained_model]
            dataset_filepaths = get_valid_dataset_filepaths(parameters)
            pprint(parameters)
            #sys.exit()
            #if args.file:
            #    parameters['predict_text'] = args.file
            #parameters = process_input(parameters)
            # Load dataset
            #dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
            #dataset.load_vocab_word_embeddings(parameters)
            #pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
            #dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
            #dataset.load_dataset(dataset_filepaths, parameters)
            dataset_type = "predict"
            # Parse only the prediction split of the dataset.
            dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _ = dataset._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'])
            #dataset.load_vocab_word_embeddings(parameters)
            # Rebuild the token -> index map for the prediction vocabulary,
            # skipping the reserved UNK and PADDING indices.
            iteration_number = 0
            dataset.token_to_index = dict()
            dataset.number_of_unknown_tokens = 0
            for token_sentence in tqdm(dataset.tokens['predict']):
                for token in token_sentence:
                    if iteration_number == dataset.UNK_TOKEN_INDEX:
                        iteration_number += 1
                    if iteration_number == dataset.PADDING_TOKEN_INDEX:
                        iteration_number += 1
                    if token == "CD":
                        # NOTE(review): no-op -- looks like a leftover debugger anchor.
                        a = 1
                    if not utils_nlp.is_token_in_pretrained_embeddings(token, dataset.vocab_embeddings, parameters):
                        if parameters['embedding_type'] == 'fasttext':
                            # fastText can produce vectors for OOV tokens, so keep them indexed.
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1
                        else:
                            # Any other embedding type maps OOV tokens to UNK.
                            dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                            dataset.number_of_unknown_tokens += 1
                            dataset.tokens_mapped_to_unk.append(token)
                    else:
                        if token not in dataset.token_to_index:
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1
            dataset_type = "predict"
            # Recompute per-split index structures (tokens, characters, lengths, labels).
            for dataset_type in dataset_filepaths.keys():
                dataset.token_indices[dataset_type] = []
                dataset.characters[dataset_type] = []
                dataset.character_indices[dataset_type] = []
                dataset.token_lengths[dataset_type] = []
                dataset.sequence_lengths[dataset_type] = []
                dataset.longest_token_length_in_sequence[dataset_type] = []
                # character_indices_padded[dataset_type] = []
                for token_sequence in dataset.tokens[dataset_type]:
                    dataset.token_indices[dataset_type].append([dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX) for token in token_sequence])
                    dataset.characters[dataset_type].append([list(token) for token in token_sequence])
                    dataset.character_indices[dataset_type].append(
                        [[dataset.character_to_index.get(character, dataset.UNK_CHARACTER_INDEX) for character in token] for token in token_sequence])
                    dataset.token_lengths[dataset_type].append([len(token) for token in token_sequence])
                    dataset.sequence_lengths[dataset_type].append(len(token_sequence))
                    dataset.longest_token_length_in_sequence[dataset_type].append(max(dataset.token_lengths[dataset_type][-1]))
                    # character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
                    # for temp_token_indices in character_indices[dataset_type][-1]])
                dataset.label_indices[dataset_type] = []
                for label_sequence in dataset.labels[dataset_type]:
                    dataset.label_indices[dataset_type].append([dataset.label_to_index[label] for label in label_sequence])
            # One-hot vector for label padding (the 'O' class).
            tmp_vector = [0] * len(dataset.unique_labels)
            tmp_vector[dataset.label_to_index["O"]] = 1
            dataset.PADDING_LABEL_VECTOR = tmp_vector
            for dataset_type in dataset_filepaths.keys():
                dataset.label_vector_indices[dataset_type] = []
                for label_indices_sequence in dataset.label_indices[dataset_type]:
                    vector_sequence = []
                    for indice in label_indices_sequence:
                        vector = [0] * len(dataset.unique_labels)
                        vector[indice] = 1
                        vector_sequence.append(vector)
                    dataset.label_vector_indices[dataset_type].append(vector_sequence)
            # Create graph and session
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                    allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                    log_device_placement=False,
                )
                session_conf.gpu_options.allow_growth = True
                sess = tf.Session(config=session_conf)
                model = EntityLSTM(dataset, parameters)
                model_saver = tf.train.Saver()
                prediction_folder = os.path.join('..', 'predictions')
                utils.create_folder_if_not_exists(prediction_folder)
                dataset_name = parameters['pretrained_model_name']
                model_name = '{0}_{1}'.format(dataset_name, time_stamp)
                prediction_folder = os.path.join(prediction_folder, model_name)
                utils.create_folder_if_not_exists(prediction_folder)
                epoch_number = 100
                #dataset_name = utils.get_basename_without_extension(parameters['dataset_test'])
                with open(os.path.join(prediction_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                if parameters['use_pretrained_model']:
                    # Restore pretrained model parameters
                    transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    model.load_pretrained_token_embeddings(sess, dataset, parameters)
                    start_time = time.time()
                    results = {}
                    results['epoch'] = {}
                    results['execution_details'] = {}
                    results['execution_details']['train_start'] = start_time
                    results['execution_details']['time_stamp'] = start_time
                    results['execution_details']['early_stop'] = False
                    results['execution_details']['keyboard_interrupt'] = False
                    results['execution_details']['num_epochs'] = epoch_number
                    results['model_options'] = copy.copy(parameters)
                    demo = parameters['pretrained_model_name'] == "demo"
                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, prediction_folder, dataset_filepaths, demo=demo)
                    conll_output_file = evaluate.evaluate_model(results, dataset, y_pred, y_true, prediction_folder, epoch_number, start_time, output_filepaths, parameters)
                    # Append this pair's CoNLL evaluation to the combined results file.
                    file.write(parameters['pretrained_model_name'] + "\n")
                    with open(conll_output_file, "r") as conll_file:
                        conll = conll_file.read()
                    file.write(conll)
                    file.write("\n\n\n")
                    if parameters['pretrained_model_name'] == "demo":
                        print("============")
                        print(" Prediction ")
                        print("============")
                        # y_pred['predict'] is flat; walk it with a running index.
                        i = 0
                        for sentence in dataset.tokens['predict']:
                            for token in sentence:
                                predict_label = dataset.index_to_label[y_pred['predict'][i]]
                                if dataset.index_to_label[y_pred['predict'][i]] != "O":
                                    print(token, predict_label)
                                else:
                                    print(token)
                                i += 1
                            print("")
                else:
                    raise IOError('Set use_pretrained_model parameter to True')
def main(args):
    """Predict with a single pretrained model (optionally on free text from ``args.file``).

    Loads the dataset pickled alongside the pretrained model, re-indexes the
    prediction vocabulary against the pretrained embeddings, restores the
    checkpoint, and writes predictions under ``../predictions``.
    """
    parameters, conf_parameters = load_parameters()
    if args.file:
        parameters['predict_text'] = args.file
    parameters = process_input(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_vocab_word_embeddings(parameters)
    # NOTE(review): the freshly built Dataset above is immediately replaced by
    # the pickled one below, so only load_vocab_word_embeddings' side effects
    # (if any) would survive -- verify the first construction is needed.
    pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
    dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
    dataset.load_dataset(dataset_filepaths, parameters)
    dataset_type = "predict"
    # Parse only the prediction split of the dataset.
    dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _ = dataset._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'])
    #dataset.load_vocab_word_embeddings(parameters)
    # Rebuild the token -> index map for the prediction vocabulary,
    # skipping the reserved UNK and PADDING indices.
    iteration_number = 0
    dataset.token_to_index = dict()
    for token_sentence in dataset.tokens['predict']:
        for token in token_sentence:
            if iteration_number == dataset.UNK_TOKEN_INDEX:
                iteration_number += 1
            if iteration_number == dataset.PADDING_TOKEN_INDEX:
                iteration_number += 1
            if not utils_nlp.is_token_in_pretrained_embeddings(token, dataset.vocab_embeddings, parameters):
                if parameters['embedding_type'] == 'glove':
                    # GloVe has no OOV handling: map unknown tokens to UNK.
                    dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                    dataset.number_of_unknown_tokens += 1
                    dataset.tokens_mapped_to_unk.append(token)
                elif parameters['embedding_type'] == 'fasttext':
                    # fastText can produce vectors for OOV tokens, so keep them indexed.
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
                else:
                    raise AssertionError("Embedding type not recognized")
            else:
                if token not in dataset.token_to_index:
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
    dataset_type = "predict"
    # Recompute per-split index structures (tokens, characters, lengths, labels).
    for dataset_type in dataset_filepaths.keys():
        dataset.token_indices[dataset_type] = []
        dataset.characters[dataset_type] = []
        dataset.character_indices[dataset_type] = []
        dataset.token_lengths[dataset_type] = []
        dataset.sequence_lengths[dataset_type] = []
        dataset.longest_token_length_in_sequence[dataset_type] = []
        # character_indices_padded[dataset_type] = []
        for token_sequence in dataset.tokens[dataset_type]:
            dataset.token_indices[dataset_type].append([dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX) for token in token_sequence])
            dataset.characters[dataset_type].append([list(token) for token in token_sequence])
            dataset.character_indices[dataset_type].append(
                [[dataset.character_to_index.get(character, dataset.UNK_CHARACTER_INDEX) for character in token] for token in token_sequence])
            dataset.token_lengths[dataset_type].append([len(token) for token in token_sequence])
            dataset.sequence_lengths[dataset_type].append(len(token_sequence))
            dataset.longest_token_length_in_sequence[dataset_type].append(max(dataset.token_lengths[dataset_type][-1]))
            # character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
            # for temp_token_indices in character_indices[dataset_type][-1]])
        dataset.label_indices[dataset_type] = []
        for label_sequence in dataset.labels[dataset_type]:
            dataset.label_indices[dataset_type].append([dataset.label_to_index[label] for label in label_sequence])
    # One-hot vector for label padding (the 'O' class).
    tmp_vector = [0] * len(dataset.unique_labels)
    tmp_vector[dataset.label_to_index["O"]] = 1
    dataset.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        dataset.label_vector_indices[dataset_type] = []
        for label_indices_sequence in dataset.label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(dataset.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            dataset.label_vector_indices[dataset_type].append(vector_sequence)
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False,
        )
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        model = EntityLSTM(dataset, parameters)
        model_saver = tf.train.Saver()
        prediction_folder = os.path.join('..', 'predictions')
        utils.create_folder_if_not_exists(prediction_folder)
        dataset_name = parameters['pretrained_model_name']
        model_name = '{0}_{1}'.format(parameters["language"] + "_" + dataset_name, utils.get_current_time_in_miliseconds())
        prediction_folder = os.path.join(prediction_folder, model_name)
        utils.create_folder_if_not_exists(prediction_folder)
        epoch_number = 100
        #dataset_name = utils.get_basename_without_extension(parameters['dataset_test'])
        with open(os.path.join(prediction_folder, 'parameters.ini'), 'w') as parameters_file:
            conf_parameters.write(parameters_file)
        if parameters['use_pretrained_model']:
            # Restore pretrained model parameters
            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            demo = parameters['pretrained_model_name'] == "demo"
            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, prediction_folder, dataset_filepaths, demo=demo)
            if parameters['pretrained_model_name'] == "demo":
                print("============")
                print(" Prediction ")
                print("============")
                # y_pred['predict'] is flat; walk it with a running index.
                i = 0
                for sentence in dataset.tokens['predict']:
                    for token in sentence:
                        predict_label = dataset.index_to_label[y_pred['predict'][i]]
                        if dataset.index_to_label[y_pred['predict'][i]] != "O":
                            print(token, predict_label)
                        else:
                            print(token)
                        i += 1
                    print("")
        else:
            raise IOError('Set use_pretrained_model parameter to True')
def main():
    """Legacy standalone training loop.

    Reads ``parameters.ini`` directly with configparser, loads the dataset,
    trains an EntityLSTM with early stopping on validation micro-F1, and
    saves evaluation results under ``../output``.
    """
    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    # Flatten the section -> {key: value} mapping into a single dict.
    parameters = {}
    for k, v in nested_parameters.items():
        parameters.update(v)
    # Coerce string values from the .ini file to their expected types.
    for k, v in parameters.items():
        if k in ['remove_unknown_tokens', 'character_embedding_dimension',
                 'character_lstm_hidden_state_dimension', 'token_embedding_dimension',
                 'token_lstm_hidden_state_dimension', 'patience',
                 'maximum_number_of_epochs', 'maximum_training_time',
                 'number_of_cpu_threads', 'number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm', 'is_character_lstm_bidirect', 'is_token_lstm_bidirect', 'use_crf']:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)
    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={'CPU': 1, 'GPU': 1},
            allow_soft_placement=True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Instantiate model
            model = EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)
            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
            #print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            # model_folder = os.path.join(stats_graph_folder, 'model')
            # utils.create_folder_if_not_exists(model_folder)
            step = 0
            # bad_counter: epochs without validation F1 improvement.
            bad_counter = 0
            previous_best_valid_f1_score = 0
            # NOTE(review): other code paths in this file size this matrix with +2
            # (CRF start/end states) -- confirm which shape is expected here.
            transition_params_trained = np.random.rand(len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    # Epoch index derived from the global step counter.
                    epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')
                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))
                    # Predict labels using trained model
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type: {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(sess, dataset, dataset_type, model, transition_params_trained, step, stats_graph_folder, epoch_number, parameters)
                        all_predictions[dataset_type], all_y_true[dataset_type], output_filepaths[dataset_type] = prediction_output
                    # model_options = None
                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:.2f} seconds'.format(epoch_elapsed_training_time))
                    results['execution_details']['num_epochs'] = epoch_number
                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, all_predictions, all_y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths)
                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1
                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break
                    # break # debugging
            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')
            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
    sess.close()  # release the session's resources