def predict(self, text): assert False self.prediction_count += 1 if self.prediction_count == 1: self.parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp') self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters) # Update the deploy folder, file, and dataset dataset_type = 'deploy' ### Delete all deployment data for filepath in glob.glob(os.path.join(self.parameters['dataset_text_folder'], '{0}*'.format(dataset_type))): if os.path.isdir(filepath): shutil.rmtree(filepath) else: os.remove(filepath) ### Create brat folder and file dataset_brat_deploy_folder = os.path.join(self.parameters['dataset_text_folder'], dataset_type) utils.create_folder_if_not_exists(dataset_brat_deploy_folder) dataset_brat_deploy_filepath = os.path.join(dataset_brat_deploy_folder, 'temp_{0}.txt'.format(str(self.prediction_count).zfill(5))) with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f: f.write(text) ### Update deploy filepaths dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters, dataset_types=[dataset_type]) self.dataset_filepaths.update(dataset_filepaths) self.dataset_brat_folders.update(dataset_brat_folders) ### Update the dataset for the new deploy set self.dataset.update_dataset(self.dataset_filepaths, [dataset_type]) # Predict labels and output brat output_filepaths = {} prediction_output = train.prediction_step(self.sess, self.dataset, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths) _, _, output_filepaths[dataset_type] = prediction_output conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True) # Print and output result text_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', os.path.basename(dataset_brat_deploy_filepath)) annotation_filepath = os.path.join(self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(utils.get_basename_without_extension(dataset_brat_deploy_filepath))) text2, entities = brat_to_conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True) assert(text == text2) return entities
def fit(self): parameters = self.parameters conf_parameters = self.conf_parameters dataset_filepaths = self.dataset_filepaths dataset = self.dataset dataset_brat_folders = self.dataset_brat_folders sess = self.sess model = self.model transition_params_trained = self.transition_params_trained stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters) # Initialize and save execution details start_time = time.time() results = {} results['epoch'] = {} results['execution_details'] = {} results['execution_details']['train_start'] = start_time results['execution_details']['time_stamp'] = experiment_timestamp results['execution_details']['early_stop'] = False results['execution_details']['keyboard_interrupt'] = False results['execution_details']['num_epochs'] = 0 results['model_options'] = copy.copy(parameters) model_folder = os.path.join(stats_graph_folder, 'model') utils.create_folder_if_not_exists(model_folder) with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file: conf_parameters.write(parameters_file) pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb')) tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs') utils.create_folder_if_not_exists(tensorboard_log_folder) tensorboard_log_folders = {} for dataset_type in dataset_filepaths.keys(): tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type) utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type]) # Instantiate the writers for TensorBoard writers = {} for dataset_type in dataset_filepaths.keys(): writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph) embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings embeddings_projector_config = projector.ProjectorConfig() tensorboard_token_embeddings = embeddings_projector_config.embeddings.add() tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv') tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..') tensorboard_character_embeddings = embeddings_projector_config.embeddings.add() tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv') tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..') projector.visualize_embeddings(embedding_writer, embeddings_projector_config) # Write metadata for TensorBoard embeddings token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8') for token_index in range(dataset.vocabulary_size): token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index])) token_list_file.close() character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8') for character_index in range(dataset.alphabet_size): if character_index == dataset.PADDING_CHARACTER_INDEX: character_list_file.write('PADDING\n') else: character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index])) character_list_file.close() # Start training + evaluation loop. Each iteration corresponds to 1 epoch. bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score previous_best_valid_f1_score = 0 epoch_number = -1 try: while True: step = 0 epoch_number += 1 print('\nStarting epoch {0}'.format(epoch_number)) epoch_start_time = time.time() if epoch_number != 0: # Train model: loop over all sequences of training set with shuffling sequence_numbers=list(range(len(dataset.token_indices['train']))) random.shuffle(sequence_numbers) for sequence_number in sequence_numbers: transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters) step += 1 if step % 10 == 0: print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True) epoch_elapsed_training_time = time.time() - epoch_start_time print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True) y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths) # Evaluate model: save and plot results evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters) if parameters['use_pretrained_model'] and not parameters['train_model']: conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder) break # Save model model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number))) # Save TensorBoard logs summary = sess.run(model.summary_op, feed_dict=None) writers['train'].add_summary(summary, epoch_number) writers['train'].flush() utils.copytree(writers['train'].get_logdir(), model_folder) # Early stop valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro'] if valid_f1_score > previous_best_valid_f1_score: bad_counter = 0 previous_best_valid_f1_score = valid_f1_score conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True) self.transition_params_trained = transition_params_trained else: bad_counter += 1 print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter)) if bad_counter >= parameters['patience']: print('Early Stop!') results['execution_details']['early_stop'] = True break if epoch_number >= parameters['maximum_number_of_epochs']: break except KeyboardInterrupt: results['execution_details']['keyboard_interrupt'] = True print('Training interrupted') print('Finishing the experiment') end_time = time.time() results['execution_details']['train_duration'] = end_time - start_time results['execution_details']['train_end'] = end_time evaluate.save_results(results, stats_graph_folder) for dataset_type in dataset_filepaths.keys(): writers[dataset_type].close()
def main(): parameters, conf_parameters = load_parameters() dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters) check_parameter_compatiblity(parameters, dataset_filepaths) # Load dataset dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug']) dataset.load_dataset(dataset_filepaths, parameters) # Create graph and session with tf.Graph().as_default(): session_conf = tf.ConfigProto( intra_op_parallelism_threads=parameters['number_of_cpu_threads'], inter_op_parallelism_threads=parameters['number_of_cpu_threads'], device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']}, allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist log_device_placement=False ) sess = tf.Session(config=session_conf) with sess.as_default(): # Initialize and save execution details start_time = time.time() experiment_timestamp = utils.get_current_time_in_miliseconds() results = {} results['epoch'] = {} results['execution_details'] = {} results['execution_details']['train_start'] = start_time results['execution_details']['time_stamp'] = experiment_timestamp results['execution_details']['early_stop'] = False results['execution_details']['keyboard_interrupt'] = False results['execution_details']['num_epochs'] = 0 results['model_options'] = copy.copy(parameters) dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder']) model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp']) output_folder=os.path.join('..', 'output') utils.create_folder_if_not_exists(output_folder) stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs utils.create_folder_if_not_exists(stats_graph_folder) model_folder = os.path.join(stats_graph_folder, 'model') utils.create_folder_if_not_exists(model_folder) with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file: conf_parameters.write(parameters_file) tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs') utils.create_folder_if_not_exists(tensorboard_log_folder) tensorboard_log_folders = {} for dataset_type in dataset_filepaths.keys(): tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type) utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type]) pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb')) # Instantiate the model # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard model = EntityLSTM(dataset, parameters) # Instantiate the writers for TensorBoard writers = {} for dataset_type in dataset_filepaths.keys(): writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph) embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings embeddings_projector_config = projector.ProjectorConfig() tensorboard_token_embeddings = embeddings_projector_config.embeddings.add() tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv') tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..') tensorboard_character_embeddings = embeddings_projector_config.embeddings.add() tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv') tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..') projector.visualize_embeddings(embedding_writer, embeddings_projector_config) # Write metadata for TensorBoard embeddings token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8') for token_index in range(dataset.vocabulary_size): token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index])) token_list_file.close() character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8') for character_index in range(dataset.alphabet_size): if character_index == dataset.PADDING_CHARACTER_INDEX: character_list_file.write('PADDING\n') else: character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index])) character_list_file.close() # Initialize the model sess.run(tf.global_variables_initializer()) if not parameters['use_pretrained_model']: model.load_pretrained_token_embeddings(sess, dataset, parameters) # Start training + evaluation loop. Each iteration corresponds to 1 epoch. bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score previous_best_valid_f1_score = 0 transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2) model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs']) # defaults to saving all variables epoch_number = -1 try: while True: step = 0 epoch_number += 1 print('\nStarting epoch {0}'.format(epoch_number)) epoch_start_time = time.time() if parameters['use_pretrained_model'] and epoch_number == 0: # Restore pretrained model parameters transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver) elif epoch_number != 0: # Train model: loop over all sequences of training set with shuffling sequence_numbers=list(range(len(dataset.token_indices['train']))) random.shuffle(sequence_numbers) for sequence_number in sequence_numbers: transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters) step += 1 if step % 10 == 0: print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True) epoch_elapsed_training_time = time.time() - epoch_start_time print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True) y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths) # Evaluate model: save and plot results evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters) if parameters['use_pretrained_model'] and not parameters['train_model']: conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder) break # Save model model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number))) # Save TensorBoard logs summary = sess.run(model.summary_op, feed_dict=None) writers['train'].add_summary(summary, epoch_number) writers['train'].flush() utils.copytree(writers['train'].get_logdir(), model_folder) # Early stop valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro'] if valid_f1_score > previous_best_valid_f1_score: bad_counter = 0 previous_best_valid_f1_score = valid_f1_score conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True) else: bad_counter += 1 print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter)) if bad_counter >= parameters['patience']: print('Early Stop!') results['execution_details']['early_stop'] = True break if epoch_number >= parameters['maximum_number_of_epochs']: break except KeyboardInterrupt: results['execution_details']['keyboard_interrupt'] = True print('Training interrupted') print('Finishing the experiment') end_time = time.time() results['execution_details']['train_duration'] = end_time - start_time results['execution_details']['train_end'] = end_time print('ok1') evaluate.save_results(results, stats_graph_folder) print('ok2') print('ok3') #sess.close() # release the session's resources print('ok4')
def fit(self): ''' Dùng để train data ''' parameters = self.parameters conf_parameters = self.conf_parameters dataset_filepaths = self.dataset_filepaths dataset = self.dataset dataset_brat_folders = self.dataset_brat_folders sess = self.sess model = self.model transition_params_trained = self.transition_params_trained stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters) # Khởi tạo và lưu các thông tin của lần chạy start_time = time.time() results = {} results['epoch'] = {} ''' An epoch, in Machine Learning, is the entire processing by the learning algorithm of the entire train-set. Ex: The MNIST train set is composed by 55000 samples. Once the algorithm processed all those 55000 samples an epoch is passed. ''' results['execution_details'] = {} results['execution_details']['train_start'] = start_time # Thời gian bắt đầu chạy results['execution_details']['time_stamp'] = experiment_timestamp # Nhãn thời gian results['execution_details']['early_stop'] = False # Cho biết có lỗi xảy ra nên bị dừng sớm ko results['execution_details']['keyboard_interrupt'] = False # Cho biết có bị dừng bởi keyboard results['execution_details']['num_epochs'] = 0 # Số lượng epoch đã chạy results['model_options'] = copy.copy(parameters) # Các tham số model_folder = os.path.join(stats_graph_folder, 'model') # output/en.../model utils.create_folder_if_not_exists(model_folder) # Save value cac parameters vao file parameters.ini with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file: conf_parameters.write(parameters_file) # Log các tham số ra file pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb')) # Dump dataset thành pickle file để lần sau chạy # Tạo folder tensorboard logs để dùng cho việc vẽ biểu đồ sau này tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs') # folder lưu file log của tensorboard -> dùng cho việc plot biểu đồ lên utils.create_folder_if_not_exists(tensorboard_log_folder) tensorboard_log_folders = {} for dataset_type in dataset_filepaths.keys(): tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type) utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type]) # Khởi tạo các writers cho tensorboard writers = {} # Có nhiều nhất 4 writers train, test, valid, deploy for dataset_type in dataset_filepaths.keys(): writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph) embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings # Dùng cho việc visualize embedding bằng tensorboard embeddings_projector_config = projector.ProjectorConfig() tensorboard_token_embeddings = embeddings_projector_config.embeddings.add() tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv') tensorboard_token_embeddings.metadata_path = 'tensorboard_metadata_tokens.tsv'#os.path.relpath(token_list_file_path, '..') tensorboard_character_embeddings = embeddings_projector_config.embeddings.add() tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv') tensorboard_character_embeddings.metadata_path = 'tensorboard_metadata_characters.tsv'#os.path.relpath(character_list_file_path, '..') # Saves a configuration file that TensorBoard will read during startup. projector.visualize_embeddings(embedding_writer, embeddings_projector_config) # Ghi token vào file tsv dùng làm metadata cho embedding token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8') for token_index in range(dataset.vocabulary_size): token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index])) token_list_file.close() # Ghi characters vào file tsv dùng làm metadata cho embedding character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8') for character_index in range(dataset.alphabet_size): if character_index == dataset.PADDING_CHARACTER_INDEX: character_list_file.write('PADDING\n') else: character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index])) character_list_file.close() # Start training + evaluation loop. Each iteration corresponds to 1 epoch. bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score previous_best_valid_f1_score = 0 # f1-Score tốt nhất ở các lần chạy trước epoch_number = -1 try: while True: step = 0 epoch_number += 1 print('\nStarting epoch {0}'.format(epoch_number)) epoch_start_time = time.time() if epoch_number != 0: # Train model: loop over all sequences of training set with shuffling sequence_numbers=list(range(len(dataset.token_indices['train']))) print("----****____") print(dataset.token_indices['train'][:10]) random.shuffle(sequence_numbers) # Thuc hien train for sequence_number in sequence_numbers: transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters) step += 1 if step % 10 == 0: print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True) # Tinh thoi gian thuc hien 1 epoch epoch_elapsed_training_time = time.time() - epoch_start_time print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True) y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths) # Evaluate model: save and plot results evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters) if parameters['use_pretrained_model'] and not parameters['train_model']: conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder) break # Save model model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number))) # Save TensorBoard logs summary = sess.run(model.summary_op, feed_dict=None) writers['train'].add_summary(summary, epoch_number) writers['train'].flush() utils.copytree(writers['train'].get_logdir(), model_folder) # Early stop valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro'] # If do chinh xac cua epoch > do chinh xac cua epoch truoc if valid_f1_score > previous_best_valid_f1_score: bad_counter = 0 previous_best_valid_f1_score = valid_f1_score conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True) self.transition_params_trained = transition_params_trained else: bad_counter += 1 print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter)) # If bad_counter den mot muc gioi han parameters['patience'] = 10 (gia tri khoi tao) finish train if bad_counter >= parameters['patience']: print('Early Stop!') results['execution_details']['early_stop'] = True break # Neu so epoch >= so luong epoch toi da quy dinh --> ket thuc train if epoch_number >= parameters['maximum_number_of_epochs']: break except KeyboardInterrupt: results['execution_details']['keyboard_interrupt'] = True print('Training interrupted') # Ket thuc train luu cac tham so time, ket qua print('Finishing the experiment') end_time = time.time() results['execution_details']['train_duration'] = end_time - start_time results['execution_details']['train_end'] = end_time evaluate.save_results(results, stats_graph_folder) for dataset_type in dataset_filepaths.keys(): writers[dataset_type].close()