# Imports assumed by the code below: standard library, TF1-era APIs, and the
# NeuroNER-internal modules (dataset, train, evaluate, utils, ...). The exact
# module layout is an assumption based on how the names are used in this file.
import codecs
import copy
import glob
import os
import pickle
import random
import shutil
import time
from pprint import pprint

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import brat_to_conll
import conll_to_brat
import dataset
import dataset as ds  # alias used by the CLI-style __init__ at the end of this file
import evaluate
import neuromodel  # trim_model_checkpoint() references neuromodel.load_parameters
import train
import utils
import utils_nlp
import utils_tf
from entity_lstm import EntityLSTM

# load_parameters() and check_param_compatibility() are called below but not
# shown in this excerpt; they are assumed to be defined elsewhere in the module.

# Sentinel meaning "argument not supplied" for the CLI-style constructor below
# (assumed definition, matching how the explicit-argument __init__ uses it).
argument_default_value = 'argument_default_value'
def trim_model_checkpoint(parameters_filepath, dataset_filepath,
                          input_checkpoint_filepath,
                          output_checkpoint_filepath, verbose=False):
    '''
    Remove all token embeddings except UNK.
    '''
    parameters, _ = neuromodel.load_parameters(
        parameters_filepath=parameters_filepath)
    with open(dataset_filepath, 'rb') as f:
        dataset = pickle.load(f)
    model = EntityLSTM(dataset, parameters)

    with tf.Session() as sess:
        model_saver = tf.train.Saver()  # defaults to saving all variables

        # Restore the pretrained model. Works only when the dimensions of the
        # tensor variables match.
        model_saver.restore(sess, input_checkpoint_filepath)

        # Get the pretrained embeddings
        token_embedding_weights = sess.run(model.token_embedding_weights)

        # Resize the token embedding weights down to a single row (UNK only)
        utils_tf.resize_tensor_variable(
            sess, model.token_embedding_weights,
            [1, parameters['token_embedding_dimension']])

        initial_weights = sess.run(model.token_embedding_weights)
        initial_weights[dataset.UNK_TOKEN_INDEX] = token_embedding_weights[
            dataset.UNK_TOKEN_INDEX]
        sess.run(tf.assign(model.token_embedding_weights, initial_weights,
                           validate_shape=False))

        token_embedding_weights = sess.run(model.token_embedding_weights)
        if verbose:
            print("token_embedding_weights: {0}".format(
                token_embedding_weights))
        model_saver.save(sess, output_checkpoint_filepath)

    dataset.__dict__['vocabulary_size'] = 1
    with open(dataset_filepath, 'wb') as f:
        pickle.dump(dataset, f)
    if verbose:
        pprint(dataset.__dict__)
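# Hypothetical usage sketch for trim_model_checkpoint(): the paths below are
# placeholders for an existing experiment folder, not files shipped with the
# project.
#
# trim_model_checkpoint(
#     parameters_filepath=os.path.join('.', 'parameters.ini'),
#     dataset_filepath=os.path.join('.', 'output', 'my_model', 'dataset.pickle'),
#     input_checkpoint_filepath=os.path.join('.', 'output', 'my_model', 'model_00010.ckpt'),
#     output_checkpoint_filepath=os.path.join('.', 'output', 'my_model', 'model_trimmed.ckpt'),
#     verbose=True)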
class NeuroNER(object):
    """
    NeuroNER model.

    Args:
        parameters_filepath (type): description
        pretrained_model_folder (type): description
        dataset_text_folder (type): description
        character_embedding_dimension (type): description
        character_lstm_hidden_state_dimension (type): description
        check_for_digits_replaced_with_zeros (type): description
        check_for_lowercase (type): description
        debug (type): description
        dropout_rate (type): description
        experiment_name (type): description
        freeze_token_embeddings (type): description
        gradient_clipping_value (type): description
        learning_rate (type): description
        load_only_pretrained_token_embeddings (type): description
        load_all_pretrained_token_embeddings (type): description
        main_evaluation_mode (type): description
        maximum_number_of_epochs (type): description
        number_of_cpu_threads (type): description
        number_of_gpus (type): description
        optimizer (type): description
        output_folder (type): description
        output_scores (bool): description
        patience (type): description
        plot_format (type): description
        reload_character_embeddings (type): description
        reload_character_lstm (type): description
        reload_crf (type): description
        reload_feedforward (type): description
        reload_token_embeddings (type): description
        reload_token_lstm (type): description
        remap_unknown_tokens_to_unk (type): description
        spacylanguage (type): description
        tagging_format (type): description
        token_embedding_dimension (type): description
        token_lstm_hidden_state_dimension (type): description
        token_pretrained_embedding_filepath (type): description
        tokenizer (type): description
        train_model (type): description
        use_character_lstm (type): description
        use_crf (type): description
        use_pretrained_model (type): description
        verbose (type): description
    """
    prediction_count = 0

    def __init__(self, **kwargs):
        # Set parameters
        self.parameters, self.conf_parameters = load_parameters(**kwargs)
        self.dataset_filepaths, self.dataset_brat_folders = \
            self._get_valid_dataset_filepaths(self.parameters)
        self._check_param_compatibility(self.parameters,
                                        self.dataset_filepaths)

        # Load dataset
        self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'],
                                         debug=self.parameters['debug'])
        token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths,
                                                      self.parameters)

        # Launch session. allow_soft_placement automatically falls back to an
        # existing device if the specified one doesn't exist.
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': self.parameters['number_of_gpus']},
            allow_soft_placement=True,
            log_device_placement=False)
        self.sess = tf.Session(config=session_conf)

        with self.sess.as_default():
            # Initialize or load pretrained model
            self.model = EntityLSTM(self.modeldata, self.parameters)
            self.sess.run(tf.global_variables_initializer())
            if self.parameters['use_pretrained_model']:
                self.transition_params_trained = \
                    self.model.restore_from_pretrained_model(
                        self.parameters, self.modeldata, self.sess,
                        token_to_vector=token_to_vector)
            else:
                self.model.load_pretrained_token_embeddings(
                    self.sess, self.modeldata, self.parameters,
                    token_to_vector)
                self.transition_params_trained = np.random.rand(
                    len(self.modeldata.unique_labels) + 2,
                    len(self.modeldata.unique_labels) + 2)

    def _create_stats_graph_folder(self, parameters):
        """
        Initialize stats_graph_folder.

        Args:
            parameters (type): description.
""" experiment_timestamp = utils.get_current_time_in_miliseconds() dataset_name = utils.get_basename_without_extension( parameters['dataset_text_folder']) model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp) utils.create_folder_if_not_exists(parameters['output_folder']) # Folder where to save graphs stats_graph_folder = os.path.join(parameters['output_folder'], model_name) utils.create_folder_if_not_exists(stats_graph_folder) return stats_graph_folder, experiment_timestamp def _get_valid_dataset_filepaths(self, parameters, dataset_types=[ 'train', 'valid', 'test', 'deploy' ]): """ Get paths for the datasets. Args: parameters (type): description. dataset_types (type): description. """ dataset_filepaths = {} dataset_brat_folders = {} for dataset_type in dataset_types: dataset_filepaths[dataset_type] = os.path.join( parameters['dataset_text_folder'], '{0}.txt'.format(dataset_type)) dataset_brat_folders[dataset_type] = os.path.join( parameters['dataset_text_folder'], dataset_type) dataset_compatible_with_brat_filepath = os.path.join( parameters['dataset_text_folder'], '{0}_compatible_with_brat.txt'.format(dataset_type)) # Conll file exists if os.path.isfile(dataset_filepaths[dataset_type]) \ and os.path.getsize(dataset_filepaths[dataset_type]) > 0: # Brat text files exist if os.path.exists(dataset_brat_folders[dataset_type]) and \ len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0: # Check compatibility between conll and brat files brat_to_conll.check_brat_annotation_and_text_compatibility( dataset_brat_folders[dataset_type]) if os.path.exists(dataset_compatible_with_brat_filepath): dataset_filepaths[ dataset_type] = dataset_compatible_with_brat_filepath conll_to_brat.check_compatibility_between_conll_and_brat_text( dataset_filepaths[dataset_type], dataset_brat_folders[dataset_type]) # Brat text files do not exist else: # Populate brat text and annotation files based on conll file conll_to_brat.conll_to_brat( dataset_filepaths[dataset_type], dataset_compatible_with_brat_filepath, dataset_brat_folders[dataset_type], dataset_brat_folders[dataset_type]) dataset_filepaths[ dataset_type] = dataset_compatible_with_brat_filepath # Conll file does not exist else: # Brat text files exist if os.path.exists(dataset_brat_folders[dataset_type]) \ and len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0: dataset_filepath_for_tokenizer = os.path.join( parameters['dataset_text_folder'], '{0}_{1}.txt'.format(dataset_type, parameters['tokenizer'])) if os.path.exists(dataset_filepath_for_tokenizer): conll_to_brat.check_compatibility_between_conll_and_brat_text( dataset_filepath_for_tokenizer, dataset_brat_folders[dataset_type]) else: # Populate conll file based on brat files brat_to_conll.brat_to_conll( dataset_brat_folders[dataset_type], dataset_filepath_for_tokenizer, parameters['tokenizer'], parameters['spacylanguage']) dataset_filepaths[ dataset_type] = dataset_filepath_for_tokenizer # Brat text files do not exist else: del dataset_filepaths[dataset_type] del dataset_brat_folders[dataset_type] continue if parameters['tagging_format'] == 'bioes': # Generate conll file with BIOES format bioes_filepath = os.path.join( parameters['dataset_text_folder'], '{0}_bioes.txt'.format( utils.get_basename_without_extension( dataset_filepaths[dataset_type]))) utils_nlp.convert_conll_from_bio_to_bioes( dataset_filepaths[dataset_type], bioes_filepath) dataset_filepaths[dataset_type] = bioes_filepath return dataset_filepaths, dataset_brat_folders def 
    def _check_param_compatibility(self, parameters, dataset_filepaths):
        """
        Check that the parameters are compatible.

        Args:
            parameters (type): description.
            dataset_filepaths (type): description.
        """
        check_param_compatibility(parameters, dataset_filepaths)

    def fit(self):
        """
        Fit the model.
        """
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        modeldata = self.modeldata
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = \
            self._create_stats_graph_folder(parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        with open(os.path.join(model_folder, 'parameters.ini'),
                  'w') as parameters_file:
            conf_parameters.write(parameters_file)
        with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as f:
            pickle.dump(modeldata, f)

        tensorboard_log_folder = os.path.join(stats_graph_folder,
                                              'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(
                stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(
                tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(
                tensorboard_log_folders[dataset_type], graph=sess.graph)
        # embedding_writer has to write in model_folder, otherwise TensorBoard
        # won't be able to view the embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = \
            embeddings_projector_config.embeddings.add()
        tensorboard_token_embeddings.tensor_name = \
            model.token_embedding_weights.name
        token_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(
            token_list_file_path, '.')

        tensorboard_character_embeddings = \
            embeddings_projector_config.embeddings.add()
        tensorboard_character_embeddings.tensor_name = \
            model.character_embedding_weights.name
        character_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(
            character_list_file_path, '.')

        projector.visualize_embeddings(embedding_writer,
                                       embeddings_projector_config)

        # Write metadata for the TensorBoard embeddings
        token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
        for token_index in range(modeldata.vocabulary_size):
            token_list_file.write('{0}\n'.format(
                modeldata.index_to_token[token_index]))
        token_list_file.close()

        character_list_file = codecs.open(character_list_file_path, 'w',
                                          'UTF-8')
        for character_index in range(modeldata.alphabet_size):
            if character_index == modeldata.PADDING_CHARACTER_INDEX:
                character_list_file.write('PADDING\n')
            else:
                character_list_file.write('{0}\n'.format(
                    modeldata.index_to_character[character_index]))
        character_list_file.close()

        # Start the training + evaluation loop. Each iteration corresponds to
        # one epoch. bad_counter tracks the number of epochs with no
        # improvement on the validation set in terms of F1 score.
        bad_counter = 0
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                print('\nStarting epoch {0}'.format(epoch_number))

                epoch_start_time = time.time()

                if epoch_number != 0:
                    # Train model: loop over all sequences of the training
                    # set with shuffling
                    sequence_numbers = list(
                        range(len(modeldata.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, modeldata, sequence_number, model,
                            parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(
                                step / len(sequence_numbers) * 100),
                                  end='\r', flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time
                print('Training completed in {0:.2f} seconds'.format(
                    epoch_elapsed_training_time), flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    modeldata, epoch_number, stats_graph_folder,
                    dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, modeldata, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                if parameters['use_pretrained_model'] and \
                        not parameters['train_model']:
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder)
                    break

                # Save model
                model.saver.save(
                    sess,
                    os.path.join(model_folder,
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid'][
                    'f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder,
                                              overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                    print("The last {0} epochs have not shown improvements "
                          "on the validation set.".format(bad_counter))

                if bad_counter >= parameters['patience']:
                    print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break
        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True
            print('Training interrupted')

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()

    def predict(self, text):
        """
        Predict the named entities in the given text.

        Args:
            text (str): Description.
""" self.prediction_count += 1 if self.prediction_count == 1: self.parameters['dataset_text_folder'] = os.path.join( '.', 'data', 'temp') self.stats_graph_folder, _ = self._create_stats_graph_folder( self.parameters) # Update the deploy folder, file, and modeldata dataset_type = 'deploy' # Delete all deployment data for filepath in glob.glob( os.path.join(self.parameters['dataset_text_folder'], '{0}*'.format(dataset_type))): if os.path.isdir(filepath): shutil.rmtree(filepath) else: os.remove(filepath) # Create brat folder and file dataset_brat_deploy_folder = os.path.join( self.parameters['dataset_text_folder'], dataset_type) utils.create_folder_if_not_exists(dataset_brat_deploy_folder) dataset_brat_deploy_filepath = os.path.join( dataset_brat_deploy_folder, 'temp_{0}.txt'.format(str(self.prediction_count).zfill(5))) #self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder) with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f: f.write(text) # Update deploy filepaths dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths( self.parameters, dataset_types=[dataset_type]) self.dataset_filepaths.update(dataset_filepaths) self.dataset_brat_folders.update(dataset_brat_folders) # Update the dataset for the new deploy set self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type]) # Predict labels and output brat output_filepaths = {} prediction_output = train.prediction_step( self.sess, self.modeldata, dataset_type, self.model, self.transition_params_trained, self.stats_graph_folder, self.prediction_count, self.parameters, self.dataset_filepaths) _, _, output_filepaths[dataset_type] = prediction_output conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders, self.stats_graph_folder, overwrite=True) # Print and output result text_filepath = os.path.join( self.stats_graph_folder, 'brat', 'deploy', os.path.basename(dataset_brat_deploy_filepath)) annotation_filepath = os.path.join( self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format( utils.get_basename_without_extension( dataset_brat_deploy_filepath))) text2, entities = brat_to_conll.get_entities_from_brat( text_filepath, annotation_filepath, verbose=True) assert (text == text2) return entities def get_params(self): return self.parameters def close(self): self.__del__() def __del__(self): self.sess.close()
# Alternative, CLI-style constructor variant: every parameter is explicit and
# defaults to the argument_default_value sentinel instead of arriving through
# **kwargs as in NeuroNER.__init__ above.
def __init__(self,
             parameters_filepath=argument_default_value,
             pretrained_model_folder=argument_default_value,
             dataset_text_folder=argument_default_value,
             character_embedding_dimension=argument_default_value,
             character_lstm_hidden_state_dimension=argument_default_value,
             check_for_digits_replaced_with_zeros=argument_default_value,
             check_for_lowercase=argument_default_value,
             debug=argument_default_value,
             dropout_rate=argument_default_value,
             experiment_name=argument_default_value,
             freeze_token_embeddings=argument_default_value,
             gradient_clipping_value=argument_default_value,
             learning_rate=argument_default_value,
             load_only_pretrained_token_embeddings=argument_default_value,
             load_all_pretrained_token_embeddings=argument_default_value,
             main_evaluation_mode=argument_default_value,
             maximum_number_of_epochs=argument_default_value,
             number_of_cpu_threads=argument_default_value,
             number_of_gpus=argument_default_value,
             optimizer=argument_default_value,
             output_folder=argument_default_value,
             patience=argument_default_value,
             plot_format=argument_default_value,
             reload_character_embeddings=argument_default_value,
             reload_character_lstm=argument_default_value,
             reload_crf=argument_default_value,
             reload_feedforward=argument_default_value,
             reload_token_embeddings=argument_default_value,
             reload_token_lstm=argument_default_value,
             remap_unknown_tokens_to_unk=argument_default_value,
             spacylanguage=argument_default_value,
             tagging_format=argument_default_value,
             token_embedding_dimension=argument_default_value,
             token_lstm_hidden_state_dimension=argument_default_value,
             token_pretrained_embedding_filepath=argument_default_value,
             tokenizer=argument_default_value,
             train_model=argument_default_value,
             use_character_lstm=argument_default_value,
             use_crf=argument_default_value,
             use_pretrained_model=argument_default_value,
             verbose=argument_default_value,
             argument_default_value=argument_default_value):

    # Parse arguments
    arguments = dict(
        (k, str(v)) for k, v in locals().items() if k != 'self')

    # Initialize parameters
    parameters, conf_parameters = self._load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = \
        self._get_valid_dataset_filepaths(parameters)
    self._check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

    # Launch session
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
        device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
        # automatically choose an existing and supported device to run the
        # operations in case the specified one doesn't exist
        allow_soft_placement=True,
        log_device_placement=False)
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        # Instantiate the model
        model = EntityLSTM(dataset, parameters)
        # Initialize the model, and restore from a pretrained model if needed
        sess.run(tf.global_variables_initializer())
        if not parameters['use_pretrained_model']:
            model.load_pretrained_token_embeddings(sess, dataset, parameters,
                                                   token_to_vector)
            self.transition_params_trained = np.random.rand(
                len(dataset.unique_labels) + 2,
                len(dataset.unique_labels) + 2)
        else:
            self.transition_params_trained = \
                model.restore_from_pretrained_model(
                    parameters, dataset, sess,
                    token_to_vector=token_to_vector)
        del token_to_vector
    self.dataset = dataset
    self.dataset_brat_folders = dataset_brat_folders
    self.dataset_filepaths = dataset_filepaths
    self.model = model
    self.parameters = parameters
    self.conf_parameters = conf_parameters
    self.sess = sess
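# With this signature, arguments left at the argument_default_value sentinel
# can be told apart from values the caller actually supplied, so only the
# supplied ones need to override what parameters.ini provides. A minimal
# sketch of that filtering (hypothetical helper, not part of the code above;
# note that arguments holds str()-converted values):
#
# def _supplied_arguments(arguments):
#     return {k: v for k, v in arguments.items()
#             if v != str(argument_default_value)}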