Beispiel #1
0
 def _create_stats_graph_folder(self, parameters):
     # Initialize stats_graph_folder
     experiment_timestamp = utils.get_current_time_in_miliseconds()
     dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
     model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
     utils.create_folder_if_not_exists(parameters['output_folder'])
     stats_graph_folder = os.path.join(parameters['output_folder'], model_name) # Folder where to save graphs
     utils.create_folder_if_not_exists(stats_graph_folder)
     return stats_graph_folder, experiment_timestamp
Beispiel #2
0
 def _create_stats_graph_folder(self, parameters):
     '''
     Tạo folder output/en để chứa các file sinh ra trong quá trình huấn luyện
     Trả về
     (đường dẫn đến folder output, timestamp của lần chạy)
     '''
     experiment_timestamp = utils.get_current_time_in_miliseconds()
     dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
     model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
     utils.create_folder_if_not_exists(parameters['output_folder'])
     stats_graph_folder = os.path.join(parameters['output_folder'], model_name) # Folder where to save graphs
     utils.create_folder_if_not_exists(stats_graph_folder)
     return stats_graph_folder, experiment_timestamp
Beispiel #3
0
def main():

    parameters, conf_parameters = load_parameters()
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
            )

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder=os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                conf_parameters.write(parameters_file)
            tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in dataset_filepaths.keys():
                tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
            pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in dataset_filepaths.keys():
                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8')
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
            character_list_file.close()


            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    step = 0
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    elif epoch_number != 0:
                        # Train model: loop over all sequences of training set with shuffling
                        sequence_numbers=list(range(len(dataset.token_indices['train'])))
                        random.shuffle(sequence_numbers)
                        for sequence_number in sequence_numbers:
                            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                            step += 1
                            if step % 10 == 0:
                                print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    if parameters['use_pretrained_model'] and not parameters['train_model']:
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                        break

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    writers['train'].flush()
                    utils.copytree(writers['train'].get_logdir(), model_folder)


                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if  valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    else:
                        bad_counter += 1
                    print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']: break


            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            print('ok1')
            evaluate.save_results(results, stats_graph_folder)
            print('ok2')
        print('ok3')
        #sess.close() # release the session's resources
    print('ok4')
Beispiel #4
0
def main(argv=sys.argv):

    arguments = parse_arguments(argv[1:])

    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                model = EntityLSTM(dataset, parameters)

                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                patience_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep']
                )  #, reshape= True)  # defaults to saving all variables
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_corrector']:
                                parameters['use_corrector'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_corrector, y_true_for_corrector, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_corrector = True)
                                all_pred_indices = {}  #defaultdict(list)
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(
                                            len(all_pred_labels[dataset_type])
                                    ):
                                        indices = [
                                            dataset.
                                            label_corrector_to_index[label]
                                            for label in
                                            all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(
                                            indices)

                                label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer(
                                )
                                label_binarizer_corrector.fit(
                                    range(
                                        max(dataset.index_to_label_corrector.
                                            keys()) + 1))
                                predicted_label_corrector_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_corrector_vector_indices[
                                        dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[
                                            dataset_type]:
                                        predicted_label_corrector_vector_indices[
                                            dataset_type].append(
                                                label_binarizer_corrector.
                                                transform(
                                                    label_indices_sequence))
                                parameters['use_corrector'] = True

                            transition_params_trained, model, glo_step = \
                                train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)

                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)
                            init_new_vars_op = tf.initialize_variables(
                                [glo_step])
                            sess.run(init_new_vars_op)

                        elif epoch_number != 0:
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1

                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)
                        if parameters['use_corrector']:
                            original_label_corrector_vector_indices = dataset.label_corrector_vector_indices
                            dataset.label_corrector_vector_indices = predicted_label_corrector_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_corrector_vector_indices = original_label_corrector_vector_indices
                        else:
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['micro']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['micro']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print(
                                'updated model to current epoch : epoch {:d}'.
                                format(epoch_number))
                            print('the model is saved in: {:s}'.format(
                                model_folder))
                            ### newly deleted
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and parameters[
                                    'refine_with_crf']:
                            model = train.refine_with_crf(
                                parameters, sess, model, model_saver)
                            print('refine model with CRF ...')

                            for additional_epoch in range(
                                    parameters['additional_epochs_with_crf']):
                                print('Additional {:d}th epoch'.format(
                                    additional_epoch))
                                sequence_numbers = list(
                                    range(len(dataset.token_indices['train'])))
                                random.shuffle(sequence_numbers)
                                for sequence_number in sequence_numbers:
                                    transition_params_trained, W_before_crf = train.train_step(
                                        sess, dataset, sequence_number, model,
                                        transition_params_trained, parameters)
                                    step += 1
                                epoch_elapsed_training_time = time.time(
                                ) - epoch_start_time
                                print(
                                    'Additional training completed in {0:.2f} seconds'
                                    .format(epoch_elapsed_training_time),
                                    flush=False)

                                y_pred, y_true, output_filepaths = train.predict_labels(
                                    sess, model, transition_params_trained,
                                    parameters, dataset, epoch_number,
                                    stats_graph_folder, dataset_filepaths)

                                evaluate.evaluate_model(
                                    results, dataset, y_pred, y_true,
                                    stats_graph_folder, epoch_number,
                                    epoch_start_time, output_filepaths,
                                    parameters)

                                summary = sess.run(model.summary_op,
                                                   feed_dict=None)
                                writers['train'].add_summary(
                                    summary, epoch_number)
                                writers['train'].flush()
                                utils.copytree(writers['train'].get_logdir(),
                                               model_folder)

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    for k, l in f1_scores.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    sess.close()
Beispiel #5
0
def main(argv=sys.argv):
    ''' NeuroNER main method

    Args:
        parameters_filepath the path to the parameters file
        output_folder the path to the output folder
    '''
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():

                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # saving the parameter setting to the output model dir. For later resuming training
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                patience_counter = 0
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                f1_scores_conll = {
                    'train-F1': [],
                    'valid-F1': [],
                    'test-F1': []
                }
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        # use pre-trained model and epoch_number = 0
                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_adapter']:
                                parameters['use_adapter'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_adapter, y_true_for_adapter, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_adapter=True)
                                # use the label2idx mapping (for adapter) in the dataset to transform all_pred_labels
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(
                                            len(all_pred_labels[dataset_type])
                                    ):
                                        indices = [
                                            dataset.
                                            label_adapter_to_index[label]
                                            for label in
                                            all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(
                                            indices)

                                # and use binarizer to transform to ndarray
                                label_binarizer_adapter = sklearn.preprocessing.LabelBinarizer(
                                )
                                label_binarizer_adapter.fit(
                                    range(
                                        max(dataset.index_to_label_adapter.
                                            keys()) + 1))
                                predicted_label_adapter_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_adapter_vector_indices[
                                        dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[
                                            dataset_type]:
                                        predicted_label_adapter_vector_indices[
                                            dataset_type].append(
                                                label_binarizer_adapter.
                                                transform(
                                                    label_indices_sequence))
                                parameters['use_adapter'] = True

                            if parameters['train_model'] and parameters[
                                    'add_class']:
                                transition_params_trained, model, glo_step = \
                                    train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess,
                                                                                         model, model_saver)
                                init_new_vars_op = tf.initialize_variables(
                                    [glo_step])
                                sess.run(init_new_vars_op)
                            else:
                                transition_params_trained = \
                                    train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)

                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)

                        # epoch_number != 0, no matter use or not use pre-trained model
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)

                        if parameters[
                                'use_adapter']:  # model evaluation, using adapter
                            # pass the pred_for_adapter as label_indices vector
                            original_label_adapter_vector_indices = dataset.label_adapter_vector_indices
                            dataset.label_adapter_vector_indices = predicted_label_adapter_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_adapter_vector_indices = original_label_adapter_vector_indices

                        else:  # model evaluation,  not using adapter
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['weighted']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['weighted']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['weighted']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        train_f1_score_conll = results['epoch'][epoch_number][
                            0]['train']['f1_conll']['micro']
                        valid_f1_score_conll = results['epoch'][epoch_number][
                            0]['valid']['f1_conll']['micro']
                        test_f1_score_conll = results['epoch'][epoch_number][
                            0]['test']['f1_conll']['micro']
                        f1_scores_conll['train-F1'].append(
                            train_f1_score_conll)
                        f1_scores_conll['valid-F1'].append(
                            valid_f1_score_conll)
                        f1_scores_conll['test-F1'].append(test_f1_score_conll)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print(
                                'updated model to current epoch : epoch {:d}'.
                                format(epoch_number))
                            print('the model is saved in: {:s}'.format(
                                model_folder))
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            # save last model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'last_model.ckpt'))
                            print('the last model is saved in: {:s}'.format(
                                model_folder))

                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'

                    print('Sklearn result:')
                    for k, l in f1_scores.items():
                        print(k, l)

                    print('Conll result:')
                    for k, l in f1_scores_conll.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                    # TODO: in step 1, for task a, add the best deploy data to step 2 train set, and call script
                    print('(sklearn micro) test F1:')
                    micro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['micro']) for ep in range(epoch_number + 1)
                    ])
                    print(micro_f1)
                    print('(sklearn macro) test F1:')
                    macro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['macro']) for ep in range(epoch_number + 1)
                    ])
                    print(macro_f1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    sess.close()  # release the session's resources
Beispiel #6
0
def main():

    parameters, dataset_filepaths = load_parameters()

    # Load dataset
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            allow_soft_placement=
            True,  #  automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                  'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in ['train', 'valid', 'test']:
                tensorboard_log_folders[dataset_type] = os.path.join(
                    stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(
                    tensorboard_log_folders[dataset_type])

            pickle.dump(
                dataset,
                open(os.path.join(stats_graph_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in ['train', 'valid', 'test']:
                writers[dataset_type] = tf.summary.FileWriter(
                    tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(
                model_folder
            )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(
                token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
            )
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(
                model_folder,
                'tensorboard_metadata_characters.tsv')  #  'metadata.tsv'
            tensorboard_character_embeddings.metadata_path = os.path.relpath(
                character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer,
                                           embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = open(token_list_file_path, 'w')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(
                    dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = open(character_list_file_path, 'w')
            print('len(dataset.character_to_index): {0}'.format(
                len(dataset.character_to_index)))
            print('len(dataset.index_to_character): {0}'.format(
                len(dataset.index_to_character)))
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(
                        dataset.index_to_character[character_index]))
            character_list_file.close()

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            step = 0
            bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs']
            )  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    #epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    y_pred = {}
                    y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type:     {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        y_pred[dataset_type], y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
#                         model_options = None

                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                            stats_graph_folder, epoch_number,
                                            epoch_start_time, output_filepaths,
                                            parameters)

                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(model_folder,
                                     'model_{0:05d}.ckpt'.format(epoch_number))
                    )  #, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state)

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1

                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break


#                     break # debugging

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources
Beispiel #7
0
def main():


    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.','parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    parameters = {}
    for k,v in nested_parameters.items():
        parameters.update(v)
    for k,v in parameters.items():
        if k in ['remove_unknown_tokens','character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension','token_lstm_hidden_state_dimension',
                 'patience','maximum_number_of_epochs','maximum_training_time','number_of_cpu_threads','number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm','is_character_lstm_bidirect','is_token_lstm_bidirect','use_crf']:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test']  = os.path.join(parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)


    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
          device_count={'CPU': 1, 'GPU': 1},
          allow_soft_placement=True, #  automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
          log_device_placement=False
          )

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            model = EntityLSTM(dataset, parameters)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            if parameters['optimizer'] == 'adam':
                optimizer = tf.train.AdamOptimizer(1e-3)
            elif parameters['optimizer'] == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(0.005)
            else:
                raise ValueError("The lr_method parameter must be either adam or sgd.")

            # https://github.com/google/prettytensor/issues/6
            # https://www.tensorflow.org/api_docs/python/framework/graph_collections

            #print('tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) ))

            # https://github.com/blei-lab/edward/issues/286#ref-pullrequest-181330211 : utility function to get all tensorflow variables a node depends on


            grads_and_vars = optimizer.compute_gradients(model.loss)

            # By defining a global_step variable and passing it to the optimizer we allow TensorFlow handle the counting of training steps for us.
            # The global step will be automatically incremented by one every time you execute train_op.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pretrained token embeddings
            if not parameters['token_pretrained_embedding_filepath'] == '':
                load_token_embeddings(sess, model.W, dataset, parameters)


            estop = False  # early stop
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            # save/initialize execution details
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder']) #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder=os.path.join('..', 'output')
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(output_folder)
            print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(len(dataset.unique_labels),len(dataset.unique_labels))
            try:
                while True:
                    epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('epoch_number: {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers=list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train_step(sess, dataset, sequence_number, train_op, global_step, model, transition_params_trained, parameters)
                        step += 1
                        if sequence_number % 100 == 0:
                            print('.',end='', flush=True)
                            #break

                    # Evaluate model
                    print('step: {0}'.format(step))
                    all_predictions = {}
                    all_y_true  = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        print('dataset_type:     {0}'.format(dataset_type))
                        all_predictions[dataset_type], all_y_true[dataset_type], output_filepaths[dataset_type] = evaluate_model(sess, dataset, dataset_type, model, transition_params_trained, step, stats_graph_folder, epoch_number, parameters)
                        model_options = None

                    # Save and plot results
                    # TODO: remove uidx
                    uidx = 0
                    results['epoch'][epoch_number] = []
                    results['execution_details']['num_epochs'] = epoch_number

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:02f} seconds'.format(epoch_elapsed_training_time))

                    assess_model.assess_and_save(results, dataset, model_options, all_predictions, all_y_true, stats_graph_folder, epoch_number, uidx, epoch_start_time)
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score')
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score')

                    # CoNLL evaluation script
                    for dataset_type in ['train', 'valid', 'test']:
                        conll_evaluation_script = os.path.join('.', 'conlleval')
                        conll_output_filepath = '{0}_conll_evaluation.txt'.format(output_filepaths[dataset_type])
                        shell_command = 'perl {0} < {1} > {2}'.format(conll_evaluation_script, output_filepaths[dataset_type], conll_output_filepath)
                        print('shell_command: {0}'.format(shell_command))
                        #subprocess.call([shell_command])
                        os.system(shell_command)
                        conll_parsed_output = utils_nlp.get_parsed_conll_output(conll_output_filepath)
                        print('conll_parsed_output: {0}'.format(conll_parsed_output))
                        results['epoch'][epoch_number][0][dataset_type]['conll'] = conll_parsed_output
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll'] = {}
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll']['micro'] = results['epoch'][epoch_number][0][dataset_type]['conll']['all']['f1']
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', from_json=False)

                    #end_time = time.time()
                    #results['execution_details']['train_duration'] = end_time - start_time
                    #results['execution_details']['train_end'] = end_time

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if  valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1


                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']: break

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
        #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            assess_model.save_results(results, stats_graph_folder)

    sess.close() # release the session's resources
Beispiel #8
0
def main():

    parameters, conf_parameters = load_parameters()
    pprint(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    cross_validation = parameters[
        'cross_validation'] if 'cross_validation' in parameters else 1
    valid_fscores = []
    valid_precisions = []
    valid_recalls = []
    for cv in range(0, cross_validation):
        if "als" in dataset_filepaths['train'] and cross_validation > 1:
            train_files = list(range(0, cv)) + list(
                range(cv + 1, cross_validation))
            test_file = cv
            file_train = "tmp_combined.train"
            file_valid = "tmp_combined.test"
            output = []
            for i in train_files:
                with open(dataset_filepaths['train'] + "_" + str(i),
                          "r",
                          encoding="utf-8") as file:
                    output.append(file.read())
            with open(file_train, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            output = []
            with open(dataset_filepaths['train'] + "_" + str(test_file),
                      "r",
                      encoding="utf-8") as file:
                output.append(file.read())
            with open(file_valid, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            dataset_filepaths['train'] = file_train
            dataset_filepaths['valid'] = file_valid
        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=
                True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)

            session_conf.gpu_options.allow_growth = True

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_train'])
                if 'data_to_use' in parameters:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name + "_small",
                        results['execution_details']['time_stamp'])
                else:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name,
                        results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                #del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(
                        dataset,
                        open(os.path.join(model_folder, 'dataset.pickle'),
                             'wb'))
                #dataset.load_pretrained_word_embeddings(parameters)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder
                )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                    )
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path,
                                                      'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(
                                dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = 0
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels), len(dataset.unique_labels)
                    )  #TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None
                    )  #parameters['maximum_number_of_epochs'])  # defaults to saving all variables
                    epoch_number = 0

                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training",
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ",
                                          sub_id)

                                    #model_saver.save(sess,
                                    #                 os.path.join(model_folder, 'model_{0:05d}_{1}.ckpt'.format(epoch_number, len(sequence_numbers)/4/len(sequence_numbers))))
                                    epoch_elapsed_training_time = time.time(
                                    ) - epoch_start_time
                                    print(
                                        'Training completed in {0:.2f} seconds'
                                        .format(epoch_elapsed_training_time),
                                        flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset,
                                        epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)

                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(
                                            model_folder,
                                            'model_{0:07.3f}.ckpt'.format(
                                                epoch_number + sub_id)))

                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op,
                                                       feed_dict=None)
                                    writers['train'].add_summary(
                                        summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(
                                        writers['train'].get_logdir(),
                                        model_folder)

                                    # Early stop
                                    valid_f1_score = results['epoch'][
                                        epoch_number][0]['valid']['f1_score'][
                                            'micro']
                                    # valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                                    # valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']

                                    # valid_fscores.append(valid_f1_score)
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                        # previous_best_valid_precision = valid_precision
                                        # previous_best_valid_recall = valid_recall
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[
                                    i:i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        #valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                        #valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']

                        #valid_fscores.append(valid_f1_score)
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            #previous_best_valid_precision = valid_precision
                            #previous_best_valid_recall = valid_recall
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details'][
                            'train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    logging.exception("")
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")

        sess.close()  # release the session's resources
        if 'cross_validation' in parameters and parameters[
                'cross_validation'] > 1:
            valid_fscores.append(previous_best_valid_f1_score)
            #valid_precisions.append(previous_best_valid_precision)
            #valid_recalls.append(previous_best_valid_recall)
    if 'cross_validation' in parameters and parameters['cross_validation'] > 1:
        print("mean f1score:", np.mean(valid_fscores))
        #print("mean precision:", np.mean(valid_precisions))
        #print("mean recall:", np.mean(valid_recalls))
        with codecs.open(os.path.join(stats_graph_folder, "result_cv.txt"),
                         "w") as file:
            file.write("F1score " + ", ".join(map(str, valid_fscores)))
            # file.write("Precision " + valid_precisions)
            # file.write("Recall " + valid_recalls)
            file.write("Mean F1score " + str(np.mean(valid_fscores)))
Beispiel #9
0
def main(languages):
    #embeddings_type = ['polyglot', 'fasttext']
    #embeddings_type = ['fasttext', 'fasttext_noOOV']
    embeddings_type = ['fasttext_noOOV']
    character_lstm = [True]
    embedding_language = ['target', 'source']
    combination = product(languages, embeddings_type, embedding_language, character_lstm)
    create_folder_if_not_exists(os.path.join("..", "log"))
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    log_file = os.path.join("..", "log", "experiment-{}.log".format(experiment_timestamp))

    for language, emb_type, emb_language, char_lstm in combination:
        conf_parameters = load_parameters()
        conf_parameters = set_datasets(conf_parameters, language)
        conf_parameters.set('ann','use_character_lstm', str(char_lstm))
        conf_parameters.set('ann','embedding_type', emb_type)
        conf_parameters.set('ann','embedding_language', emb_language)
        if emb_type == 'polyglot':
            conf_parameters.set('ann', 'embedding_dimension', str(64))
        elif 'fasttext' in emb_type:
            conf_parameters.set('ann', 'embedding_dimension', str(300))
        else:
            raise("Uknown embedding type")
        if emb_language == 'source':
            conf_parameters.set('dataset', 'language', constants.MAPPING_LANGUAGE[language])
        else:
            conf_parameters.set('dataset', 'language', language)
        parameters, conf_parameters = parse_parameters(conf_parameters)

        start_time = time.time()
        experiment_timestamp = utils.get_current_time_in_miliseconds()

        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        dataset_name = utils.get_basename_without_extension(parameters['dataset_train'])
        model_name = '{0}_{1}_{2}_{3}_{4}'.format(language, emb_type, char_lstm, emb_language,
                                                  results['execution_details']['time_stamp'])

        sys.stdout = open(os.path.join("..", "log", model_name), "w")
        print(language, emb_type, char_lstm, emb_language)

        with open(log_file, "a") as file:
            file.write("Experiment: {}\n".format(model_name))
            file.write("Start time:{}\n".format(experiment_timestamp))
            file.write("-------------------------------------\n\n")
        pprint(parameters)
        dataset_filepaths = get_valid_dataset_filepaths(parameters)
        check_parameter_compatiblity(parameters, dataset_filepaths)
        previous_best_valid_epoch = -1

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False
            )

            session_conf.gpu_options.allow_growth = True

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details

                print(model_name)
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs',
                                                                         dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                # del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))
                # dataset.load_pretrained_word_embeddings(parameters)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type],
                                                                  graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
                token_list_file.close()

                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path, 'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = -1
                    transition_params_trained = np.random.rand(len(dataset.unique_labels), len(
                        dataset.unique_labels))  # TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None)  # parameters['maximum_number_of_epochs'])  # defaults to saving all variables
                    epoch_number = 0

                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters['use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters,
                                                                                                             dataset,
                                                                                                             sess,
                                                                                                             model,
                                                                                                             model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers), parameters['batch_size']), "Training epoch {}".format(epoch_number),
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ", sub_id)
                                    epoch_elapsed_training_time = time.time() - epoch_start_time
                                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time),
                                          flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model,
                                                                                            transition_params_trained,
                                                                                            parameters, dataset,
                                                                                            epoch_number + sub_id,
                                                                                            stats_graph_folder,
                                                                                            dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder,
                                                            epoch_number, epoch_start_time, output_filepaths,
                                                            parameters)
                                    # Save model
                                    model_saver.save(sess, os.path.join(model_folder,
                                                                        'model_{0:07.3f}.ckpt'.format(
                                                                            epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op, feed_dict=None)
                                    writers['train'].add_summary(summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(writers['train'].get_logdir(), model_folder)
                                    # Early stop
                                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[i: i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(sess, dataset, sequence_number,
                                                                                   model, transition_params_trained,
                                                                                   parameters)
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained,
                                                                                parameters, dataset, epoch_number,
                                                                                stats_graph_folder, dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number,
                                                epoch_start_time, output_filepaths, parameters)

                        # Save model
                        model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            previous_best_valid_epoch = epoch_number
                        else:
                            bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(
                            bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters['maximum_number_of_epochs']: break

                    keep_only_best_model(model_folder,previous_best_valid_epoch ,parameters['maximum_number_of_epochs']+1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details']['train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                    sys.stdout.close()
                except Exception:
                    logging.exception("")
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    sys.stdout.close()

            sess.close()  # release the session's resources
            sys.stdout.close()
Beispiel #10
0
def main(args):
    experiments = utils.load_experiments()

    #parameters, conf_parameters = load_parameters2()
    #if args.file:
    #    parameters['predict_text'] = args.file
    #parameters = process_input(parameters)

    #if not parameters["use_pretrained_model"]:
    #    raise IOError('Set use_pretrained_model parameter to True if you want to predict')
    #dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)
    pprint(experiments)
    time_stamp = utils.get_current_time_in_miliseconds()
    result_file = '{0}_{1}'.format(args.experiment_set + "_" + "results_",
                                   time_stamp + ".txt")
    print(result_file)
    with open(os.path.join("../predictions",result_file), "w", encoding="utf-8") as file:
        for elem in experiments['experiments'][args.experiment_set]:
            trained_model = elem[0]
            test = elem[1]
            print("======================")
            print("Train on {0}, test {1}".format(trained_model,test))
            print("======================")

            pretrained_model_folder = os.path.dirname(experiments['models'][trained_model])
            dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))

            parameters, conf_parameters = load_parameters(os.path.join(pretrained_model_folder, 'parameters.ini'), verbose=False)
            parameters['train_model'] = False
            parameters['use_pretrained_model'] = True
            parameters['dataset_predict'] = experiments['datasets'][test]
            parameters['pretrained_model_name'] = "{0}_on_{1}".format(trained_model,test)
            parameters['pretrained_model_checkpoint_filepath'] = experiments['models'][trained_model]
            dataset_filepaths = get_valid_dataset_filepaths(parameters)
            pprint(parameters)
            #sys.exit()

            #if args.file:
            #    parameters['predict_text'] = args.file
            #parameters = process_input(parameters)
            # Load dataset
            #dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
            #dataset.load_vocab_word_embeddings(parameters)

            #pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
            #dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
            #dataset.load_dataset(dataset_filepaths, parameters)
            dataset_type = "predict"
            dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _  = dataset._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'])
            #dataset.load_vocab_word_embeddings(parameters)
            iteration_number = 0
            dataset.token_to_index = dict()
            dataset.number_of_unknown_tokens = 0
            for token_sentence in tqdm(dataset.tokens['predict']):
                for token in token_sentence:
                    if iteration_number == dataset.UNK_TOKEN_INDEX: iteration_number += 1
                    if iteration_number == dataset.PADDING_TOKEN_INDEX: iteration_number += 1
                    if token == "CD":
                        a=1
                    if not utils_nlp.is_token_in_pretrained_embeddings(token, dataset.vocab_embeddings, parameters):
                        if parameters['embedding_type'] == 'fasttext':
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1
                        else:
                            dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                            dataset.number_of_unknown_tokens += 1
                            dataset.tokens_mapped_to_unk.append(token)
                    else:
                        if token not in dataset.token_to_index:
                            dataset.token_to_index[token] = iteration_number
                            iteration_number += 1

            dataset_type = "predict"


            for dataset_type in dataset_filepaths.keys():
                dataset.token_indices[dataset_type] = []
                dataset.characters[dataset_type] = []
                dataset.character_indices[dataset_type] = []
                dataset.token_lengths[dataset_type] = []
                dataset.sequence_lengths[dataset_type] = []
                dataset.longest_token_length_in_sequence[dataset_type] = []
                # character_indices_padded[dataset_type] = []
                for token_sequence in dataset.tokens[dataset_type]:
                    dataset.token_indices[dataset_type].append([dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX) for token in token_sequence])
                    dataset.characters[dataset_type].append([list(token) for token in token_sequence])
                    dataset.character_indices[dataset_type].append(
                        [[dataset.character_to_index.get(character,dataset.UNK_CHARACTER_INDEX) for character in token] for token in token_sequence])
                    dataset.token_lengths[dataset_type].append([len(token) for token in token_sequence])
                    dataset.sequence_lengths[dataset_type].append(len(token_sequence))
                    dataset.longest_token_length_in_sequence[dataset_type].append(max(dataset.token_lengths[dataset_type][-1]))

                    # character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
                    #                                                for temp_token_indices in character_indices[dataset_type][-1]])

                dataset.label_indices[dataset_type] = []
                for label_sequence in dataset.labels[dataset_type]:
                    dataset.label_indices[dataset_type].append([dataset.label_to_index[label] for label in label_sequence])

            tmp_vector = [0] * len(dataset.unique_labels)
            tmp_vector[dataset.label_to_index["O"]] = 1
            dataset.PADDING_LABEL_VECTOR = tmp_vector
            for dataset_type in dataset_filepaths.keys():
                dataset.label_vector_indices[dataset_type] = []
                for label_indices_sequence in dataset.label_indices[dataset_type]:
                    vector_sequence = []
                    for indice in label_indices_sequence:
                        vector = [0] * len(dataset.unique_labels)
                        vector[indice] = 1
                        vector_sequence.append(vector)
                    dataset.label_vector_indices[dataset_type].append(vector_sequence)

            # Create graph and session
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                    allow_soft_placement=True,
                    # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                    log_device_placement=False,
                )
                session_conf.gpu_options.allow_growth = True


                sess = tf.Session(config=session_conf)

                model = EntityLSTM(dataset, parameters)
                model_saver = tf.train.Saver()

                prediction_folder = os.path.join('..', 'predictions')
                utils.create_folder_if_not_exists(prediction_folder)
                dataset_name = parameters['pretrained_model_name']
                model_name = '{0}_{1}'.format(dataset_name,
                                              time_stamp)
                prediction_folder = os.path.join(prediction_folder, model_name)
                utils.create_folder_if_not_exists(prediction_folder)
                epoch_number = 100
                #dataset_name = utils.get_basename_without_extension(parameters['dataset_test'])
                with open(os.path.join(prediction_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                if parameters['use_pretrained_model']:
                    # Restore pretrained model parameters
                    transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    model.load_pretrained_token_embeddings(sess, dataset, parameters)
                    start_time = time.time()
                    results = {}
                    results['epoch'] = {}
                    results['execution_details'] = {}
                    results['execution_details']['train_start'] = start_time
                    results['execution_details']['time_stamp'] = start_time
                    results['execution_details']['early_stop'] = False
                    results['execution_details']['keyboard_interrupt'] = False
                    results['execution_details']['num_epochs'] = epoch_number
                    results['model_options'] = copy.copy(parameters)
                    demo = parameters['pretrained_model_name'] == "demo"
                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, prediction_folder, dataset_filepaths, demo=demo)
                    conll_output_file = evaluate.evaluate_model(results, dataset, y_pred, y_true, prediction_folder, epoch_number,
                                           start_time , output_filepaths, parameters)


                    file.write(parameters['pretrained_model_name'] + "\n")
                    with open(conll_output_file, "r") as conll_file:
                        conll = conll_file.read()
                    file.write(conll)
                    file.write("\n\n\n")
                    if parameters['pretrained_model_name'] == "demo":
                        print("============")
                        print(" Prediction ")
                        print("============")
                        i = 0
                        for sentence in dataset.tokens['predict']:
                            for token in sentence:
                                predict_label = dataset.index_to_label[y_pred['predict'][i]]
                                if dataset.index_to_label[y_pred['predict'][i]] != "O":
                                    print(token,predict_label)
                                else:
                                    print(token)
                                i += 1
                            print("")
                else:
                    raise IOError('Set use_pretrained_model parameter to True')
Beispiel #11
0
def main():
    file_params = 'parameters_yelp_50k.ini'
    if len(sys.argv) > 1 and '.ini' in sys.argv[1]:
        file_params = sys.argv[1]

    # Load config
    parameters, conf_parameters = load_parameters(
        parameters_filepath=os.path.join('.', file_params))
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)

    if parameters['seed'] != -1:
        random.seed(parameters['seed'])

    # Create annotator
    annotator = stanford_corenlp_pywrapper.CoreNLP(
        configdict={
            'annotators': 'tokenize, ssplit',
            'ssplit.eolonly': True
        },
        corenlp_jars=[parameters['stanford_folder'] + '/*'])
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters, annotator)

    # Adapt train/valid/test to be multiple of batch_size
    for size in ['train_size', 'valid_size', 'test_size']:
        if parameters[size] % parameters['batch_size'] != 0:
            parameters[size] = int(
                parameters[size] /
                parameters['batch_size']) * parameters['batch_size']
            print('Changed {}'.format(size))

    # Set GPU device if more GPUs are specified
    if parameters['number_of_gpus'] > 1 and parameters['gpu_device'] != -1:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = parameters['gpu_device']

    # GPUs
    print(device_lib.list_local_devices())
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={
                'CPU': 1,
                'GPU': parameters['number_of_gpus']
            },
            allow_soft_placement=
            True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            if parameters['seed'] != -1:
                tf.set_random_seed(parameters['seed'])

                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()

                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_folder'])
                model_name = '{0}_{1}'.format(
                    dataset_name, results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)

                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)

                with open(os.path.join(model_folder, file_params),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = SelfSent(dataset, parameters)

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                bad_counter = 0  # number of epochs with no improvement on the validation test
                previous_best_valid_accuracy = 0
                previous_best_test_accuracy = 0
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['maximum_number_of_epochs']
                )  # defaults to saving all variables
                epoch_number = -1
                try:
                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:
                            # Restore pretrained model parameters
                            dataset = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model_saver)
                            dataset.load_deploy(
                                os.path.join(parameters['dataset_folder'],
                                             '{0}.json'.format('deploy')),
                                parameters, annotator)
                            y_pred, y_true, output_filepaths, attentions = train.predict_labels(
                                sess,
                                model,
                                parameters,
                                dataset,
                                epoch_number,
                                stats_graph_folder,
                                dataset_filepaths,
                                only_deploy=True)
                            y_pred = y_pred['deploy']

                            with open(
                                    output_filepaths['deploy']
                                [:output_filepaths['deploy'].rfind('/') + 1] +
                                    'attention.txt',
                                    'w',
                                    encoding='utf-8') as fp:
                                # Compute attention
                                tokens_with_attentions = []
                                for sample_id in range(len(y_pred)):
                                    attention = attentions[int(
                                        sample_id / parameters['batch_size'])][
                                            sample_id %
                                            parameters['batch_size']]
                                    # Remove padded dimension
                                    attention = attention[:dataset.
                                                          token_lengths[
                                                              'deploy']
                                                          [sample_id]]

                                    # Save current attention
                                    fp.write("{}\t{:05.2f}\t".format(
                                        y_pred[sample_id][0],
                                        y_pred[sample_id][1]))
                                    fp.write(' '.join(dataset.tokens['deploy']
                                                      [sample_id]) + '\t')
                                    fp.write(' '.join(
                                        [str(a)
                                         for a in attention.flatten()]) + '\n')

                                    # Sum over columns (we combine all the annotation vectors)
                                    attention = np.sum(attention, axis=1)
                                    # Normalize to sum at 1
                                    attention = attention / np.linalg.norm(
                                        attention)

                                    # Keep only high confidence
                                    if y_pred[sample_id][1] >= parameters[
                                            'attention_visualization_conf']:
                                        tokens_with_attentions.append(
                                            (y_pred[sample_id][0],
                                             y_pred[sample_id][1],
                                             dataset.tokens['deploy']
                                             [sample_id], attention))

                            # Plot attention
                            utils_plots.visualize_attention(
                                tokens_with_attentions, dataset.unique_labels,
                                output_filepaths['deploy']
                                [:output_filepaths['deploy'].rfind('/') + 1],
                                parameters['attention_visualization_conf'])
                            break
                        elif epoch_number != 0:
                            total_loss, total_accuracy = train.train_step(
                                sess, dataset, model, parameters)
                            print('Mean loss: {:.2f}\tMean accuracy: {:.2f}'.
                                  format(np.mean(total_loss),
                                         100.0 * np.mean(total_accuracy)),
                                  flush=True)

                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths, _ = train.predict_labels(
                            sess, model, parameters, dataset, epoch_number,
                            stats_graph_folder, dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Early stop
                        valid_accuracy = results['epoch'][epoch_number][0][
                            'valid']['accuracy_score']
                        if valid_accuracy > previous_best_valid_accuracy:
                            bad_counter = 0
                            previous_best_valid_accuracy = valid_accuracy
                            previous_best_test_accuracy = results['epoch'][
                                epoch_number][0]['test']['accuracy_score']
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))
                        print("Best valid with test performances in epoch " +
                              str(epoch_number - bad_counter) +
                              ": {:05.2f}%\t{:05.2f}%".format(
                                  previous_best_valid_accuracy,
                                  previous_best_test_accuracy))
                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)

            sess.close()  # release the session's resources
Beispiel #12
0
def main(args):
    parameters, conf_parameters = load_parameters()
    if args.file:
        parameters['predict_text'] = args.file
    parameters = process_input(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_vocab_word_embeddings(parameters)

    pretrained_model_folder = os.path.dirname(parameters['pretrained_model_checkpoint_filepath'])
    dataset = pickle.load(open(os.path.join(pretrained_model_folder, 'dataset.pickle'), 'rb'))
    dataset.load_dataset(dataset_filepaths, parameters)
    dataset_type = "predict"
    dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _  = dataset._parse_dataset(dataset_filepaths.get(dataset_type, None), parameters['language'])
    #dataset.load_vocab_word_embeddings(parameters)
    iteration_number = 0
    dataset.token_to_index = dict()
    for token_sentence in dataset.tokens['predict']:
        for token in token_sentence:
            if iteration_number == dataset.UNK_TOKEN_INDEX: iteration_number += 1
            if iteration_number == dataset.PADDING_TOKEN_INDEX: iteration_number += 1

            if not utils_nlp.is_token_in_pretrained_embeddings(token, dataset.vocab_embeddings, parameters):
                if parameters['embedding_type'] == 'glove':
                    dataset.token_to_index[token] =  dataset.UNK_TOKEN_INDEX
                    dataset.number_of_unknown_tokens += 1
                    dataset.tokens_mapped_to_unk.append(token)
                elif parameters['embedding_type'] == 'fasttext':
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
                else:
                    raise AssertionError("Embedding type not recognized")
            else:
                if token not in dataset.token_to_index:
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1

    dataset_type = "predict"
    for dataset_type in dataset_filepaths.keys():
        dataset.token_indices[dataset_type] = []
        dataset.characters[dataset_type] = []
        dataset.character_indices[dataset_type] = []
        dataset.token_lengths[dataset_type] = []
        dataset.sequence_lengths[dataset_type] = []
        dataset.longest_token_length_in_sequence[dataset_type] = []
        # character_indices_padded[dataset_type] = []
        for token_sequence in dataset.tokens[dataset_type]:
            dataset.token_indices[dataset_type].append([dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX) for token in token_sequence])
            dataset.characters[dataset_type].append([list(token) for token in token_sequence])
            dataset.character_indices[dataset_type].append(
                [[dataset.character_to_index.get(character,dataset.UNK_CHARACTER_INDEX) for character in token] for token in token_sequence])
            dataset.token_lengths[dataset_type].append([len(token) for token in token_sequence])
            dataset.sequence_lengths[dataset_type].append(len(token_sequence))
            dataset.longest_token_length_in_sequence[dataset_type].append(max(dataset.token_lengths[dataset_type][-1]))

            # character_indices_padded[dataset_type].append([ utils.pad_list(temp_token_indices, longest_token_length_in_sequence, self.PADDING_CHARACTER_INDEX)
            #                                                for temp_token_indices in character_indices[dataset_type][-1]])

        dataset.label_indices[dataset_type] = []
        for label_sequence in dataset.labels[dataset_type]:
            dataset.label_indices[dataset_type].append([dataset.label_to_index[label] for label in label_sequence])

    tmp_vector = [0] * len(dataset.unique_labels)
    tmp_vector[dataset.label_to_index["O"]] = 1
    dataset.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        dataset.label_vector_indices[dataset_type] = []
        for label_indices_sequence in dataset.label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(dataset.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            dataset.label_vector_indices[dataset_type].append(vector_sequence)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,
            # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False,
        )
        session_conf.gpu_options.allow_growth = True


        sess = tf.Session(config=session_conf)

        model = EntityLSTM(dataset, parameters)
        model_saver = tf.train.Saver()

        prediction_folder = os.path.join('..', 'predictions')
        utils.create_folder_if_not_exists(prediction_folder)
        dataset_name = parameters['pretrained_model_name']
        model_name = '{0}_{1}'.format(parameters["language"] + "_" + dataset_name,
                                      utils.get_current_time_in_miliseconds())
        prediction_folder = os.path.join(prediction_folder, model_name)
        utils.create_folder_if_not_exists(prediction_folder)
        epoch_number = 100
        #dataset_name = utils.get_basename_without_extension(parameters['dataset_test'])
        with open(os.path.join(prediction_folder, 'parameters.ini'), 'w') as parameters_file:
            conf_parameters.write(parameters_file)

        if parameters['use_pretrained_model']:
            # Restore pretrained model parameters
            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            demo = parameters['pretrained_model_name'] == "demo"
            y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, prediction_folder, dataset_filepaths, demo=demo)

            if parameters['pretrained_model_name'] == "demo":
                print("============")
                print(" Prediction ")
                print("============")
                i = 0
                for sentence in dataset.tokens['predict']:
                    for token in sentence:
                        predict_label = dataset.index_to_label[y_pred['predict'][i]]
                        if dataset.index_to_label[y_pred['predict'][i]] != "O":
                            print(token,predict_label)
                        else:
                            print(token)
                        i += 1
                    print("")
        else:
            raise IOError('Set use_pretrained_model parameter to True')
Beispiel #13
0
def main():

    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(
        conf_parameters)
    parameters = {}
    for k, v in nested_parameters.items():
        parameters.update(v)
    for k, v in parameters.items():
        if k in [
                'remove_unknown_tokens', 'character_embedding_dimension',
                'character_lstm_hidden_state_dimension',
                'token_embedding_dimension',
                'token_lstm_hidden_state_dimension', 'patience',
                'maximum_number_of_epochs', 'maximum_training_time',
                'number_of_cpu_threads', 'number_of_gpus'
        ]:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in [
                'use_character_lstm', 'is_character_lstm_bidirect',
                'is_token_lstm_bidirect', 'use_crf'
        ]:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(
        parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(
        parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(parameters['dataset_text_folder'],
                                             'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            allow_soft_placement=
            True,  #  automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Instantiate model
            model = EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder']
            )  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            #print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            #             model_folder = os.path.join(stats_graph_folder, 'model')
            #             utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    epoch_number = math.floor(
                        step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type:     {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        all_predictions[dataset_type], all_y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
#                         model_options = None

                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, all_predictions,
                                            all_y_true, stats_graph_folder,
                                            epoch_number, epoch_start_time,
                                            output_filepaths)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1

                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break


#                     break # debugging

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources