Example #1
0
def output_brat(output_filepaths,
                dataset_brat_folders,
                stats_graph_folder,
                overwrite=False):
    """
    Write brat output for every dataset split that has an output file.

    The splits 'train', 'valid', 'test' and 'deploy' are visited in that
    order; a split that is not a key of output_filepaths is skipped. For
    each remaining split the folder <stats_graph_folder>/brat/<split> is
    created if needed and conll_to_brat is called for it.

    Args:
        output_filepaths: Mapping from split name to its output filepath.
            The same path is handed to conll_to_brat as both its first and
            its second argument.
        dataset_brat_folders: Mapping from split name to the brat folder
            passed as the third argument of conll_to_brat.
        stats_graph_folder: Folder under which 'brat/<split>' is created.
        overwrite: Forwarded unchanged to conll_to_brat.
    """
    for split in ('train', 'valid', 'test', 'deploy'):
        if split in output_filepaths:
            conll_filepath = output_filepaths[split]
            target_folder = os.path.join(stats_graph_folder, 'brat', split)
            utils.create_folder_if_not_exists(target_folder)
            conll_to_brat(conll_filepath,
                          conll_filepath,
                          dataset_brat_folders[split],
                          target_folder,
                          overwrite=overwrite)
Example #2
0
    def predict(self, test_file_path):
        """
        Run the model on the text file at test_file_path and write brat output.

        The file content is copied into the 'deploy' folder of
        parameters['dataset_text_folder'], the dataset is refreshed for the
        'deploy' split, labels are predicted and converted to brat, and the
        stats/graph folder is finally moved under '../data/'.

        Args:
            test_file_path (str): Path of the text file to annotate. Only the
                part after the last '/' is used as the name of the deploy
                file, after str.format() is applied to it with the
                zero-padded prediction count.

        Raises:
            AssertionError: If the text read back from the brat output
                differs from the text that was read from test_file_path.
        """
        # NOTE(review): the original author tagged this method "Not use" --
        # presumably it is not called anymore; confirm before relying on it.
        with open(test_file_path, "r") as source:
            text = source.read()
        file_name = test_file_path.rsplit('/', 1)[-1]
        self.prediction_count += 1

        if self.prediction_count == 1:
            # First call only: redirect the dataset folder and create the
            # folder that receives the prediction outputs.
            self.parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters)

        dataset_type = 'deploy'
        text_folder = self.parameters['dataset_text_folder']

        # Remove every file or folder left over from an earlier deployment.
        for stale_path in glob.glob(os.path.join(text_folder, dataset_type + '*')):
            if os.path.isdir(stale_path):
                shutil.rmtree(stale_path)
            else:
                os.remove(stale_path)

        # Write the text to predict into a fresh deploy folder.
        deploy_folder = os.path.join(text_folder, dataset_type)
        utils.create_folder_if_not_exists(deploy_folder)
        count_tag = str(self.prediction_count).zfill(5)
        deploy_filepath = os.path.join(deploy_folder, file_name.format(count_tag))
        with codecs.open(deploy_filepath, 'w', 'UTF-8') as target:
            target.write(text)

        # Register the deploy files and refresh the dataset for that split.
        new_filepaths, new_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(new_filepaths)
        self.dataset_brat_folders.update(new_brat_folders)
        self.dataset.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels; only the third element of the result is kept.
        prediction_output = train.prediction_step(
            self.sess, self.dataset, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)
        _, _, deploy_output_filepath = prediction_output
        output_filepaths = {dataset_type: deploy_output_filepath}
        conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders,
                                  self.stats_graph_folder, overwrite=True)

        # Read the brat output back and check that the text is unchanged.
        brat_deploy_folder = os.path.join(self.stats_graph_folder, 'brat', 'deploy')
        text_filepath = os.path.join(brat_deploy_folder,
                                     os.path.basename(deploy_filepath))
        annotation_name = '{0}.ann'.format(
            utils.get_basename_without_extension(deploy_filepath))
        annotation_filepath = os.path.join(brat_deploy_folder, annotation_name)
        text2, _ = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert(text == text2)

        # NOTE(review): self.stats_graph_folder is only assigned on the first
        # call and is not updated after this rename, so a second call would
        # presumably still reference the old location -- confirm.
        moved_folder = "../data/" + self.stats_graph_folder.split('/')[-1]
        os.rename(self.stats_graph_folder, moved_folder)
        print("Use brat tool to see result at ", moved_folder)
    def _create_stats_graph_folder(self, parameters):
        """
        Create a timestamped stats/graph folder and return its path.

        The folder is named '<dataset name>_<timestamp>', where the dataset
        name is the basename (without extension) of
        parameters['dataset_text_folder'] and the timestamp comes from
        utils.get_current_time_in_miliseconds(). It is created inside
        parameters['output_folder'], which is itself created if missing.

        Args:
            parameters: Mapping that provides the 'dataset_text_folder' and
                'output_folder' entries.

        Returns:
            tuple: (stats_graph_folder, experiment_timestamp).
        """
        timestamp = utils.get_current_time_in_miliseconds()
        run_name = '{0}_{1}'.format(
            utils.get_basename_without_extension(
                parameters['dataset_text_folder']),
            timestamp)

        output_root = parameters['output_folder']
        utils.create_folder_if_not_exists(output_root)

        # Folder where to save graphs
        run_folder = os.path.join(output_root, run_name)
        utils.create_folder_if_not_exists(run_folder)
        return run_folder, timestamp
Example #4
0
def prepare_pretrained_model_for_restoring(output_folder_name,
                                           epoch_number,
                                           model_name,
                                           delete_token_mappings=False):
    """
    Export the model saved at one epoch into ../trained_models/<model_name>.

    Files are read from ../output/<output_folder_name>/model and three things
    are written into the destination folder (created if missing):
      * dataset.pickle, produced by trim_dataset_pickle;
      * a copy of parameters.ini;
      * the checkpoint of the requested epoch: either rewritten by
        trim_model_checkpoint as 'model.ckpt' (delete_token_mappings=True),
        or copied file by file with the '_<epoch>' part removed from each
        file name (delete_token_mappings=False).

    Args:
        output_folder_name: Name of the run folder under ../output.
        epoch_number: Epoch whose checkpoint is exported; it is zero-padded
            to 5 digits to build the checkpoint file name.
        model_name: Name of the destination folder under ../trained_models.
        delete_token_mappings: Forwarded to trim_dataset_pickle and selects
            which of the two checkpoint paths described above is taken.
    """
    source_dir = os.path.join('..', 'output', output_folder_name, 'model')
    target_dir = os.path.join('..', 'trained_models', model_name)
    utils.create_folder_if_not_exists(target_dir)

    # trim and copy dataset.pickle
    trimmed_pickle = os.path.join(target_dir, 'dataset.pickle')
    trim_dataset_pickle(os.path.join(source_dir, 'dataset.pickle'),
                        trimmed_pickle,
                        delete_token_mappings=delete_token_mappings)

    # copy parameters.ini
    parameters_filepath = os.path.join(source_dir, 'parameters.ini')
    shutil.copy(parameters_filepath, target_dir)

    epoch_tag = str(epoch_number).zfill(5)
    checkpoint_name = 'model_{0}.ckpt'.format(epoch_tag)

    if not delete_token_mappings:
        # Plain copy of every checkpoint file, dropping the epoch suffix.
        epoch_suffix = '_' + epoch_tag
        pattern = os.path.join(source_dir, checkpoint_name + '*')
        for source_file in glob.glob(pattern):
            target_name = os.path.basename(source_file).replace(
                epoch_suffix, '')
            shutil.copyfile(source_file,
                            os.path.join(target_dir, target_name))
        return

    # Token mappings were removed: the checkpoint has to be trimmed as well.
    trim_model_checkpoint(parameters_filepath, trimmed_pickle,
                          os.path.join(source_dir, checkpoint_name),
                          os.path.join(target_dir, 'model.ckpt'))
Example #5
0
def xml_to_brat(input_folder, output_folder, overwrite=True):
    """
    Convert every '*.xml' file of input_folder into brat files.

    For each XML file (processed in sorted path order) the content of its
    <TEXT> element is written to '<output_folder>/<name>.txt' as UTF-8, and
    one entity is built per child of its first <TAGS> element from the
    'TYPE', 'text', 'start' and 'end' attributes. The entities are then
    handed to output_entities. The elapsed time is printed at the end.

    Args:
        input_folder: Folder containing the XML files; must exist.
        output_folder: Destination folder, created if missing.
        overwrite: When true, output_folder is deleted first (errors
            ignored). The flag is also forwarded to output_entities.

    Raises:
        AssertionError: If input_folder does not exist.
    """
    print('input_folder: {0}'.format(input_folder))
    assert os.path.exists(input_folder)
    started_at = time.time()
    if overwrite:
        shutil.rmtree(output_folder, ignore_errors=True)
    utils.create_folder_if_not_exists(output_folder)

    xml_filepaths = sorted(glob.glob(os.path.join(input_folder, '*.xml')))
    for xml_filepath in xml_filepaths:
        document_name = utils.get_basename_without_extension(xml_filepath)
        text_filepath = os.path.join(output_folder,
                                     '{0}.txt'.format(document_name))
        root = xml.etree.ElementTree.parse(xml_filepath).getroot()

        # Get text
        text = root.findtext('TEXT')
        with codecs.open(text_filepath, 'w', 'UTF-8') as text_file:
            text_file.write(text)

        # Get PHI tags; only the first <TAGS> element is read.
        tag_container = root.findall('TAGS')[0]
        entities = [
            {
                'label': tag.get('TYPE'),
                'text': tag.get('text'),
                'start': int(tag.get('start')),
                'end': int(tag.get('end')),
            }
            for tag in tag_container
        ]
        output_entities(output_folder,
                        document_name,
                        entities,
                        text_filepath,
                        text,
                        overwrite=overwrite)

    print("Time spent formatting: {0:.2f} seconds".format(
        time.time() - started_at))
def telegraph_link_from_zipy_site(link):
    """
    Fetch a page, extract its image and upload that image via `linking`.

    The page at `link` is downloaded, decoded as UTF-8 and fed to a
    ZipyParser. The image the parser exposes as `image_link` is downloaded
    to a temporary file, which is passed to linking.telegraph_link_media
    and then deleted.

    Args:
        link: URL of the page to parse.

    Returns:
        The value returned by linking.telegraph_link_media, or None when the
        parser found no image.
    """
    parser = ZipyParser()
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(link) as resp:
        parser.feed(resp.read().decode('utf-8'))

    image_link = parser.image_link
    if image_link is None:
        print('image not found')
        return None

    with urllib.request.urlopen(image_link) as img_resp:
        image_down = img_resp.read()

    tmp_media_path = '/tmp/il_shopping_bot'
    utils.create_folder_if_not_exists(tmp_media_path)
    tmp_path = os.path.join(tmp_media_path, 'anyf.jpg')

    try:
        with open(tmp_path, 'wb') as f:
            f.write(image_down)
        return linking.telegraph_link_media(tmp_path)
    finally:
        # Remove the temporary image even when writing or uploading fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
Example #7
0
def prepare_pretrained_model_for_restoring(output_folder_name, epoch_number, model_name, delete_token_mappings=False):
    '''
    Package a trained model for sharing: copy dataset.pickle, parameters.ini and the checkpoint files
    of one epoch from ../output/<output_folder_name>/model to ../trained_models/<model_name>,
    stripping the training data on the way.

    By default the dataset and labels are removed from dataset.pickle, so the only information about
    the dataset left in the pretrained model is the list of tokens it contains and the token
    embeddings learned from it.

    With delete_token_mappings = True, the index_to_token and token_to_index mappings are removed
    from dataset.pickle as well, and the matching token embeddings are removed from the checkpoint.
    The pretrained model then holds no information about the dataset used for training.

    When sharing a model with delete_token_mappings = True, it is highly recommended to train it with
    external pre-trained token embeddings kept frozen, in order to obtain high performance: set
    token_pretrained_embedding_filepath and freeze_token_embeddings = True in parameters.ini.
    '''
    source_folder = os.path.join('..', 'output', output_folder_name, 'model')
    destination_folder = os.path.join('..', 'trained_models', model_name)
    utils.create_folder_if_not_exists(destination_folder)

    # trim and copy dataset.pickle
    source_pickle = os.path.join(source_folder, 'dataset.pickle')
    destination_pickle = os.path.join(destination_folder, 'dataset.pickle')
    trim_dataset_pickle(source_pickle,
                        destination_pickle,
                        delete_token_mappings=delete_token_mappings)

    # copy parameters.ini
    parameters_filepath = os.path.join(source_folder, 'parameters.ini')
    shutil.copy(parameters_filepath, destination_folder)

    # (trim and) copy checkpoint files
    padded_epoch = str(epoch_number).zfill(5)
    if delete_token_mappings:
        source_checkpoint = os.path.join(source_folder,
                                         'model_{0}.ckpt'.format(padded_epoch))
        destination_checkpoint = os.path.join(destination_folder, 'model.ckpt')
        trim_model_checkpoint(parameters_filepath,
                              destination_pickle,
                              source_checkpoint,
                              destination_checkpoint)
    else:
        checkpoint_pattern = os.path.join(source_folder,
                                          'model_{0}.ckpt*'.format(padded_epoch))
        for checkpoint_file in glob.glob(checkpoint_pattern):
            # Drop the '_<epoch>' part so the copy is named 'model.ckpt...'.
            renamed = os.path.basename(checkpoint_file).replace('_' + padded_epoch, '')
            shutil.copyfile(checkpoint_file,
                            os.path.join(destination_folder, renamed))
Example #8
0
def main():
    """
    Train an EntityLSTM model and evaluate it after every epoch.

    Steps, in order:
      1. Read ./parameters.ini, flatten its sections into one dictionary and
         cast the known integer / float / boolean options.
      2. Load train.txt, valid.txt and test.txt from
         parameters['dataset_text_folder'].
      3. Build the model and the optimizer in a new TensorFlow graph/session.
      4. Loop over epochs: train on every training sequence in shuffled
         order, evaluate on the three splits, save/plot the results and run
         the CoNLL evaluation script.
      5. Stop on early stopping, on the epoch limit or on Ctrl-C, then save
         the results and close the session.

    Outputs are written under ../output/<dataset name>_<timestamp>.
    """


    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.','parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    # Flatten the per-section dictionaries; on duplicate keys the later section wins.
    parameters = {}
    for k,v in nested_parameters.items():
        parameters.update(v)
    # Cast the options that are not meant to stay strings.
    for k,v in parameters.items():
        if k in ['remove_unknown_tokens','character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension','token_lstm_hidden_state_dimension',
                 'patience','maximum_number_of_epochs','maximum_training_time','number_of_cpu_threads','number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm','is_character_lstm_bidirect','is_token_lstm_bidirect','use_crf']:
            # NOTE(review): distutils is deprecated (PEP 632) and removed in Python 3.12.
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test']  = os.path.join(parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)


    with tf.Graph().as_default():
        # NOTE(review): device counts are hard-coded here; the 'number_of_cpu_threads' and
        # 'number_of_gpus' options cast above are not used in this function.
        session_conf = tf.ConfigProto(
          device_count={'CPU': 1, 'GPU': 1},
          allow_soft_placement=True, #  automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
          log_device_placement=False
          )

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            model = EntityLSTM(dataset, parameters)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            if parameters['optimizer'] == 'adam':
                optimizer = tf.train.AdamOptimizer(1e-3)
            elif parameters['optimizer'] == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(0.005)
            else:
                # NOTE(review): the message names 'lr_method' but the option read above is 'optimizer'.
                raise ValueError("The lr_method parameter must be either adam or sgd.")

            # https://github.com/google/prettytensor/issues/6
            # https://www.tensorflow.org/api_docs/python/framework/graph_collections

            #print('tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) ))

            # https://github.com/blei-lab/edward/issues/286#ref-pullrequest-181330211 : utility function to get all tensorflow variables a node depends on


            grads_and_vars = optimizer.compute_gradients(model.loss)

            # By defining a global_step variable and passing it to the optimizer we allow TensorFlow handle the counting of training steps for us.
            # The global step will be automatically incremented by one every time you execute train_op.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pretrained token embeddings
            if not parameters['token_pretrained_embedding_filepath'] == '':
                load_token_embeddings(sess, model.W, dataset, parameters)


            # NOTE(review): estop is assigned but never read in this function.
            estop = False  # early stop
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            # save/initialize execution details
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder']) #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            # Output layout: ../output/<model_name>/ for stats and graphs, with a 'model' subfolder.
            output_folder=os.path.join('..', 'output')
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(output_folder)
            print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            # Random square matrix (labels x labels); replaced by the value train_step returns.
            transition_params_trained = np.random.rand(len(dataset.unique_labels),len(dataset.unique_labels))
            try:
                while True:
                    # step grows by one per training sequence, so this is the index of the current epoch.
                    epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('epoch_number: {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers=list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train_step(sess, dataset, sequence_number, train_op, global_step, model, transition_params_trained, parameters)
                        step += 1
                        # Progress indicator: one dot for every sequence whose index is a multiple of 100.
                        if sequence_number % 100 == 0:
                            print('.',end='', flush=True)
                            #break

                    # Evaluate model
                    print('step: {0}'.format(step))
                    all_predictions = {}
                    all_y_true  = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        print('dataset_type:     {0}'.format(dataset_type))
                        all_predictions[dataset_type], all_y_true[dataset_type], output_filepaths[dataset_type] = evaluate_model(sess, dataset, dataset_type, model, transition_params_trained, step, stats_graph_folder, epoch_number, parameters)
                        model_options = None

                    # Save and plot results
                    # TODO: remove uidx
                    uidx = 0
                    # NOTE(review): this list is indexed with [0] below although nothing is appended to it
                    # here -- presumably assess_model.assess_and_save fills it; confirm.
                    results['epoch'][epoch_number] = []
                    results['execution_details']['num_epochs'] = epoch_number

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    # NOTE(review): '02f' is zero-padded width 2 with the default precision; '.2f' may have been intended.
                    print('epoch_elapsed_training_time: {0:02f} seconds'.format(epoch_elapsed_training_time))

                    assess_model.assess_and_save(results, dataset, model_options, all_predictions, all_y_true, stats_graph_folder, epoch_number, uidx, epoch_start_time)
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score')
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score')

                    # CoNLL evaluation script
                    for dataset_type in ['train', 'valid', 'test']:
                        conll_evaluation_script = os.path.join('.', 'conlleval')
                        conll_output_filepath = '{0}_conll_evaluation.txt'.format(output_filepaths[dataset_type])
                        # NOTE(review): the paths are interpolated unquoted into a shell command, so a path
                        # containing spaces or shell metacharacters would break it.
                        shell_command = 'perl {0} < {1} > {2}'.format(conll_evaluation_script, output_filepaths[dataset_type], conll_output_filepath)
                        print('shell_command: {0}'.format(shell_command))
                        #subprocess.call([shell_command])
                        os.system(shell_command)
                        conll_parsed_output = utils_nlp.get_parsed_conll_output(conll_output_filepath)
                        print('conll_parsed_output: {0}'.format(conll_parsed_output))
                        results['epoch'][epoch_number][0][dataset_type]['conll'] = conll_parsed_output
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll'] = {}
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll']['micro'] = results['epoch'][epoch_number][0][dataset_type]['conll']['all']['f1']
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', from_json=False)

                    #end_time = time.time()
                    #results['execution_details']['train_duration'] = end_time - start_time
                    #results['execution_details']['train_end'] = end_time

                    # Early stop
                    # bad_counter counts consecutive epochs without a strictly better validation micro F1.
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if  valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1


                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    # NOTE(review): checked after the epoch has run and with a strict '>', so training
                    # continues through epoch index maximum_number_of_epochs + 1 -- confirm this is intended.
                    if epoch_number > parameters['maximum_number_of_epochs']: break

            except KeyboardInterrupt:
                # Ctrl-C ends the training loop; the results gathered so far are still saved below.
                results['execution_details']['keyboard_interrupt'] = True
        #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            assess_model.save_results(results, stats_graph_folder)

    sess.close() # release the session's resources
Example #9
0
import subprocess
from utils import create_folder_if_not_exists

# Launch the training runs as background shell processes. Each run writes its
# stdout and stderr to ./results/logs/<run name>; the process handles and the
# open log files are collected in `procs` and `log_files`.
procs = []
log_files = []
log_path = './results/logs/'
create_folder_if_not_exists(log_path)

# (training script, run name) pairs, launched in this order:
#   - regular state-dependent value network
#   - multi-value network (10 values)
experiments = [
    ('load_balance_actor_critic_train.py', 'regular_value_network'),
    ('load_balance_actor_multi_critic_train.py', '10_value_networks'),
]

for script, prefix in experiments:
    log_file = open(log_path + prefix, 'w')
    log_files.append(log_file)
    command = (
        'python3 ' + script + ' ' +
        '--num_workers 10 --service_rates 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95 1.05 ' +
        '--result_folder ./results/' + prefix + '/ ' +
        '--model_folder ./results/parameters/' + prefix + '/'
    )
    p = subprocess.Popen(command, stdout=log_file, stderr=log_file, shell=True)
    procs.append(p)
Example #10
0
    def predict(self, text):
        """
        Predict the entities of `text` and return them.

        The text is written to a 'temp_<count>.txt' file in the 'deploy'
        folder of parameters['dataset_text_folder'], the model data is
        refreshed for the 'deploy' split, labels are predicted and converted
        to brat, and the entities are read back from the brat output.

        Args:
            text (str): Text to annotate.

        Returns:
            The entities returned by brat_to_conll.get_entities_from_brat
            for the brat files produced by this prediction.

        Raises:
            AssertionError: If the text read back from the brat output
                differs from `text`.
        """
        self.prediction_count += 1
        if self.prediction_count == 1:
            # First call only: redirect the dataset folder and create the
            # folder that receives the prediction outputs.
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        dataset_type = 'deploy'
        text_folder = self.parameters['dataset_text_folder']

        # Remove every file or folder left over from an earlier deployment.
        for stale_path in glob.glob(
                os.path.join(text_folder, dataset_type + '*')):
            if os.path.isdir(stale_path):
                shutil.rmtree(stale_path)
            else:
                os.remove(stale_path)

        # Write the text to predict into a fresh deploy folder.
        deploy_folder = os.path.join(text_folder, dataset_type)
        utils.create_folder_if_not_exists(deploy_folder)
        count_tag = str(self.prediction_count).zfill(5)
        deploy_filepath = os.path.join(deploy_folder,
                                       'temp_{0}.txt'.format(count_tag))
        with codecs.open(deploy_filepath, 'w', 'UTF-8') as target:
            target.write(text)

        # Register the deploy files and refresh the model data for that split.
        new_filepaths, new_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(new_filepaths)
        self.dataset_brat_folders.update(new_brat_folders)
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels; only the third element of the result is kept.
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)
        _, _, deploy_output_filepath = prediction_output
        output_filepaths = {dataset_type: deploy_output_filepath}
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Read the brat output back and check that the text is unchanged.
        brat_deploy_folder = os.path.join(self.stats_graph_folder, 'brat',
                                          'deploy')
        text_filepath = os.path.join(brat_deploy_folder,
                                     os.path.basename(deploy_filepath))
        annotation_name = '{0}.ann'.format(
            utils.get_basename_without_extension(deploy_filepath))
        annotation_filepath = os.path.join(brat_deploy_folder,
                                           annotation_name)
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert (text == text2)
        return entities
Example #11
0
def main():
    """Train and evaluate the model, optionally over cross-validation folds.

    Loads the parameters and dataset file paths, then for every fold:
    loads the dataset, creates a TensorFlow graph and session, prepares the
    output/model/TensorBoard folders, and runs the train / predict /
    evaluate / checkpoint loop until early stopping (``patience``) or
    ``maximum_number_of_epochs`` is reached.

    When ``cross_validation`` > 1 and the training path contains ``"als"``,
    fold ``cv`` (file ``<train>_<cv>``) is held out as validation data and
    the remaining fold files are concatenated into a temporary training
    file. The best validation F1 of each fold is collected and written to
    ``result_cv.txt`` in the last fold's output folder.

    On ``KeyboardInterrupt`` or any other exception raised inside the
    training loop, the user is asked interactively whether the experiment
    folder should be removed.
    """

    parameters, conf_parameters = load_parameters()
    pprint(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    cross_validation = parameters[
        'cross_validation'] if 'cross_validation' in parameters else 1
    valid_fscores = []
    # Original training path, captured on the first fold. It must be kept
    # because dataset_filepaths['train'] is overwritten below with the
    # temporary combined file; without it every fold after the first would
    # skip the split and silently reuse the files of fold 0.
    base_train_filepath = None
    for cv in range(0, cross_validation):
        if base_train_filepath is None:
            base_train_filepath = dataset_filepaths['train']
        if "als" in base_train_filepath and cross_validation > 1:
            # Fold `cv` is held out; all other folds form the training set.
            train_files = list(range(0, cv)) + list(
                range(cv + 1, cross_validation))
            test_file = cv
            file_train = "tmp_combined.train"
            file_valid = "tmp_combined.test"
            output = []
            for i in train_files:
                with open(base_train_filepath + "_" + str(i),
                          "r",
                          encoding="utf-8") as file:
                    output.append(file.read())
            with open(file_train, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            output = []
            with open(base_train_filepath + "_" + str(test_file),
                      "r",
                      encoding="utf-8") as file:
                output.append(file.read())
            with open(file_valid, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            dataset_filepaths['train'] = file_train
            dataset_filepaths['valid'] = file_valid
        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                # automatically choose an existing and supported device to
                # run the operations in case the specified one doesn't exist
                allow_soft_placement=True,
                log_device_placement=False)

            session_conf.gpu_options.allow_growth = True

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_train'])
                if 'data_to_use' in parameters:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name + "_small",
                        results['execution_details']['time_stamp'])
                else:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name,
                        results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                if not parameters['use_pretrained_model']:
                    # Use a context manager so the pickle file is flushed
                    # and closed deterministically.
                    with open(os.path.join(model_folder, 'dataset.pickle'),
                              'wb') as dataset_file:
                        pickle.dump(dataset, dataset_file)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise
                # TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                    )
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                with codecs.open(token_list_file_path, 'w',
                                 'UTF-8') as token_list_file:
                    for token_index in range(len(dataset.index_to_token)):
                        token_list_file.write('{0}\n'.format(
                            dataset.index_to_token[token_index]))

                if parameters['use_character_lstm']:
                    with codecs.open(character_list_file_path, 'w',
                                     'UTF-8') as character_list_file:
                        for character_index in range(dataset.alphabet_size):
                            if character_index == dataset.PADDING_CHARACTER_INDEX:
                                character_list_file.write('PADDING\n')
                            else:
                                character_list_file.write('{0}\n'.format(
                                    dataset.index_to_character[
                                        character_index]))

                # Defined before the try block so the value appended to
                # valid_fscores after the session is always bound for this
                # fold, even if training fails before the first epoch.
                previous_best_valid_f1_score = 0

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels), len(dataset.unique_labels)
                    )  #TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    # max_to_keep=None keeps every checkpoint
                    model_saver = tf.train.Saver(max_to_keep=None)
                    epoch_number = 0

                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # epoch_number starts at 1 here, so this branch
                            # runs on every epoch that does not restore a
                            # pretrained model.
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training",
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    # Intermediate evaluation + checkpoint,
                                    # triggered each time data_counter
                                    # reaches 20000.
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ",
                                          sub_id)

                                    epoch_elapsed_training_time = time.time(
                                    ) - epoch_start_time
                                    print(
                                        'Training completed in {0:.2f} seconds'
                                        .format(epoch_elapsed_training_time),
                                        flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset,
                                        epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)

                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(
                                            model_folder,
                                            'model_{0:07.3f}.ckpt'.format(
                                                epoch_number + sub_id)))

                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op,
                                                       feed_dict=None)
                                    writers['train'].add_summary(
                                        summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(
                                        writers['train'].get_logdir(),
                                        model_folder)

                                    # Early stop
                                    valid_f1_score = results['epoch'][
                                        epoch_number][0]['valid']['f1_score'][
                                            'micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[
                                    i:i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details'][
                            'train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    # Log the traceback, then let the user decide whether the
                    # partially written experiment folder should be kept.
                    logging.exception("")
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")

        sess.close()  # release the session's resources
        if 'cross_validation' in parameters and parameters[
                'cross_validation'] > 1:
            valid_fscores.append(previous_best_valid_f1_score)
    if 'cross_validation' in parameters and parameters['cross_validation'] > 1:
        mean_f1_score = np.mean(valid_fscores)
        print("mean f1score:", mean_f1_score)
        # The summary is written into the output folder of the last fold.
        with codecs.open(os.path.join(stats_graph_folder, "result_cv.txt"),
                         "w") as file:
            # One entry per line; previously the two entries ran together
            # because no newline separated them.
            file.write("F1score " + ", ".join(map(str, valid_fscores)) + "\n")
            file.write("Mean F1score " + str(mean_f1_score) + "\n")
Example #12
0
def main():
    """Train and evaluate the model, writing brat output for the best epoch.

    Loads the parameters and dataset, creates a TensorFlow graph and
    session, prepares the output/model/TensorBoard folders, then runs the
    epoch loop. Epoch 0 performs no training: it either restores a
    pretrained model or only evaluates the freshly initialized one. When a
    pretrained model is used without ``train_model``, the predictions are
    written as brat files once and the loop stops. Otherwise the loop runs
    until early stopping (``patience``) or ``maximum_number_of_epochs``,
    and the results are saved at the end, also after a
    ``KeyboardInterrupt``.
    """

    parameters, conf_parameters = load_parameters()
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
            )

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder=os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                conf_parameters.write(parameters_file)
            tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in dataset_filepaths.keys():
                tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
            # Use a context manager so the pickle file is flushed and closed
            # deterministically instead of relying on garbage collection.
            with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
                pickle.dump(dataset, dataset_file)

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in dataset_filepaths.keys():
                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

            # Write metadata for TensorBoard embeddings; the context managers
            # close the files even if a write fails part-way.
            with codecs.open(token_list_file_path, 'w', 'UTF-8') as token_list_file:
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))

            with codecs.open(character_list_file_path, 'w', 'UTF-8') as character_list_file:
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))


            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            # Starts at -1 so the first loop iteration is epoch 0, which
            # trains nothing (see the if/elif below).
            epoch_number = -1
            try:
                while True:
                    step = 0
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    elif epoch_number != 0:
                        # Train model: loop over all sequences of training set with shuffling
                        sequence_numbers=list(range(len(dataset.token_indices['train'])))
                        random.shuffle(sequence_numbers)
                        for sequence_number in sequence_numbers:
                            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                            step += 1
                            if step % 10 == 0:
                                print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    # Prediction-only mode: output brat files once and stop.
                    if parameters['use_pretrained_model'] and not parameters['train_model']:
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                        break

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    writers['train'].flush()
                    utils.copytree(writers['train'].get_logdir(), model_folder)


                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if  valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                        # Overwrite the brat output with the new best epoch.
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    else:
                        bad_counter += 1
                    print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']: break


            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            print('ok1')
            evaluate.save_results(results, stats_graph_folder)
            print('ok2')
        print('ok3')
        # NOTE(review): the session is never closed explicitly; the call
        # below was disabled in the original, presumably on purpose (the
        # 'okN' progress prints look like shutdown tracing) - confirm before
        # re-enabling.
        #sess.close() # release the session's resources
    print('ok4')
Example #13
0
def main():
    '''Run a SelfSent experiment: train/evaluate, or deploy a pretrained model.

    The parameter file defaults to 'parameters_yelp_50k.ini' and can be
    overridden by passing a path containing '.ini' as the first command-line
    argument.

    Two modes are driven by parameters['use_pretrained_model']:
      * True:  at epoch 0 the pretrained model is restored, predictions are
               made on the 'deploy' split only, per-sample attentions are
               written to 'attention.txt' next to the deploy output file,
               the attention plot is produced, and the loop stops.
      * False: epoch 0 only evaluates; every following epoch trains, then
               evaluates, saves a checkpoint and applies early stopping on
               the validation accuracy (parameters['patience']).

    Results are saved with evaluate.save_results() under
    '../output/<dataset>_<timestamp>/' before the session is closed.
    '''
    file_params = 'parameters_yelp_50k.ini'
    if len(sys.argv) > 1 and '.ini' in sys.argv[1]:
        file_params = sys.argv[1]

    # Load config
    parameters, conf_parameters = load_parameters(
        parameters_filepath=os.path.join('.', file_params))
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)

    if parameters['seed'] != -1:
        random.seed(parameters['seed'])

    # Create annotator
    annotator = stanford_corenlp_pywrapper.CoreNLP(
        configdict={
            'annotators': 'tokenize, ssplit',
            'ssplit.eolonly': True
        },
        corenlp_jars=[parameters['stanford_folder'] + '/*'])
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters, annotator)

    # Adapt train/valid/test to be multiple of batch_size
    for size in ['train_size', 'valid_size', 'test_size']:
        if parameters[size] % parameters['batch_size'] != 0:
            parameters[size] = int(
                parameters[size] /
                parameters['batch_size']) * parameters['batch_size']
            print('Changed {}'.format(size))

    # Set GPU device if more GPUs are specified
    if parameters['number_of_gpus'] > 1 and parameters['gpu_device'] != -1:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        # os.environ only accepts strings; the parameter may be an int.
        os.environ["CUDA_VISIBLE_DEVICES"] = str(parameters['gpu_device'])

    # GPUs
    print(device_lib.list_local_devices())
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={
                'CPU': 1,
                'GPU': parameters['number_of_gpus']
            },
            # automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist
            allow_soft_placement=True,
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Only the TensorFlow seeding depends on the seed; the experiment
            # itself must run whether or not a seed was requested.
            if parameters['seed'] != -1:
                tf.set_random_seed(parameters['seed'])

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()

            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)

            # Folder where to save graphs
            stats_graph_folder = os.path.join(output_folder, model_name)
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            # Use only the file name: joining with a path that has directory
            # components (or is absolute) would write outside model_folder and
            # could overwrite the original parameter file.
            parameters_copy_filepath = os.path.join(
                model_folder, os.path.basename(file_params))
            with open(parameters_copy_filepath, 'w') as parameters_file:
                conf_parameters.write(parameters_file)

            with open(os.path.join(model_folder, 'dataset.pickle'),
                      'wb') as dataset_file:
                pickle.dump(dataset, dataset_file)

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = SelfSent(dataset, parameters)

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(
                    sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0  # number of epochs with no improvement on the validation test
            previous_best_valid_accuracy = 0
            previous_best_test_accuracy = 0
            # defaults to saving all variables
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs'])
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        dataset = train.restore_model_parameters_from_pretrained_model(
                            parameters, dataset, sess, model_saver)
                        dataset.load_deploy(
                            os.path.join(parameters['dataset_folder'],
                                         '{0}.json'.format('deploy')),
                            parameters, annotator)
                        y_pred, y_true, output_filepaths, attentions = train.predict_labels(
                            sess,
                            model,
                            parameters,
                            dataset,
                            epoch_number,
                            stats_graph_folder,
                            dataset_filepaths,
                            only_deploy=True)
                        y_pred = y_pred['deploy']

                        # Folder (with trailing '/') holding the deploy output
                        deploy_output_folder = output_filepaths['deploy'][
                            :output_filepaths['deploy'].rfind('/') + 1]
                        batch_size = parameters['batch_size']

                        with open(deploy_output_folder + 'attention.txt',
                                  'w',
                                  encoding='utf-8') as fp:
                            # Compute attention
                            tokens_with_attentions = []
                            for sample_id in range(len(y_pred)):
                                # attentions is indexed by batch, then by
                                # position of the sample inside the batch
                                attention = attentions[int(
                                    sample_id / batch_size)][sample_id %
                                                             batch_size]
                                # Remove padded dimension
                                attention = attention[:dataset.token_lengths[
                                    'deploy'][sample_id]]

                                # Save current attention
                                fp.write("{}\t{:05.2f}\t".format(
                                    y_pred[sample_id][0],
                                    y_pred[sample_id][1]))
                                fp.write(' '.join(
                                    dataset.tokens['deploy'][sample_id]) +
                                         '\t')
                                fp.write(' '.join(
                                    [str(a)
                                     for a in attention.flatten()]) + '\n')

                                # Sum over columns (we combine all the annotation vectors)
                                attention = np.sum(attention, axis=1)
                                # Scale by the vector's L2 norm
                                attention = attention / np.linalg.norm(
                                    attention)

                                # Keep only high confidence
                                if y_pred[sample_id][1] >= parameters[
                                        'attention_visualization_conf']:
                                    tokens_with_attentions.append(
                                        (y_pred[sample_id][0],
                                         y_pred[sample_id][1],
                                         dataset.tokens['deploy'][sample_id],
                                         attention))

                        # Plot attention
                        utils_plots.visualize_attention(
                            tokens_with_attentions, dataset.unique_labels,
                            deploy_output_folder,
                            parameters['attention_visualization_conf'])
                        break
                    elif epoch_number != 0:
                        total_loss, total_accuracy = train.train_step(
                            sess, dataset, model, parameters)
                        print('Mean loss: {:.2f}\tMean accuracy: {:.2f}'.
                              format(np.mean(total_loss),
                                     100.0 * np.mean(total_accuracy)),
                              flush=True)

                    epoch_elapsed_training_time = time.time(
                    ) - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(
                        epoch_elapsed_training_time),
                          flush=True)

                    y_pred, y_true, output_filepaths, _ = train.predict_labels(
                        sess, model, parameters, dataset, epoch_number,
                        stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred,
                                            y_true, stats_graph_folder,
                                            epoch_number, epoch_start_time,
                                            output_filepaths, parameters)

                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(
                            model_folder,
                            'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Early stop
                    valid_accuracy = results['epoch'][epoch_number][0][
                        'valid']['accuracy_score']
                    if valid_accuracy > previous_best_valid_accuracy:
                        bad_counter = 0
                        previous_best_valid_accuracy = valid_accuracy
                        previous_best_test_accuracy = results['epoch'][
                            epoch_number][0]['test']['accuracy_score']
                    else:
                        bad_counter += 1
                    print(
                        "The last {0} epochs have not shown improvements on the validation set."
                        .format(bad_counter))
                    print("Best valid with test performances in epoch " +
                          str(epoch_number - bad_counter) +
                          ": {:05.2f}%\t{:05.2f}%".format(
                              previous_best_valid_accuracy,
                              previous_best_test_accuracy))
                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']:
                        break

            except KeyboardInterrupt:
                # Still fall through to saving the partial results below.
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

        sess.close()  # release the session's resources
Example #14
0
def conll_to_brat(conll_input_filepath, conll_output_filepath, brat_original_folder, brat_output_folder,
                  overwrite=False):
    '''
    convert conll file in conll-filepath to brat annotations and output to brat_output_folder,
    with reference to the existing text files in brat_original_folder
    if brat_original_folder does not exist or contain any text file, then the text files are generated from conll files,
    and conll file is updated with filenames and token offsets accordingly.

    conll_input_filepath: path to conll file to convert to brat annotations
    conll_output_filepath: path to output conll file with filename and offsets that are compatible with brat annotations
    brat_original_folder: folder that contains the original .txt (and .ann) files that are formatted according to brat.
                          .txt files are used to check if the token offsets match and generate the annotation from conll.
    brat_output_folder: folder to output the text and brat annotations
                        .txt files are copied from brat_original_folder to brat_output_folder
    overwrite: forwarded unchanged to output_entities

    Each non-empty conll line is read as space-separated fields: token text first, file name second,
    start and end character offsets third and fourth, and the label last.
    An entity that is still open when the file name changes or when the input ends is kept
    (it is attached to the file it was read from) instead of being dropped.
    '''
    verbose = False
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    print("Formatting {0} set from CONLL to BRAT... ".format(dataset_type), end='')

    # if brat_original_folder does not exist or have any text file
    if not os.path.exists(brat_original_folder) or len(glob.glob(os.path.join(brat_original_folder, '*.txt'))) == 0:
        # The reference text generation rewrites the conll file, so it must not write over its own input
        assert (conll_input_filepath != conll_output_filepath)
        generate_reference_text_file_for_conll(conll_input_filepath, conll_output_filepath, brat_original_folder)

    utils.create_folder_if_not_exists(brat_output_folder)

    previous_token_label = 'O'
    previous_filename = ''
    text_filepath = ''
    text = ''
    entities = []
    entity = {}
    # The context manager closes the conll file even if a line fails to parse
    with codecs.open(conll_output_filepath, 'r', 'latin-1', errors='replace') as conll_file:
        for line in conll_file:
            line = line.strip().split(' ')
            # New sentence
            if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
                # Add the last entity
                if entity != {}:
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity = {}
                previous_token_label = 'O'
                continue

            filename = str(line[1])
            # New file
            if filename != previous_filename:
                # An entity still open here belongs to the previous file: keep it before writing that file
                if entity != {}:
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                output_entities(brat_output_folder, previous_filename, entities, text_filepath, text,
                                overwrite=overwrite)
                text_filepath = os.path.join(brat_original_folder, '{0}.txt'.format(filename))
                with codecs.open(text_filepath, 'r', 'latin-1', errors='replace') as f:
                    text = f.read()
                previous_token_label = 'O'
                previous_filename = filename
                entities = []
                entity = {}

            label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER
            if label == 'O':
                # Previous entity ended
                if previous_token_label != 'O':
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity = {}
                previous_token_label = 'O'
                continue

            token = {}
            token['text'] = str(line[0])
            token['start'] = int(line[2])
            token['end'] = int(line[3])
            # check that the token text matches the original
            if token['text'] != text[token['start']:token['end']].replace(' ', '-'):
                print("Warning: conll and brat text do not match.")
                print("\tCONLL: {0}".format(token['text']))
                print("\tBRAT : {0}".format(text[token['start']:token['end']]))
            # Label without its two-character 'B-' / 'I-' prefix
            token['label'] = label[2:]

            if label[:2] == 'B-':
                if previous_token_label != 'O':
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                # Start a new entity
                entity = token
            elif label[:2] == 'I-':
                # Entity continued
                if previous_token_label == token['label']:
                    # if there is no newline between the entity and the token
                    if '\n' not in text[entity['end']:token['start']]:
                        # Update entity
                        entity['text'] = entity['text'] + ' ' + token['text']
                        entity['end'] = token['end']
                    else:  # newline between the entity and the token
                        # End the previous entity
                        if verbose: print("entity: {0}".format(entity))
                        entities.append(entity)
                        # Start a new entity
                        entity = token
                elif previous_token_label != 'O':
                    # TODO: count BI or II incompatibility
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    # Start new entity
                    entity = token
                else:  # previous_token_label == 'O'
                    # TODO: count  OI incompatibility
                    # Start new entity
                    entity = token
            previous_token_label = token['label']

    # The input may end without a blank line: keep the entity that is still open
    if entity != {}:
        if verbose: print("entity: {0}".format(entity))
        entities.append(entity)
    output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=overwrite)
    print('Done.')
Example #15
0
def main(argv=sys.argv):
    ''' NeuroNER main method

    Args:
        parameters_filepath the path to the parameters file
        output_folder the path to the output folder
    '''
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():

                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # saving the parameter setting to the output model dir. For later resuming training
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                patience_counter = 0
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                f1_scores_conll = {
                    'train-F1': [],
                    'valid-F1': [],
                    'test-F1': []
                }
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        # use pre-trained model and epoch_number = 0
                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_adapter']:
                                parameters['use_adapter'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_adapter, y_true_for_adapter, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_adapter=True)
                                # use the label2idx mapping (for adapter) in the dataset to transform all_pred_labels
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(
                                            len(all_pred_labels[dataset_type])
                                    ):
                                        indices = [
                                            dataset.
                                            label_adapter_to_index[label]
                                            for label in
                                            all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(
                                            indices)

                                # and use binarizer to transform to ndarray
                                label_binarizer_adapter = sklearn.preprocessing.LabelBinarizer(
                                )
                                label_binarizer_adapter.fit(
                                    range(
                                        max(dataset.index_to_label_adapter.
                                            keys()) + 1))
                                predicted_label_adapter_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_adapter_vector_indices[
                                        dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[
                                            dataset_type]:
                                        predicted_label_adapter_vector_indices[
                                            dataset_type].append(
                                                label_binarizer_adapter.
                                                transform(
                                                    label_indices_sequence))
                                parameters['use_adapter'] = True

                            if parameters['train_model'] and parameters[
                                    'add_class']:
                                transition_params_trained, model, glo_step = \
                                    train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess,
                                                                                         model, model_saver)
                                init_new_vars_op = tf.initialize_variables(
                                    [glo_step])
                                sess.run(init_new_vars_op)
                            else:
                                transition_params_trained = \
                                    train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)

                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)

                        # epoch_number != 0, no matter use or not use pre-trained model
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)

                        if parameters[
                                'use_adapter']:  # model evaluation, using adapter
                            # pass the pred_for_adapter as label_indices vector
                            original_label_adapter_vector_indices = dataset.label_adapter_vector_indices
                            dataset.label_adapter_vector_indices = predicted_label_adapter_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_adapter_vector_indices = original_label_adapter_vector_indices

                        else:  # model evaluation,  not using adapter
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['weighted']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['weighted']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['weighted']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        train_f1_score_conll = results['epoch'][epoch_number][
                            0]['train']['f1_conll']['micro']
                        valid_f1_score_conll = results['epoch'][epoch_number][
                            0]['valid']['f1_conll']['micro']
                        test_f1_score_conll = results['epoch'][epoch_number][
                            0]['test']['f1_conll']['micro']
                        f1_scores_conll['train-F1'].append(
                            train_f1_score_conll)
                        f1_scores_conll['valid-F1'].append(
                            valid_f1_score_conll)
                        f1_scores_conll['test-F1'].append(test_f1_score_conll)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print(
                                'updated model to current epoch : epoch {:d}'.
                                format(epoch_number))
                            print('the model is saved in: {:s}'.format(
                                model_folder))
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            # save last model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'last_model.ckpt'))
                            print('the last model is saved in: {:s}'.format(
                                model_folder))

                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'

                    print('Sklearn result:')
                    for k, l in f1_scores.items():
                        print(k, l)

                    print('Conll result:')
                    for k, l in f1_scores_conll.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                    # TODO: in step 1, for task a, add the best deploy data to step 2 train set, and call script
                    print('(sklearn micro) test F1:')
                    micro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['micro']) for ep in range(epoch_number + 1)
                    ])
                    print(micro_f1)
                    print('(sklearn macro) test F1:')
                    macro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['macro']) for ep in range(epoch_number + 1)
                    ])
                    print(macro_f1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    sess.close()  # release the session's resources
    def initialize_population(self, params: dict):
        """
        Creates the output folders of the run and fills the agent pools.
            Args:
                params (required): params obtained from argparse. Although
                    annotated as ``dict``, it is read through attributes
                    (``save_example_batch``, ``single_pool``, ``evolution``,
                    ``population_size``, ``init_nodes``).
        """
        # Gather the sub-folder suffixes in creation order, then create them.
        subfolders = []
        if params.save_example_batch:
            subfolders.append("/messages")

        if params.single_pool:
            subfolders.append("/agents")
            if params.evolution:
                subfolders.append("/agents_genotype")
        else:
            subfolders.extend(["/senders", "/receivers"])
            if params.evolution:
                subfolders.extend(["/senders_genotype", "/receivers_genotype"])

        for subfolder in subfolders:
            create_folder_if_not_exists(self.run_folder + subfolder)

        for agent_id in range(params.population_size):
            if params.evolution:
                # Both genotypes are generated even for a single pool, where
                # only the sender genotype ends up being used.
                sender_genotype = generate_genotype(
                    num_nodes=params.init_nodes)
                receiver_genotype = generate_genotype(
                    num_nodes=params.init_nodes)
            else:
                sender_genotype = None
                receiver_genotype = None

            if params.single_pool:
                agent = SingleAgent(self.run_folder,
                                    params,
                                    genotype=sender_genotype,
                                    agent_id=agent_id)
                self.agents.append(agent)
                continue

            # Separate pools: one sender and one receiver per population slot.
            sender = SenderAgent(self.run_folder,
                                 params,
                                 genotype=sender_genotype,
                                 agent_id=agent_id)
            self.senders.append(sender)
            receiver = ReceiverAgent(self.run_folder,
                                     params,
                                     genotype=receiver_genotype,
                                     agent_id=agent_id)
            self.receivers.append(receiver)
    def train(self, max_number_of_epoch, model_folder, dropout_rate=0.5):
        """
        Train the model epoch by epoch, saving a checkpoint after every epoch.

        The dataset and the parameters are pickled into ``model_folder``
        first. Epoch 0 performs no training step and only evaluates; every
        later epoch runs one training step per training sequence, in shuffled
        order, and then evaluates on each of the 'train', 'valid' and 'test'
        splits present in ``self.dataset.label_indices``.

        Args:
            max_number_of_epoch: the loop stops once the epoch number is
                strictly greater than this value.
            model_folder: folder receiving 'dataset.pickle',
                'parameters.pickle' and the 'model.ckpt' checkpoint; created
                if it does not exist.
            dropout_rate: value forwarded to ``train.train_step``.

        Raises:
            KeyError: if the dataset has no 'valid' split, since the stopping
                criterion reads the validation score unconditionally.
        """
        utils.create_folder_if_not_exists(model_folder)

        # Context managers make sure both pickle files are flushed and closed.
        with open(os.path.join(model_folder, 'dataset.pickle'),
                  'wb') as dataset_file:
            pickle.dump(self.dataset, dataset_file)
        with open(os.path.join(model_folder, 'parameters.pickle'),
                  'wb') as parameters_file:
            pickle.dump(self.parameters, parameters_file)

        # Number of consecutive epochs whose validation score moved by less
        # than 0.1 compared with the previous epoch.
        bad_counter = 0
        # Validation score of the previous epoch (not the best one so far).
        previous_valid_f1_score = -100
        epoch_number = -1

        while True:

            step = 0
            epoch_number += 1
            print('\nStarting epoch {0}'.format(epoch_number))

            epoch_start_time = time.time()

            if epoch_number != 0:
                # Train model: loop over all sequences of training set with shuffling
                sequence_numbers = list(
                    range(len(self.dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                for sequence_number in sequence_numbers:
                    self.transition_params_trained = train.train_step(
                        self.sess, self.dataset, sequence_number, self.model,
                        dropout_rate)
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done'.format(
                            step / len(sequence_numbers) * 100),
                              end='\r',
                              flush=True)

            epoch_elapsed_training_time = time.time() - epoch_start_time
            print('Training completed in {0:.2f} seconds'.format(
                epoch_elapsed_training_time),
                  flush=True)

            f1_score = {}
            for data_type in ['train', 'valid', 'test']:
                if data_type not in self.dataset.label_indices:
                    continue
                _, _, f1_score[data_type] = train.evaluate_step(
                    sess=self.sess,
                    dataset_type=data_type,
                    dataset=self.dataset,
                    model=self.model,
                    transition_params_trained=self.transition_params_trained,
                    tagging_format=self.tagging_format)

            # The checkpoint is overwritten after every epoch.
            self.model.saver.save(self.sess,
                                  os.path.join(model_folder, 'model.ckpt'))

            # NOTE(review): index -2 presumably selects the F1 value inside
            # the score sequence returned by evaluate_step -- TODO confirm.
            valid_f1_score = f1_score['valid'][-2]
            if abs(valid_f1_score - previous_valid_f1_score) < 0.1:
                bad_counter += 1
            else:
                bad_counter = 0
            if bad_counter > 10:
                break
            previous_valid_f1_score = valid_f1_score

            # Epoch 0 does not train and the comparison is strict, so up to
            # max_number_of_epoch + 1 training epochs are run.
            if epoch_number > max_number_of_epoch:
                break
Example #18
0
    parameters)

# Initialize and save execution details
start_time = time.time()
results = {}
results['epoch'] = {}
results['execution_details'] = {}
results['execution_details']['train_start'] = start_time
results['execution_details']['time_stamp'] = experiment_timestamp
results['execution_details']['early_stop'] = False
results['execution_details']['keyboard_interrupt'] = False
results['execution_details']['num_epochs'] = 0
results['model_options'] = copy.copy(parameters)

model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
utils.create_folder_if_not_exists(tensorboard_log_folder)
tensorboard_log_folders = {}
for dataset_type in dataset_filepaths.keys():
    tensorboard_log_folders[dataset_type] = os.path.join(
        stats_graph_folder, 'tensorboard_logs', dataset_type)
    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

# Instantiate the writers for TensorBoard
writers = {}
for dataset_type in dataset_filepaths.keys():
    writers[dataset_type] = tf.summary.FileWriter(
Example #19
0
def main(args):
    """
    Run every (trained model, test dataset) pair of an experiment set.

    For each pair the pickled dataset and the 'parameters.ini' stored next to
    the model checkpoint are loaded, the 'predict' split is parsed and
    indexed, the pretrained model is restored and used for prediction, and
    the CoNLL evaluation output is appended to one results file under
    '../predictions'.

    Args:
        args: parsed command-line arguments; only ``args.experiment_set`` is
            read. It selects the list of pairs in
            ``experiments['experiments']``, whose items hold a key of
            ``experiments['models']`` at index 0 and a key of
            ``experiments['datasets']`` at index 1.

    Raises:
        IOError: if 'use_pretrained_model' is falsy when prediction starts.
    """
    experiments = utils.load_experiments()
    pprint(experiments)
    time_stamp = utils.get_current_time_in_miliseconds()
    result_file = '{0}_{1}'.format(args.experiment_set + "_" + "results_",
                                   time_stamp + ".txt")
    print(result_file)

    # The folder has to exist before the results file is opened inside it.
    predictions_root = os.path.join('..', 'predictions')
    utils.create_folder_if_not_exists(predictions_root)

    with open(os.path.join(predictions_root, result_file), "w",
              encoding="utf-8") as results_file:
        for elem in experiments['experiments'][args.experiment_set]:
            trained_model = elem[0]
            test = elem[1]
            print("======================")
            print("Train on {0}, test {1}".format(trained_model, test))
            print("======================")

            pretrained_model_folder = os.path.dirname(
                experiments['models'][trained_model])
            # NOTE: unpickling runs arbitrary code from the file, so only
            # trusted model folders should be listed in the experiments.
            with open(os.path.join(pretrained_model_folder, 'dataset.pickle'),
                      'rb') as dataset_file:
                dataset = pickle.load(dataset_file)

            parameters, conf_parameters = load_parameters(
                os.path.join(pretrained_model_folder, 'parameters.ini'),
                verbose=False)
            parameters['train_model'] = False
            parameters['use_pretrained_model'] = True
            parameters['dataset_predict'] = experiments['datasets'][test]
            parameters['pretrained_model_name'] = "{0}_on_{1}".format(
                trained_model, test)
            parameters['pretrained_model_checkpoint_filepath'] = \
                experiments['models'][trained_model]
            dataset_filepaths = get_valid_dataset_filepaths(parameters)
            pprint(parameters)

            # Parse the split to predict on and rebuild the index structures.
            dataset_type = "predict"
            dataset.labels[dataset_type], dataset.tokens[dataset_type], _, _, _ = \
                dataset._parse_dataset(dataset_filepaths.get(dataset_type, None),
                                       parameters['language'])
            _build_predict_token_index(dataset, parameters)
            _index_dataset_sequences(dataset, dataset_filepaths)

            # Create graph and session
            with tf.Graph().as_default():
                session_conf = tf.ConfigProto(
                    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                    # automatically choose an existing and supported device to
                    # run the operations in case the specified one doesn't exist
                    allow_soft_placement=True,
                    log_device_placement=False,
                )
                session_conf.gpu_options.allow_growth = True

                sess = tf.Session(config=session_conf)
                try:
                    _predict_and_report(sess, dataset, parameters,
                                        conf_parameters, dataset_filepaths,
                                        predictions_root, time_stamp,
                                        results_file)
                finally:
                    # One session is created per experiment; release it even
                    # when prediction fails.
                    sess.close()


def _build_predict_token_index(dataset, parameters):
    """Rebuild ``dataset.token_to_index`` from the tokens of the 'predict' split."""
    next_index = 0
    dataset.token_to_index = dict()
    dataset.number_of_unknown_tokens = 0
    for token_sentence in tqdm(dataset.tokens['predict']):
        for token in token_sentence:
            # Step over the indices reserved for the unknown and padding tokens.
            if next_index == dataset.UNK_TOKEN_INDEX:
                next_index += 1
            if next_index == dataset.PADDING_TOKEN_INDEX:
                next_index += 1
            if not utils_nlp.is_token_in_pretrained_embeddings(
                    token, dataset.vocab_embeddings, parameters):
                if parameters['embedding_type'] == 'fasttext':
                    # NOTE(review): a repeated token receives a fresh index on
                    # every occurrence here -- confirm this is intended.
                    dataset.token_to_index[token] = next_index
                    next_index += 1
                else:
                    dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                    dataset.number_of_unknown_tokens += 1
                    dataset.tokens_mapped_to_unk.append(token)
            elif token not in dataset.token_to_index:
                dataset.token_to_index[token] = next_index
                next_index += 1


def _index_dataset_sequences(dataset, dataset_filepaths):
    """Fill the token, character and label index structures of every split in ``dataset_filepaths``."""
    for dataset_type in dataset_filepaths.keys():
        dataset.token_indices[dataset_type] = []
        dataset.characters[dataset_type] = []
        dataset.character_indices[dataset_type] = []
        dataset.token_lengths[dataset_type] = []
        dataset.sequence_lengths[dataset_type] = []
        dataset.longest_token_length_in_sequence[dataset_type] = []
        for token_sequence in dataset.tokens[dataset_type]:
            dataset.token_indices[dataset_type].append(
                [dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX)
                 for token in token_sequence])
            dataset.characters[dataset_type].append(
                [list(token) for token in token_sequence])
            dataset.character_indices[dataset_type].append(
                [[dataset.character_to_index.get(character,
                                                 dataset.UNK_CHARACTER_INDEX)
                  for character in token] for token in token_sequence])
            dataset.token_lengths[dataset_type].append(
                [len(token) for token in token_sequence])
            dataset.sequence_lengths[dataset_type].append(len(token_sequence))
            dataset.longest_token_length_in_sequence[dataset_type].append(
                max(dataset.token_lengths[dataset_type][-1]))

        dataset.label_indices[dataset_type] = []
        for label_sequence in dataset.labels[dataset_type]:
            dataset.label_indices[dataset_type].append(
                [dataset.label_to_index[label] for label in label_sequence])

    # The padding label is the one-hot vector of the "O" label.
    tmp_vector = [0] * len(dataset.unique_labels)
    tmp_vector[dataset.label_to_index["O"]] = 1
    dataset.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        dataset.label_vector_indices[dataset_type] = []
        for label_indices_sequence in dataset.label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(dataset.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            dataset.label_vector_indices[dataset_type].append(vector_sequence)


def _predict_and_report(sess, dataset, parameters, conf_parameters,
                        dataset_filepaths, predictions_root, time_stamp,
                        results_file):
    """Restore the pretrained model, predict, evaluate and append the CoNLL output to ``results_file``."""
    model = EntityLSTM(dataset, parameters)
    model_saver = tf.train.Saver()

    dataset_name = parameters['pretrained_model_name']
    model_name = '{0}_{1}'.format(dataset_name, time_stamp)
    prediction_folder = os.path.join(predictions_root, model_name)
    utils.create_folder_if_not_exists(prediction_folder)
    # Fixed epoch number handed to the prediction and evaluation helpers.
    epoch_number = 100
    with open(os.path.join(prediction_folder, 'parameters.ini'),
              'w') as parameters_file:
        conf_parameters.write(parameters_file)

    if not parameters['use_pretrained_model']:
        raise IOError('Set use_pretrained_model parameter to True')

    # Restore pretrained model parameters
    transition_params_trained = train.restore_model_parameters_from_pretrained_model(
        parameters, dataset, sess, model, model_saver)
    model.load_pretrained_token_embeddings(sess, dataset, parameters)
    start_time = time.time()
    results = {}
    results['epoch'] = {}
    results['execution_details'] = {}
    results['execution_details']['train_start'] = start_time
    results['execution_details']['time_stamp'] = start_time
    results['execution_details']['early_stop'] = False
    results['execution_details']['keyboard_interrupt'] = False
    results['execution_details']['num_epochs'] = epoch_number
    results['model_options'] = copy.copy(parameters)
    demo = parameters['pretrained_model_name'] == "demo"
    y_pred, y_true, output_filepaths = train.predict_labels(
        sess, model, transition_params_trained, parameters, dataset,
        epoch_number, prediction_folder, dataset_filepaths, demo=demo)
    conll_output_file = evaluate.evaluate_model(
        results, dataset, y_pred, y_true, prediction_folder, epoch_number,
        start_time, output_filepaths, parameters)

    results_file.write(parameters['pretrained_model_name'] + "\n")
    with open(conll_output_file, "r") as conll_file:
        conll = conll_file.read()
    results_file.write(conll)
    results_file.write("\n\n\n")
    if demo:
        _print_demo_predictions(dataset, y_pred)


def _print_demo_predictions(dataset, y_pred):
    """Print each 'predict' token, followed by its predicted label when that label is not "O"."""
    print("============")
    print(" Prediction ")
    print("============")
    # y_pred['predict'] is indexed by a running token counter across sentences.
    i = 0
    for sentence in dataset.tokens['predict']:
        for token in sentence:
            predict_label = dataset.index_to_label[y_pred['predict'][i]]
            if predict_label != "O":
                print(token, predict_label)
            else:
                print(token)
            i += 1
        print("")
def generate_reference_text_file_for_conll(conll_filepath, text_folder):
    '''
    generates reference text files and adds the corresponding filename and token offsets to conll file.

    One text file is written per document (documents are delimited by
    -DOCSTART- lines); documents without any token produce no file. Each token
    line of the conll file is rewritten as
    "token text_base_filename start end <remaining columns>". The original
    conll file is first copied to '<basename>_original.txt' in the same folder
    and then overwritten in place.

    conll_filepath: path to a conll-formatted file without filename and token offsets
    text_folder: folder to write the reference text file to
    '''
    dataset_type = utils.get_basename_without_extension(conll_filepath)

    def base_filename_for(document_number):
        # Name (without extension) of the text file of a given document.
        return '{0}_text_{1}'.format(dataset_type,
                                     str(document_number).zfill(5))

    def write_text_file(base_filename, parts):
        # Write the accumulated document text, unless the document is empty.
        if not parts:
            return
        text_filepath = os.path.join(text_folder,
                                     '{0}.txt'.format(base_filename))
        with codecs.open(text_filepath, 'w', 'UTF-8') as f:
            f.write(''.join(parts))

    # Pieces are collected in lists and joined once, instead of growing
    # strings with += on every token.
    text_parts = []
    conll_lines = []
    character_index = 0
    document_count = 0
    text_base_filename = base_filename_for(document_count)

    # The context manager closes the input file even if parsing fails.
    with codecs.open(conll_filepath, 'r', 'UTF-8') as conll_file:
        utils.create_folder_if_not_exists(text_folder)
        for line in conll_file:
            split_line = line.strip().split(' ')
            first_column = split_line[0]
            # New document
            if '-DOCSTART-' in first_column:
                conll_lines.append(line)
                write_text_file(text_base_filename, text_parts)
                text_parts = []
                character_index = 0
                document_count += 1
                text_base_filename = base_filename_for(document_count)
                continue
            # New sentence (split(' ') always yields at least one item, so an
            # empty first column is the only way a blank line shows up)
            if not first_column:
                conll_lines.append('\n')
                if text_parts:
                    text_parts.append('\n')
                    character_index += 1
                continue
            token = first_column
            start = character_index
            end = start + len(token)
            text_parts.append(token + ' ')
            character_index += len(token) + 1
            conll_lines.append(' '.join(
                [token, text_base_filename,
                 str(start), str(end)] + split_line[1:]) + '\n')
        # Flush the last document.
        write_text_file(text_base_filename, text_parts)

    original_conll_filepath = os.path.join(
        os.path.dirname(conll_filepath),
        '{0}_original.txt'.format(dataset_type))
    shutil.copyfile(conll_filepath, original_conll_filepath)
    with codecs.open(conll_filepath, 'w', 'UTF-8') as f:
        f.write(''.join(conll_lines))
Example #21
0
def main():
    """Train and evaluate an EntityLSTM model, one epoch per loop iteration.

    Steps, in order:
      1. Load the parameters and the dataset.
      2. Create a TensorFlow graph/session and the output folders
         (``../output/<dataset_name>_<timestamp>/``), and pickle the dataset
         there as ``dataset.pickle``.
      3. Build the model, the TensorBoard writers and the embedding-projector
         metadata files.
      4. Loop over epochs: train on every (shuffled) training sequence,
         predict on train/valid/test, evaluate, save a checkpoint, and stop
         early when the validation micro-F1 has not improved for more than
         ``parameters['patience']`` epochs.
      5. Save the accumulated results, also after a ``KeyboardInterrupt``.
    """
    parameters, dataset_filepaths = load_parameters()

    # Load dataset
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            # Automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist.
            allow_soft_placement=True,
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            # Folder where to save graphs
            stats_graph_folder = os.path.join(output_folder, model_name)
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                  'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in ['train', 'valid', 'test']:
                tensorboard_log_folders[dataset_type] = os.path.join(
                    stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(
                    tensorboard_log_folders[dataset_type])

            # Use a context manager so the pickle file is flushed and closed
            # deterministically instead of relying on garbage collection.
            with open(os.path.join(stats_graph_folder, 'dataset.pickle'),
                      'wb') as dataset_file:
                pickle.dump(dataset, dataset_file)

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the
            # graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in ['train', 'valid', 'test']:
                writers[dataset_type] = tf.summary.FileWriter(
                    tensorboard_log_folders[dataset_type], graph=sess.graph)
            # embedding_writer has to write in model_folder, otherwise
            # TensorBoard won't be able to view embeddings
            embedding_writer = tf.summary.FileWriter(model_folder)

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = (
                embeddings_projector_config.embeddings.add())
            tensorboard_token_embeddings.tensor_name = (
                model.token_embedding_weights.name)
            token_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(
                token_list_file_path, '..')

            tensorboard_character_embeddings = (
                embeddings_projector_config.embeddings.add())
            tensorboard_character_embeddings.tensor_name = (
                model.character_embedding_weights.name)
            character_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(
                character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer,
                                           embeddings_projector_config)

            # Write metadata for TensorBoard embeddings: one token per line,
            # in index order.
            with open(token_list_file_path, 'w') as token_list_file:
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))

            # Same for characters; the padding index has no real character,
            # so it is written as the literal label 'PADDING'.
            with open(character_list_file_path, 'w') as character_list_file:
                print('len(dataset.character_to_index): {0}'.format(
                    len(dataset.character_to_index)))
                print('len(dataset.index_to_character): {0}'.format(
                    len(dataset.index_to_character)))
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to
            # 1 epoch.
            step = 0
            # Number of epochs with no improvement on the validation set in
            # terms of F1-score
            bad_counter = 0
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            # Defaults to saving all variables
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs'])
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()

                    # Train model: loop over all sequences of training set
                    # with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        # Progress indicator: one dot per 100 training steps
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                    print('.', flush=True)

                    # Predict labels using trained model
                    y_pred = {}
                    y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        (y_pred[dataset_type], y_true[dataset_type],
                         output_filepaths[dataset_type]) = prediction_output

                    epoch_elapsed_training_time = (time.time() -
                                                   epoch_start_time)
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                            stats_graph_folder, epoch_number,
                                            epoch_start_time, output_filepaths,
                                            parameters)

                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(model_folder,
                                     'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1

                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    # NOTE(review): epoch_number starts at 0 and the test is a
                    # strict '>', so the loop can run
                    # maximum_number_of_epochs + 2 epochs while the Saver only
                    # keeps maximum_number_of_epochs checkpoints -- confirm
                    # whether '>=' was intended.
                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break

            except KeyboardInterrupt:
                # Treat Ctrl-C as a request to stop training; the partial
                # results are still saved below.
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

            # Flush and close the TensorBoard event files so that no buffered
            # summaries are lost.
            for writer in writers.values():
                writer.close()
            embedding_writer.close()

    sess.close()  # release the session's resources
Example #22
0
    def save(self):
        """Export the current annotations to ``self.save_folder``.

        Creates (if missing) the following layout under ``self.save_folder``:

        * ``classed_images/<label>/`` -- one JPEG crop per canvas item tagged
          with ``<label>``;
        * ``voc/Annotations/<timestamp>.xml`` -- one ``<object>`` entry per
          canvas item, with its bounding box clamped to the image bounds;
        * ``voc/JPEGImages/<timestamp>.jpeg`` -- a copy of ``self.raw_image``.

        ``<timestamp>`` is the current local time formatted as
        ``%Y%m%d-%H%M%S``.
        """
        utils.create_folder_if_not_exists(self.save_folder)

        classed_image = os.path.join(self.save_folder, "classed_images")
        utils.create_folder_if_not_exists(classed_image)

        voc_path = os.path.join(self.save_folder, "voc")
        utils.create_folder_if_not_exists(voc_path)

        jpeg_image_path = os.path.join(voc_path, "JPEGImages")
        utils.create_folder_if_not_exists(jpeg_image_path)

        annotation_path = os.path.join(voc_path, "Annotations")
        utils.create_folder_if_not_exists(annotation_path)

        file_name = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        image_width, image_height = self.raw_image.size[0], self.raw_image.size[1]

        # Root of the annotation document, with the image dimensions.
        annotation_xml = Element('annotation')
        size_xml = SubElement(annotation_xml, 'size')
        width_xml = SubElement(size_xml, 'width')
        width_xml.text = str(image_width)

        height_xml = SubElement(size_xml, 'height')
        height_xml.text = str(image_height)

        depth_xml = SubElement(size_xml, 'depth')
        depth_xml.text = str(3)

        for label in self.labels_listbox.get(0, END):

            found = self.canvas.find_withtag(label)

            label_path = os.path.join(classed_image, label)

            utils.create_folder_if_not_exists(label_path)

            for j, found_index in enumerate(found):
                # Process crop image. The crop uses the raw canvas bounding
                # box, exactly as drawn.
                xmin, ymin, xmax, ymax = self.canvas.bbox(found_index)
                bbox = (xmin, ymin, xmax, ymax)
                crop = self.raw_image.crop(bbox)
                crop_path = os.path.join(
                    label_path,
                    str.format("{0}-{1}[{2}].jpg", label, file_name, j))

                # Remove the illegal part: clamp the annotated box to the
                # image bounds. ymax must be clamped from ymax itself;
                # clamping it from ymin collapsed every box to zero height.
                xmin = max(xmin, 0)
                ymin = max(ymin, 0)
                xmax = min(xmax, image_width)
                ymax = min(ymax, image_height)

                crop.save(crop_path)

                # Process xml object
                object_xml = SubElement(annotation_xml, 'object')

                name_xml = SubElement(object_xml, 'name')
                name_xml.text = str(label)

                difficult_xml = SubElement(object_xml, 'difficult')
                difficult_xml.text = str(0)

                bndbox_xml = SubElement(object_xml, 'bndbox')

                xmin_xml = SubElement(bndbox_xml, 'xmin')
                xmin_xml.text = str(xmin)

                ymin_xml = SubElement(bndbox_xml, 'ymin')
                ymin_xml.text = str(ymin)

                xmax_xml = SubElement(bndbox_xml, 'xmax')
                xmax_xml.text = str(xmax)

                ymax_xml = SubElement(bndbox_xml, 'ymax')
                ymax_xml.text = str(ymax)

        # encoding='unicode' makes ElementTree emit str, matching the
        # text-mode file handle.
        with open(
                os.path.join(annotation_path,
                             str.format("{0}.xml", file_name)), 'w') as xml:
            tree = ElementTree(annotation_xml)
            tree.write(xml, encoding='unicode')

        self.raw_image.save(
            os.path.join(jpeg_image_path, str.format("{0}.jpeg", file_name)))
Example #23
0
def main(_):
    """Train the review classifier and score all three splits every epoch.

    Reads the vocabulary, GloVe embeddings and the pickled train/dev/test
    sets, builds the model inside a TensorFlow session, and then, for
    ``FLAGS.num_epochs`` epochs:

    * runs one optimisation pass over the shuffled training set;
    * runs the validation and test sets through the model without training;
    * writes the labels/predictions of each split with ``save_results`` into
      ``./output/<start timestamp>/``;
    * saves a checkpoint to ``FLAGS.checkpoint_dir`` whenever the test
      accuracy improves on the best value seen so far.

    The single positional argument is ignored.
    """
    vocab = read_vocab('data/ICLR_Review_all_with_decision-w2i.pkl')
    embedding_file = 'glove.6B.{}d.txt'.format(FLAGS.emb_size)
    glove_embs = load_glove(embedding_file, FLAGS.emb_size, vocab)
    data_reader = DataReader(
        train_file='data/ICLR_Review_all_with_decision-train.pkl',
        dev_file='data/ICLR_Review_all_with_decision-dev.pkl',
        test_file='data/ICLR_Review_all_with_decision-test.pkl')

    session_config = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=session_config) as sess:
        model = Model(cell_dim=FLAGS.cell_dim,
                      att_dim=FLAGS.att_dim,
                      vocab_size=len(vocab),
                      emb_size=FLAGS.emb_size,
                      num_classes=FLAGS.num_classes,
                      dropout_rate=FLAGS.dropout_rate,
                      pretrained_embs=glove_embs)

        # Build the loss, optimiser and metric ops on top of the model.
        loss = loss_fn(model.labels, model.logits)
        train_op, global_step = train_fn(loss)
        metric_ops = eval_fn(model.labels, model.logits)
        batch_acc, total_acc, acc_update, metrics_init, predictions = metric_ops
        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        train_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)

        print('\n{}> Start training'.format(datetime.now()))

        # Results of this run go to ./output/<timestamp of the run>/
        run_folder_name = str(datetime.now())
        output_folder = os.path.join('.', 'output')
        create_folder_if_not_exists(output_folder)
        stats_graph_folder = os.path.join(output_folder, run_folder_name)
        create_folder_if_not_exists(stats_graph_folder)

        # Evaluation batches are scaled down by the train/test size ratio.
        train_test_prop = (len(data_reader.train_data) /
                           len(data_reader.test_data))
        test_batch_size = int(FLAGS.batch_size / train_test_prop)

        # Fetch lists are the same for every batch, so build them once.
        train_fetches = [
            global_step, train_op, loss, batch_acc, acc_update, predictions
        ]
        eval_fetches = [loss, batch_acc, acc_update, predictions]

        best_acc = float('-inf')
        valid_step = 0
        test_step = 0
        epoch = 0

        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n{}> Epoch: {}'.format(datetime.now(), epoch))

            # ---- Training pass -------------------------------------------
            sess.run(metrics_init)
            train_labels = []
            train_predictions = []
            train_batches = data_reader.read_train_set(FLAGS.batch_size,
                                                       shuffle=True)
            for batch_docs, batch_labels in train_batches:
                train_feed = model.get_feed_dict(batch_docs,
                                                 batch_labels,
                                                 training=True)
                fetched = sess.run(train_fetches, feed_dict=train_feed)
                current_step = fetched[0]
                batch_predictions = fetched[-1]
                train_labels.extend(batch_labels)
                train_predictions.extend(batch_predictions.tolist())
                if current_step % FLAGS.display_step == 0:
                    summary_feed = model.get_feed_dict(batch_docs,
                                                       batch_labels)
                    batch_summary = sess.run(summary_op,
                                             feed_dict=summary_feed)
                    train_writer.add_summary(batch_summary,
                                             global_step=current_step)
            train_accuracy = sess.run(total_acc) * 100
            print('Training accuracy = {:.2f}'.format(train_accuracy))
            save_results(train_labels, train_predictions, stats_graph_folder,
                         'train', epoch)

            # ---- Validation pass (no parameter updates) ------------------
            sess.run(metrics_init)
            valid_labels = []
            valid_predictions = []
            for batch_docs, batch_labels in data_reader.read_valid_set(
                    test_batch_size):
                eval_feed = model.get_feed_dict(batch_docs, batch_labels)
                fetched = sess.run(eval_fetches, feed_dict=eval_feed)
                valid_labels.extend(batch_labels)
                valid_predictions.extend(fetched[-1].tolist())

                valid_step += 1
                if valid_step % FLAGS.display_step == 0:
                    summary_feed = model.get_feed_dict(batch_docs,
                                                       batch_labels)
                    batch_summary = sess.run(summary_op,
                                             feed_dict=summary_feed)
                    valid_writer.add_summary(batch_summary,
                                             global_step=valid_step)
            valid_accuracy = sess.run(total_acc) * 100
            print('Validation accuracy = {:.2f}'.format(valid_accuracy))
            save_results(valid_labels, valid_predictions, stats_graph_folder,
                         'valid', epoch)

            # ---- Test pass (no parameter updates) ------------------------
            sess.run(metrics_init)
            test_labels = []
            test_predictions = []
            for batch_docs, batch_labels in data_reader.read_test_set(
                    test_batch_size):
                eval_feed = model.get_feed_dict(batch_docs, batch_labels)
                fetched = sess.run(eval_fetches, feed_dict=eval_feed)
                test_labels.extend(batch_labels)
                test_predictions.extend(fetched[-1].tolist())

                test_step += 1
                if test_step % FLAGS.display_step == 0:
                    summary_feed = model.get_feed_dict(batch_docs,
                                                       batch_labels)
                    batch_summary = sess.run(summary_op,
                                             feed_dict=summary_feed)
                    test_writer.add_summary(batch_summary,
                                            global_step=test_step)
            test_acc = sess.run(total_acc) * 100
            print('Testing accuracy = {:.2f}'.format(test_acc))
            save_results(test_labels, test_predictions, stats_graph_folder,
                         'test', epoch)

            # Checkpoint only when the test accuracy improves.
            if test_acc > best_acc:
                best_acc = test_acc
                saver.save(sess, FLAGS.checkpoint_dir)
            print('Best testing accuracy = {:.2f}'.format(best_acc))

    print("{} Optimization Finished!".format(datetime.now()))
    print('Best testing accuracy = {:.2f}'.format(best_acc))
Example #24
0
def main(args):
    """Run a pretrained EntityLSTM model on the 'predict' dataset.

    Loads the parameters (optionally overriding ``predict_text`` with
    ``args.file``), restores the dataset object pickled next to the
    pretrained checkpoint, rebuilds the token/character/label index
    structures for the files to predict, and then restores the model and
    writes its predictions under ``../predictions/<language>_<model>_<time>/``.
    When the pretrained model is named ``"demo"``, the predicted label of
    every token is also printed.

    Args:
        args: parsed command-line arguments; only ``args.file`` is read.

    Raises:
        IOError: if ``parameters['use_pretrained_model']`` is falsy.
        AssertionError: if ``parameters['embedding_type']`` is neither
            ``'glove'`` nor ``'fasttext'`` for a token missing from the
            pretrained embeddings.
    """
    parameters, conf_parameters = load_parameters()
    if args.file:
        parameters['predict_text'] = args.file
    parameters = process_input(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)

    # Load dataset
    # NOTE(review): this freshly built dataset is replaced by the pickled one
    # just below, so unless load_vocab_word_embeddings has side effects
    # outside the instance this work is discarded -- confirm.
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_vocab_word_embeddings(parameters)

    pretrained_model_folder = os.path.dirname(
        parameters['pretrained_model_checkpoint_filepath'])
    # NOTE: unpickling executes arbitrary code; only point
    # pretrained_model_checkpoint_filepath at model folders you trust.
    # The context manager closes the file handle deterministically.
    with open(os.path.join(pretrained_model_folder, 'dataset.pickle'),
              'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    dataset.load_dataset(dataset_filepaths, parameters)
    dataset_type = "predict"
    (dataset.labels[dataset_type], dataset.tokens[dataset_type],
     _, _, _) = dataset._parse_dataset(
         dataset_filepaths.get(dataset_type, None), parameters['language'])

    # Rebuild the token -> index mapping from the tokens to predict, skipping
    # the indices reserved for the unknown and padding tokens.
    iteration_number = 0
    dataset.token_to_index = dict()
    for token_sentence in dataset.tokens['predict']:
        for token in token_sentence:
            if iteration_number == dataset.UNK_TOKEN_INDEX:
                iteration_number += 1
            if iteration_number == dataset.PADDING_TOKEN_INDEX:
                iteration_number += 1

            if not utils_nlp.is_token_in_pretrained_embeddings(
                    token, dataset.vocab_embeddings, parameters):
                if parameters['embedding_type'] == 'glove':
                    # No embedding available: map the token to UNK.
                    dataset.token_to_index[token] = dataset.UNK_TOKEN_INDEX
                    dataset.number_of_unknown_tokens += 1
                    dataset.tokens_mapped_to_unk.append(token)
                elif parameters['embedding_type'] == 'fasttext':
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1
                else:
                    raise AssertionError("Embedding type not recognized")
            else:
                if token not in dataset.token_to_index:
                    dataset.token_to_index[token] = iteration_number
                    iteration_number += 1

    # Convert tokens, characters and labels of every dataset to indices.
    for dataset_type in dataset_filepaths.keys():
        dataset.token_indices[dataset_type] = []
        dataset.characters[dataset_type] = []
        dataset.character_indices[dataset_type] = []
        dataset.token_lengths[dataset_type] = []
        dataset.sequence_lengths[dataset_type] = []
        dataset.longest_token_length_in_sequence[dataset_type] = []
        for token_sequence in dataset.tokens[dataset_type]:
            dataset.token_indices[dataset_type].append([
                dataset.token_to_index.get(token, dataset.UNK_TOKEN_INDEX)
                for token in token_sequence
            ])
            dataset.characters[dataset_type].append(
                [list(token) for token in token_sequence])
            dataset.character_indices[dataset_type].append([[
                dataset.character_to_index.get(character,
                                               dataset.UNK_CHARACTER_INDEX)
                for character in token
            ] for token in token_sequence])
            dataset.token_lengths[dataset_type].append(
                [len(token) for token in token_sequence])
            dataset.sequence_lengths[dataset_type].append(len(token_sequence))
            dataset.longest_token_length_in_sequence[dataset_type].append(
                max(dataset.token_lengths[dataset_type][-1]))

        dataset.label_indices[dataset_type] = []
        for label_sequence in dataset.labels[dataset_type]:
            dataset.label_indices[dataset_type].append(
                [dataset.label_to_index[label] for label in label_sequence])

    # One-hot label vectors; padding positions are labelled "O".
    tmp_vector = [0] * len(dataset.unique_labels)
    tmp_vector[dataset.label_to_index["O"]] = 1
    dataset.PADDING_LABEL_VECTOR = tmp_vector
    for dataset_type in dataset_filepaths.keys():
        dataset.label_vector_indices[dataset_type] = []
        for label_indices_sequence in dataset.label_indices[dataset_type]:
            vector_sequence = []
            for indice in label_indices_sequence:
                vector = [0] * len(dataset.unique_labels)
                vector[indice] = 1
                vector_sequence.append(vector)
            dataset.label_vector_indices[dataset_type].append(vector_sequence)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            # Automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist.
            allow_soft_placement=True,
            log_device_placement=False,
        )
        session_conf.gpu_options.allow_growth = True

        sess = tf.Session(config=session_conf)
        try:
            model = EntityLSTM(dataset, parameters)
            model_saver = tf.train.Saver()

            prediction_folder = os.path.join('..', 'predictions')
            utils.create_folder_if_not_exists(prediction_folder)
            dataset_name = parameters['pretrained_model_name']
            model_name = '{0}_{1}'.format(
                parameters["language"] + "_" + dataset_name,
                utils.get_current_time_in_miliseconds())
            prediction_folder = os.path.join(prediction_folder, model_name)
            utils.create_folder_if_not_exists(prediction_folder)
            epoch_number = 100
            # Keep a copy of the configuration next to the predictions.
            with open(os.path.join(prediction_folder, 'parameters.ini'),
                      'w') as parameters_file:
                conf_parameters.write(parameters_file)

            if parameters['use_pretrained_model']:
                # Restore pretrained model parameters
                transition_params_trained = (
                    train.restore_model_parameters_from_pretrained_model(
                        parameters, dataset, sess, model, model_saver))
                model.load_pretrained_token_embeddings(sess, dataset,
                                                       parameters)

                demo = parameters['pretrained_model_name'] == "demo"
                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    dataset, epoch_number, prediction_folder,
                    dataset_filepaths, demo=demo)

                if demo:
                    print("============")
                    print(" Prediction ")
                    print("============")
                    # y_pred['predict'] is indexed with one running counter
                    # across all sentences.
                    i = 0
                    for sentence in dataset.tokens['predict']:
                        for token in sentence:
                            predict_label = dataset.index_to_label[
                                y_pred['predict'][i]]
                            if predict_label != "O":
                                print(token, predict_label)
                            else:
                                print(token)
                            i += 1
                        print("")
            else:
                raise IOError('Set use_pretrained_model parameter to True')
        finally:
            # Release the session's resources even when prediction fails.
            sess.close()
Example #25
0
def main(argv=sys.argv):
    """Train an EntityLSTM model end to end, driven by command-line arguments.

    The function parses ``argv``, loads the parameters and the dataset, builds
    a TensorFlow graph/session, prepares the output folders and TensorBoard
    writers, and then runs the epoch loop: (optional) restore of a pretrained
    model at epoch 0, one pass over the shuffled training sequences on later
    epochs, prediction + evaluation on every dataset split, and checkpointing
    whenever the validation micro-F1 improves. Results are saved via
    ``evaluate.save_results`` once the loop exits or is interrupted with
    Ctrl-C.

    Args:
        argv: Full argument vector. ``argv[0]`` is skipped and the remainder is
            passed to ``parse_arguments``. The default is ``sys.argv`` as bound
            when the function is defined.
    """

    arguments = parse_arguments(argv[1:])

    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    # allow_soft_placement=True lets TensorFlow fall back to another device
    # when '/gpu:0' is not available.
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Bookkeeping structure that is filled by evaluate.evaluate_model
                # during training and persisted by evaluate.save_results at the end.
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                # Output layout:
                #   <output_folder>/<model_name>/                  stats and graphs
                #   <output_folder>/<model_name>/model/            checkpoints, parameters.ini, dataset.pickle
                #   <output_folder>/<model_name>/tensorboard_logs/ one sub-folder per dataset split
                #   <output_folder>/weights/
                # The model name is the dataset folder name only (no timestamp),
                # so runs on the same dataset share the same folder.
                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # Keep a copy of the configuration next to the model.
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                # NOTE(review): the file object passed to pickle.dump is never
                # closed explicitly; a `with open(...)` block would guarantee
                # the pickle is flushed to disk.
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                model = EntityLSTM(dataset, parameters)

                # One TensorBoard writer per dataset split, plus a writer in the
                # model folder used for the embedding projector configuration.
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(model_folder)

                # Register the token and character embedding tensors with the
                # TensorBoard projector; metadata paths are stored relative to
                # the parent of the current working directory.
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add(
                )
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write the projector metadata: one token per line, in index order.
                # NOTE(review): the files are encoded as latin-1, so a token
                # outside that range would raise on write — confirm the
                # vocabulary is restricted accordingly.
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                # Same for characters; the padding index gets a readable label.
                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                # When training from scratch, token embeddings are loaded here;
                # with a pretrained model the restore at epoch 0 is used instead.
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                patience_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                # Random initial transition matrix with two extra rows/columns
                # beyond the label set (presumably CRF start/end pseudo-labels
                # — TODO confirm against EntityLSTM).
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep']
                )  #, reshape= True)  # defaults to saving all variables
                # Starts at -1 so that the first loop iteration is epoch 0.
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        # Epoch 0 never trains: it either restores a pretrained
                        # model (branch below) or goes straight to evaluation.
                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_corrector']:
                                # Temporarily disable the corrector so that the
                                # step-1 model can be restored and used to
                                # produce predictions; the flag is switched
                                # back on at the end of this branch.
                                parameters['use_corrector'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_corrector, y_true_for_corrector, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_corrector = True)
                                # Map every predicted label string to its
                                # corrector index, sequence by sequence.
                                all_pred_indices = {}  #defaultdict(list)
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(
                                            len(all_pred_labels[dataset_type])
                                    ):
                                        indices = [
                                            dataset.
                                            label_corrector_to_index[label]
                                            for label in
                                            all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(
                                            indices)

                                # Binarize the index sequences with a
                                # LabelBinarizer fitted on 0..max corrector index.
                                label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer(
                                )
                                label_binarizer_corrector.fit(
                                    range(
                                        max(dataset.index_to_label_corrector.
                                            keys()) + 1))
                                predicted_label_corrector_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_corrector_vector_indices[
                                        dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[
                                            dataset_type]:
                                        predicted_label_corrector_vector_indices[
                                            dataset_type].append(
                                                label_binarizer_corrector.
                                                transform(
                                                    label_indices_sequence))
                                parameters['use_corrector'] = True

                            # Restore the pretrained parameters; this call also
                            # returns a (possibly replaced) model and a
                            # global-step variable.
                            transition_params_trained, model, glo_step = \
                                train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)

                            # Re-create the writers so they capture the graph
                            # as it is after the restore.
                            # NOTE(review): embedding_writer is re-created on
                            # every loop iteration here; it looks like it was
                            # meant to sit outside the loop — confirm.
                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)
                            # Only the returned global-step variable is
                            # (re)initialized after the restore.
                            # NOTE(review): tf.initialize_variables is the older
                            # name of tf.variables_initializer — confirm it is
                            # still available in the TensorFlow version in use.
                            init_new_vars_op = tf.initialize_variables(
                                [glo_step])
                            sess.run(init_new_vars_op)

                        elif epoch_number != 0:
                            # Regular training epoch: one train_step per
                            # training sequence, in random order.
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1

                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)
                        if parameters['use_corrector']:
                            # Evaluate with the step-1 predictions swapped in as
                            # corrector inputs, then put the originals back.
                            # NOTE(review): predicted_label_corrector_vector_indices
                            # is only assigned in the epoch-0 pretrained-model
                            # branch above; with use_corrector on and
                            # use_pretrained_model off this line would raise
                            # NameError — presumably excluded by
                            # check_parameter_compatiblity, verify.
                            original_label_corrector_vector_indices = dataset.label_corrector_vector_indices
                            dataset.label_corrector_vector_indices = predicted_label_corrector_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_corrector_vector_indices = original_label_corrector_vector_indices
                        else:
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        # Log the model summary for this epoch and mirror the
                        # 'train' TensorBoard log directory into the model folder.
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        # Reads the micro-F1 of the 'train', 'valid' and 'test'
                        # splits from the results filled in by evaluate_model,
                        # so all three splits must be present.
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['micro']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['micro']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print(
                                'updated model to current epoch : epoch {:d}'.
                                format(epoch_number))
                            print('the model is saved in: {:s}'.format(
                                model_folder))
                            ### newly deleted
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        # NOTE(review): exceeding the patience only records the
                        # flag and prints a message; there is no `break`, so
                        # training continues until the epoch limit below —
                        # confirm this is intended.
                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True

                        # Optional CRF refinement once the epoch limit is reached.
                        # NOTE(review): this path never breaks out of the
                        # `while True` loop (the break below requires
                        # refine_with_crf to be false), so with refine_with_crf
                        # enabled the loop only ends on KeyboardInterrupt —
                        # confirm.
                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and parameters[
                                    'refine_with_crf']:
                            model = train.refine_with_crf(
                                parameters, sess, model, model_saver)
                            print('refine model with CRF ...')

                            # Additional epochs reuse the outer epoch_number and
                            # epoch_start_time, so evaluation results and
                            # summaries are written under the same epoch index
                            # and the elapsed time is cumulative.
                            for additional_epoch in range(
                                    parameters['additional_epochs_with_crf']):
                                print('Additional {:d}th epoch'.format(
                                    additional_epoch))
                                sequence_numbers = list(
                                    range(len(dataset.token_indices['train'])))
                                random.shuffle(sequence_numbers)
                                for sequence_number in sequence_numbers:
                                    transition_params_trained, W_before_crf = train.train_step(
                                        sess, dataset, sequence_number, model,
                                        transition_params_trained, parameters)
                                    step += 1
                                epoch_elapsed_training_time = time.time(
                                ) - epoch_start_time
                                print(
                                    'Additional training completed in {0:.2f} seconds'
                                    .format(epoch_elapsed_training_time),
                                    flush=False)

                                y_pred, y_true, output_filepaths = train.predict_labels(
                                    sess, model, transition_params_trained,
                                    parameters, dataset, epoch_number,
                                    stats_graph_folder, dataset_filepaths)

                                evaluate.evaluate_model(
                                    results, dataset, y_pred, y_true,
                                    stats_graph_folder, epoch_number,
                                    epoch_start_time, output_filepaths,
                                    parameters)

                                summary = sess.run(model.summary_op,
                                                   feed_dict=None)
                                writers['train'].add_summary(
                                    summary, epoch_number)
                                writers['train'].flush()
                                utils.copytree(writers['train'].get_logdir(),
                                               model_folder)

                        # Normal termination: epoch limit reached and no CRF
                        # refinement requested.
                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    # The F1 summary plot is produced inside the try block, so
                    # it is skipped when training is interrupted with Ctrl-C.
                    # It is written one level above stats_graph_folder.
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    for k, l in f1_scores.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                except KeyboardInterrupt:
                    # Ctrl-C stops training but still falls through to the
                    # result-saving code below.
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                # NOTE(review): embedding_writer is not closed here — confirm
                # whether that is intentional.
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    # Release the session's resources once all context managers have exited.
    sess.close()
Example #26
0
def main():
    """Train and evaluate an EntityLSTM model configured by ``./parameters.ini``.

    The configuration sections are flattened into a single dictionary and the
    known numeric / boolean options are converted from strings. The train,
    valid and test splits are read from ``dataset_text_folder``. Each loop
    iteration trains on every training sequence in random order, predicts on
    the three splits and evaluates. The loop stops when the validation
    micro-F1 has failed to improve for more than ``patience`` epochs, when the
    epoch number exceeds ``maximum_number_of_epochs``, or on Ctrl-C. Results
    are saved through ``evaluate.save_results`` in all three cases.
    """
    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(
        conf_parameters)

    # Merge every configuration section into one flat dictionary.
    parameters = {}
    for _, section_options in nested_parameters.items():
        parameters.update(section_options)

    # Option names grouped by the type their string value is converted to.
    integer_options = (
        'remove_unknown_tokens', 'character_embedding_dimension',
        'character_lstm_hidden_state_dimension',
        'token_embedding_dimension',
        'token_lstm_hidden_state_dimension', 'patience',
        'maximum_number_of_epochs', 'maximum_training_time',
        'number_of_cpu_threads', 'number_of_gpus',
    )
    float_options = ('dropout_rate',)
    boolean_options = (
        'use_character_lstm', 'is_character_lstm_bidirect',
        'is_token_lstm_bidirect', 'use_crf',
    )
    # The three groups are disjoint, so at most one conversion applies per
    # option. Only values of existing keys are replaced during iteration.
    for option_name, raw_value in parameters.items():
        if option_name in integer_options:
            parameters[option_name] = int(raw_value)
        elif option_name in float_options:
            parameters[option_name] = float(raw_value)
        elif option_name in boolean_options:
            parameters[option_name] = distutils.util.strtobool(raw_value)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {
        'train': os.path.join(parameters['dataset_text_folder'], 'train.txt'),
        'valid': os.path.join(parameters['dataset_text_folder'], 'valid.txt'),
        'test': os.path.join(parameters['dataset_text_folder'], 'test.txt'),
    }
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        # allow_soft_placement: automatically choose an existing and supported
        # device to run the operations in case the specified one doesn't exist.
        device_limits = {'CPU': 1, 'GPU': 1}
        session_conf = tf.ConfigProto(device_count=device_limits,
                                      allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Instantiate model
            model = EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {
                'epoch': {},
                'execution_details': {
                    'train_start': start_time,
                    'time_stamp': experiment_timestamp,
                    'early_stop': False,
                    'keyboard_interrupt': False,
                    'num_epochs': 0,
                },
                'model_options': copy.copy(parameters),
            }
            execution_details = results['execution_details']

            # Output goes to ../output/<dataset name>_<timestamp>.
            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name,
                                          execution_details['time_stamp'])
            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            # Folder where to save graphs
            stats_graph_folder = os.path.join(output_folder, model_name)
            utils.create_folder_if_not_exists(stats_graph_folder)

            step = 0
            epochs_without_improvement = 0
            best_valid_f1_score = 0
            label_count = len(dataset.unique_labels)
            transition_params_trained = np.random.rand(label_count,
                                                       label_count)
            try:
                while True:
                    # The epoch index is derived from the number of training
                    # steps performed so far.
                    training_set_size = len(dataset.token_indices['train'])
                    epoch_number = math.floor(step / training_set_size)
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()

                    # Train model: visit every training sequence once, in
                    # random order, printing a progress dot every 100 steps.
                    shuffled_sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(shuffled_sequence_numbers)
                    for sequence_number in shuffled_sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                    print('.', flush=True)

                    # Predict labels using trained model
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        predictions, y_true, output_filepath = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        all_predictions[dataset_type] = predictions
                        all_y_true[dataset_type] = y_true
                        output_filepaths[dataset_type] = output_filepath

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:.2f} seconds'.format(
                        epoch_elapsed_training_time))

                    execution_details['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, all_predictions,
                                            all_y_true, stats_graph_folder,
                                            epoch_number, epoch_start_time,
                                            output_filepaths)

                    # Early stop: track how many consecutive epochs failed to
                    # beat the best validation micro-F1 seen so far.
                    epoch_results = results['epoch'][epoch_number][0]
                    valid_f1_score = epoch_results['valid']['f1_score']['micro']
                    if valid_f1_score > best_valid_f1_score:
                        epochs_without_improvement = 0
                        best_valid_f1_score = valid_f1_score
                    else:
                        epochs_without_improvement += 1

                    if epochs_without_improvement > parameters['patience']:
                        print('Early Stop!')
                        execution_details['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break

            except KeyboardInterrupt:
                # Ctrl-C ends training; results are still saved below.
                execution_details['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            execution_details['train_duration'] = end_time - start_time
            execution_details['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources
Example #27
0
def main(languages):
    """Run one training experiment per combination of language and embedding settings.

    For every element of the Cartesian product of ``languages``, the embedding
    types, the embedding languages and the character-LSTM flags defined below,
    this function loads and adjusts the configuration, redirects ``sys.stdout``
    to ``../log/<model_name>``, loads the dataset, builds an ``EntityLSTM`` in a
    fresh TensorFlow graph/session and runs the train/evaluate loop with early
    stopping. The parameters, checkpoints and TensorBoard files are written
    under ``../output/<model_name>``.

    Args:
        languages: iterable of language identifiers. Each one is passed to
            ``set_datasets`` and, for the 'source' embedding language, used as
            a key of ``constants.MAPPING_LANGUAGE``.

    Raises:
        ValueError: if an embedding type is neither 'polyglot' nor contains
            'fasttext'.
    """
    # Alternative embedding sets used in other runs:
    # embeddings_type = ['polyglot', 'fasttext']
    # embeddings_type = ['fasttext', 'fasttext_noOOV']
    embeddings_type = ['fasttext_noOOV']
    character_lstm = [True]
    embedding_language = ['target', 'source']
    combination = product(languages, embeddings_type, embedding_language, character_lstm)
    create_folder_if_not_exists(os.path.join("..", "log"))
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    # One summary log shared by all experiments of this call.
    log_file = os.path.join("..", "log", "experiment-{}.log".format(experiment_timestamp))

    # Remember the real stdout so it can be restored after each experiment;
    # otherwise sys.stdout is left pointing at a closed log file.
    original_stdout = sys.stdout

    for language, emb_type, emb_language, char_lstm in combination:
        conf_parameters = load_parameters()
        conf_parameters = set_datasets(conf_parameters, language)
        conf_parameters.set('ann', 'use_character_lstm', str(char_lstm))
        conf_parameters.set('ann', 'embedding_type', emb_type)
        conf_parameters.set('ann', 'embedding_language', emb_language)
        if emb_type == 'polyglot':
            conf_parameters.set('ann', 'embedding_dimension', str(64))
        elif 'fasttext' in emb_type:
            conf_parameters.set('ann', 'embedding_dimension', str(300))
        else:
            # Raising a plain string is itself a TypeError in Python 3.
            raise ValueError("Unknown embedding type: {}".format(emb_type))
        if emb_language == 'source':
            conf_parameters.set('dataset', 'language', constants.MAPPING_LANGUAGE[language])
        else:
            conf_parameters.set('dataset', 'language', language)
        parameters, conf_parameters = parse_parameters(conf_parameters)

        start_time = time.time()
        experiment_timestamp = utils.get_current_time_in_miliseconds()

        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        dataset_name = utils.get_basename_without_extension(parameters['dataset_train'])
        model_name = '{0}_{1}_{2}_{3}_{4}'.format(language, emb_type, char_lstm, emb_language,
                                                  results['execution_details']['time_stamp'])

        # From here on everything printed goes to the per-experiment log file.
        sys.stdout = open(os.path.join("..", "log", model_name), "w")
        print(language, emb_type, char_lstm, emb_language)

        with open(log_file, "a") as experiment_log:
            experiment_log.write("Experiment: {}\n".format(model_name))
            experiment_log.write("Start time:{}\n".format(experiment_timestamp))
            experiment_log.write("-------------------------------------\n\n")
        pprint(parameters)
        dataset_filepaths = get_valid_dataset_filepaths(parameters)
        check_parameter_compatiblity(parameters, dataset_filepaths)
        previous_best_valid_epoch = -1

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False
            )

            session_conf.gpu_options.allow_growth = True

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details

                print(model_name)
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs',
                                                                         dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                if not parameters['use_pretrained_model']:
                    # Use a context manager so the pickle file handle is closed deterministically.
                    with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
                        pickle.dump(dataset, dataset_file)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type],
                                                                  graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

                # Write metadata for TensorBoard embeddings (one token per line, in index order)
                with codecs.open(token_list_file_path, 'w', 'UTF-8') as token_list_file:
                    for token_index in range(len(dataset.index_to_token)):
                        token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))

                if parameters['use_character_lstm']:
                    with codecs.open(character_list_file_path, 'w', 'UTF-8') as character_list_file:
                        for character_index in range(dataset.alphabet_size):
                            if character_index == dataset.PADDING_CHARACTER_INDEX:
                                character_list_file.write('PADDING\n')
                            else:
                                character_list_file.write(
                                    '{0}\n'.format(dataset.index_to_character[character_index]))

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
                    previous_best_valid_f1_score = -1
                    transition_params_trained = np.random.rand(len(dataset.unique_labels), len(
                        dataset.unique_labels))  # TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None)  # parameters['maximum_number_of_epochs'])  # defaults to saving all variables
                    epoch_number = 0

                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters['use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters,
                                                                                                             dataset,
                                                                                                             sess,
                                                                                                             model,
                                                                                                             model_saver)
                        elif epoch_number != 0:  # always true here: epoch_number starts at 1
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers), parameters['batch_size']), "Training epoch {}".format(epoch_number),
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                # Intermediate evaluation + checkpoint once at least 20000
                                # sequences have been processed since the previous one.
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ", sub_id)
                                    epoch_elapsed_training_time = time.time() - epoch_start_time
                                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time),
                                          flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model,
                                                                                            transition_params_trained,
                                                                                            parameters, dataset,
                                                                                            epoch_number + sub_id,
                                                                                            stats_graph_folder,
                                                                                            dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder,
                                                            epoch_number, epoch_start_time, output_filepaths,
                                                            parameters)
                                    # Save model
                                    model_saver.save(sess, os.path.join(model_folder,
                                                                        'model_{0:07.3f}.ckpt'.format(
                                                                            epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op, feed_dict=None)
                                    writers['train'].add_summary(summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(writers['train'].get_logdir(), model_folder)
                                    # Early-stop bookkeeping (the stop itself is only checked at epoch end)
                                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[i: i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(sess, dataset, sequence_number,
                                                                                   model, transition_params_trained,
                                                                                   parameters)
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained,
                                                                                parameters, dataset, epoch_number,
                                                                                stats_graph_folder, dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number,
                                                epoch_start_time, output_filepaths, parameters)

                        # Save model
                        model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            previous_best_valid_epoch = epoch_number
                        else:
                            bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(
                            bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters['maximum_number_of_epochs']:
                            break

                    # NOTE(review): on normal completion the results are not passed to
                    # evaluate.save_results (only the interrupt path does that) - confirm intended.
                    keep_only_best_model(model_folder, previous_best_valid_epoch,
                                         parameters['maximum_number_of_epochs'] + 1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # Offer to remove the experiment.
                    # NOTE(review): stdout is redirected to the log file here, so the prompt
                    # text presumably ends up in that file rather than on the console.
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details']['train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    # Log the traceback, then continue with the next experiment.
                    logging.exception("")
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")

            sess.close()  # release the session's resources
            sys.stdout.close()  # close this experiment's log file
        # Point stdout back at the original stream before the next experiment.
        sys.stdout = original_stdout
Example #28
0
# Default sender table: name -> [share, identifier]. Used only when no
# senders.json exists yet; otherwise the file content replaces it below.
senders = {
    'ykaner': [0.54, 'yk12953'],
    'noamh': [1 - developer_fee[0], '2'],
    'rotemsd': [1 - developer_fee[0], '10'],
    # 'ohad1': [0.26, 'yk12953'],
    'duperyuyu': [0.7, 'duper'],
    'Israel_Ben_Ari': [0.5, 'buying']
}

# Default owner table, same layout as `senders`.
owners = {'noamh': [0.5, '2'], 'rotemsd': [0.5, '10']}

token_data_f = 'token_data'
senders_f = 'senders.json'
owners_f = 'owners.json'
utils.create_folder_if_not_exists(token_data_f)


def _load_or_create_json(path, default):
    """Return the JSON content of `path`, first creating it from `default` if missing.

    Args:
        path: location of the JSON file.
        default: JSON-serializable value written (tab-indented) when the file
            does not exist yet.

    Returns:
        `default` itself when the file had to be created, otherwise the
        object decoded from the existing file.
    """
    if not os.path.exists(path):
        with open(path, 'w') as f:
            json.dump(default, f, indent='\t')
        return default
    with open(path, 'r') as f:
        return json.load(f)


senders = _load_or_create_json(os.path.join(token_data_f, senders_f), senders)
# The owners file previously dumped `senders` and was loaded back into
# `senders`, so `owners` was never persisted or refreshed.
# NOTE(review): an owners.json written by the old code holds sender data and
# should be deleted or corrected by hand.
owners = _load_or_create_json(os.path.join(token_data_f, owners_f), owners)


def choose_token(sender):
Example #29
0
    def fit(self):
        """Train the model, evaluating after every epoch, with early stopping.

        Creates the statistics/model/TensorBoard folders, saves the
        configuration and the pickled dataset, writes the TensorBoard
        embedding metadata, then loops over epochs. Epoch 0 only evaluates
        (no training step is run). After each epoch the model is evaluated
        and checkpointed; when the validation micro F1-score improves, the
        brat output is regenerated and ``self.transition_params_trained`` is
        updated. The loop stops after ``parameters['patience']`` epochs
        without improvement, at ``parameters['maximum_number_of_epochs']``,
        on Ctrl-C, or right after the first evaluation when a pretrained
        model is used without training. Results are saved at the end.
        """
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        dataset = self.dataset
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
            conf_parameters.write(parameters_file)
        # Use a context manager so the pickle file handle is closed deterministically.
        with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
            pickle.dump(dataset, dataset_file)

        tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
        # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

        projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

        # Write metadata for TensorBoard embeddings (one entry per line, in index order)
        with codecs.open(token_list_file_path,'w', 'UTF-8') as token_list_file:
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))

        with codecs.open(character_list_file_path,'w', 'UTF-8') as character_list_file:
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))

        # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
        bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                print('\nStarting epoch {0}'.format(epoch_number))

                epoch_start_time = time.time()

                # Epoch 0 is evaluation-only: no training step is run.
                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers=list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time
                print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                # Pretrained model without training: a single evaluation pass is all that is needed.
                if parameters['use_pretrained_model'] and not parameters['train_model']:
                    conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                    break

                # Save model
                model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    # Keep the brat output and the transition parameters of the best epoch so far.
                    conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                if bad_counter >= parameters['patience']:
                    print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break

        except KeyboardInterrupt:
            # Ctrl-C stops training but still falls through to saving the results below.
            results['execution_details']['keyboard_interrupt'] = True
            print('Training interrupted')

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()
Example #30
0
    def fit(self):
        '''
        Dùng để train data
        '''
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        dataset = self.dataset
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(parameters)

        # Khởi tạo và lưu các thông tin của lần chạy
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        '''
        An epoch, in Machine Learning, is the entire processing by the learning algorithm of the entire train-set.
        Ex:
        The MNIST train set is composed by 55000 samples. Once the algorithm processed all those 55000 samples an epoch is passed.
        '''
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time                # Thời gian bắt đầu chạy
        results['execution_details']['time_stamp'] = experiment_timestamp       # Nhãn thời gian
        results['execution_details']['early_stop'] = False                      # Cho biết có lỗi xảy ra nên bị dừng sớm ko
        results['execution_details']['keyboard_interrupt'] = False              # Cho biết có bị dừng bởi keyboard
        results['execution_details']['num_epochs'] = 0                          # Số lượng epoch đã chạy
        results['model_options'] = copy.copy(parameters)                        # Các tham số

        model_folder = os.path.join(stats_graph_folder, 'model')                # output/en.../model
        utils.create_folder_if_not_exists(model_folder)
        # Save value cac parameters vao file parameters.ini
        with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
            conf_parameters.write(parameters_file)                                          # Log các tham số ra file
        pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))      # Dump dataset thành pickle file để lần sau chạy

        # Tạo folder tensorboard logs để dùng cho việc vẽ biểu đồ sau này
        tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')       # folder lưu file log của tensorboard -> dùng cho việc plot biểu đồ lên
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])

        # Khởi tạo các writers cho tensorboard
        writers = {} # Có nhiều nhất 4 writers train, test, valid, deploy
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
        embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

        # Dùng cho việc visualize embedding bằng tensorboard
        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = 'tensorboard_metadata_tokens.tsv'#os.path.relpath(token_list_file_path, '..')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = 'tensorboard_metadata_characters.tsv'#os.path.relpath(character_list_file_path, '..')

        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

        # Ghi token vào file tsv dùng làm metadata cho embedding
        token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8')
        for token_index in range(dataset.vocabulary_size):
            token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
        token_list_file.close()

        # Ghi characters vào file tsv dùng làm metadata cho embedding
        character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8')
        for character_index in range(dataset.alphabet_size):
            if character_index == dataset.PADDING_CHARACTER_INDEX:
                character_list_file.write('PADDING\n')
            else:
                character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
        character_list_file.close()


        # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
        bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score
        previous_best_valid_f1_score = 0 # f1-Score tốt nhất ở các lần chạy trước
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                print('\nStarting epoch {0}'.format(epoch_number))

                epoch_start_time = time.time()

                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers=list(range(len(dataset.token_indices['train'])))
                    print("----****____")
                    print(dataset.token_indices['train'][:10])
                    random.shuffle(sequence_numbers)
                    # Run one training step per (shuffled) training sequence
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                # Compute how long the training part of this epoch took
                epoch_elapsed_training_time = time.time() - epoch_start_time
                print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                if parameters['use_pretrained_model'] and not parameters['train_model']:
                    conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                    break

                # Save model
                model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)


                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                # If this epoch's validation F1-score is higher than the best score seen so far
                if  valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                # If bad_counter reaches the limit parameters['patience'] (initial value: 10), stop training
                if bad_counter >= parameters['patience']:
                    print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                # If the epoch number >= the configured maximum number of epochs, stop training
                if epoch_number >= parameters['maximum_number_of_epochs']: break


        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True
            print('Training interrupted')

        # Training is finished: save the timing details and the results
        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()