Example #1
def generate_reference_text_file_for_conll(conll_input_filepath,
                                           conll_output_filepath, text_folder):
    '''
    Generates reference text files and adds the corresponding filenames and token offsets to the CoNLL file.

    conll_input_filepath: path to a CoNLL-formatted file without filenames and token offsets
    conll_output_filepath: path to the output CoNLL file, with filenames and token offsets added
    text_folder: folder to write the reference text files to
    '''
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    conll_file = codecs.open(conll_input_filepath, 'r', 'UTF-8')
    utils.create_folder_if_not_exists(text_folder)
    text = ''
    new_conll_string = ''
    character_index = 0
    document_count = 0
    text_base_filename = '{0}_text_{1}'.format(dataset_type,
                                               str(document_count).zfill(5))
    for line in conll_file:
        split_line = line.strip().split(' ')
        # New document
        if '-DOCSTART-' in split_line[0]:
            new_conll_string += line
            if len(text) != 0:
                with codecs.open(
                        os.path.join(text_folder,
                                     '{0}.txt'.format(text_base_filename)),
                        'w', 'UTF-8') as f:
                    f.write(text)
            text = ''
            character_index = 0
            document_count += 1
            text_base_filename = '{0}_text_{1}'.format(
                dataset_type,
                str(document_count).zfill(5))
            continue
        # New sentence
        elif len(split_line) == 0 or len(split_line[0]) == 0:
            new_conll_string += '\n'
            if text != '':
                text += '\n'
                character_index += 1
            continue
        token = split_line[0]
        start = character_index
        end = start + len(token)
        text += token + ' '
        character_index += len(token) + 1
        new_conll_string += ' '.join(
            [token, text_base_filename,
             str(start), str(end)] + split_line[1:]) + '\n'
    if len(text) != 0:
        with codecs.open(
                os.path.join(text_folder,
                             '{0}.txt'.format(text_base_filename)), 'w',
                'UTF-8') as f:
            f.write(text)
    conll_file.close()

    with codecs.open(conll_output_filepath, 'w', 'UTF-8') as f:
        f.write(new_conll_string)
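A minimal usage sketch for the function above; the file paths are hypothetical, and the input is assumed to be a space-separated CoNLL file whose first column is the token and whose last column is the label:

# Hypothetical paths: any CoNLL file without filename/offset columns will do.
# Writes valid_text_00000.txt, valid_text_00001.txt, ... to ./reference_text
# and a CoNLL file whose lines read 'token filename start end label'.
generate_reference_text_file_for_conll('valid.txt',
                                       'valid_with_offsets.txt',
                                       'reference_text')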
Example #2
def xml_to_brat(input_folder, output_folder, overwrite=True):
    print('input_folder: {0}'.format(input_folder))
    start_time = time.time()
    if overwrite:
        shutil.rmtree(output_folder, ignore_errors=True)
    utils.create_folder_if_not_exists(output_folder)

    for input_filepath in sorted(glob.glob(os.path.join(input_folder, '*.xml'))):
        filename = utils.get_basename_without_extension(input_filepath)
        output_text_filepath = os.path.join(output_folder, '{0}.txt'.format(filename))
        xmldoc = xml.etree.ElementTree.parse(input_filepath).getroot()
        # Get text
        text = xmldoc.findtext('TEXT')
        with codecs.open(output_text_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Get PHI tags
        tags = xmldoc.findall('TAGS')[0] # [0] because there is only one <TAGS>...</TAGS>
        entities = []
        for tag in tags:
            entity = {}
            entity['label'] = tag.get('TYPE')
            entity['text'] = tag.get('text')
            entity['start'] = int(tag.get('start'))
            entity['end'] = int(tag.get('end'))
            entities.append(entity)
        output_entities(output_folder, filename, entities, output_text_filepath, text, overwrite=overwrite)

    time_spent = time.time() - start_time
    print("Time spent formatting: {0:.2f} seconds".format(time_spent))
Example #3
def prepare_pretrained_model_for_restoring(output_folder_name,
                                           epoch_number,
                                           model_name,
                                           delete_token_mappings=False):
    '''
    Copy the dataset.pickle, parameters.ini, and model checkpoint files after removing the data used for training.
    
    The dataset and labels are deleted from dataset.pickle by default. The only information about the dataset that remain in the pretrained model
    is the list of tokens that appears in the dataset and the corresponding token embeddings learned from the dataset.
    
    If delete_token_mappings is set to True, index_to_token and token_to_index mappings are deleted from dataset.pickle additionally,
    and the corresponding token embeddings are deleted from the model checkpoint files. In this case, the pretrained model would not contain
    any information about the dataset used for training the model. 
    
    If you wish to share a pretrained model with delete_token_mappings = True, it is highly recommended to use some external pre-trained token 
    embeddings and freeze them while training the model to obtain high performance. This can be done by specifying the token_pretrained_embedding_filepath 
    and setting freeze_token_embeddings = True in parameters.ini for training.
    '''
    input_model_folder = os.path.join('..', 'output', output_folder_name,
                                      'model')
    output_model_folder = os.path.join('..', 'trained_models', model_name)
    utils.create_folder_if_not_exists(output_model_folder)

    # trim and copy dataset.pickle
    input_dataset_filepath = os.path.join(input_model_folder, 'dataset.pickle')
    output_dataset_filepath = os.path.join(output_model_folder,
                                           'dataset.pickle')
    trim_dataset_pickle(input_dataset_filepath,
                        output_dataset_filepath,
                        delete_token_mappings=delete_token_mappings)

    # copy parameters.ini
    parameters_filepath = os.path.join(input_model_folder, 'parameters.ini')
    shutil.copy(parameters_filepath, output_model_folder)

    # (trim and) copy checkpoint files
    epoch_number_string = str(epoch_number).zfill(5)
    if delete_token_mappings:
        input_checkpoint_filepath = os.path.join(
            input_model_folder, 'model_{0}.ckpt'.format(epoch_number_string))
        output_checkpoint_filepath = os.path.join(output_model_folder,
                                                  'model.ckpt')
        trim_model_checkpoint(parameters_filepath, output_dataset_filepath,
                              input_checkpoint_filepath,
                              output_checkpoint_filepath)
    else:
        for filepath in glob.glob(
                os.path.join(input_model_folder,
                             'model_{0}.ckpt*'.format(epoch_number_string))):
            shutil.copyfile(
                filepath,
                os.path.join(
                    output_model_folder,
                    os.path.basename(filepath).replace(
                        '_' + epoch_number_string, '')))
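A minimal call sketch under the folder conventions visible above ('../output/<output_folder_name>/model' as input, '../trained_models/<model_name>' as output); the run name and epoch number are hypothetical and must correspond to an actual training run:

# Hypothetical run name and epoch; exports the model_00028.ckpt* files and
# the trimmed dataset.pickle to ../trained_models/my_pretrained_model.
prepare_pretrained_model_for_restoring(
    output_folder_name='en_2017-01-01_00-00-00-000000',
    epoch_number=28,
    model_name='my_pretrained_model',
    delete_token_mappings=False)  # keep token mappings and their embeddings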
Example #4
    def _create_stats_graph_folder(self, parameters):
        # Initialize stats_graph_folder
        experiment_timestamp = utils.get_current_time_in_miliseconds()
        dataset_name = utils.get_basename_without_extension(
            parameters['dataset_text_folder'])
        model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
        utils.create_folder_if_not_exists(parameters['output_folder'])
        stats_graph_folder = os.path.join(
            parameters['output_folder'],
            model_name)  # Folder in which to save the graphs
        utils.create_folder_if_not_exists(stats_graph_folder)
        return stats_graph_folder, experiment_timestamp
Example #5
    def predict(self, text):
        """
        Predict

        Args:
            text (str): Description.
        """
        self.prediction_count += 1

        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join('.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data
        for filepath in glob.glob(os.path.join(self.parameters['dataset_text_folder'],
            '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(self.parameters['dataset_text_folder'],
            dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(dataset_brat_deploy_folder,
            'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters,
            dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels
        prediction_output = train.prediction_step(self.sess, self.modeldata,
            dataset_type, self.model, self.transition_params_trained,
            self.prediction_count, self.parameters)

        return prediction_output
Example #6
def output_brat(output_filepaths,
                dataset_brat_folders,
                stats_graph_folder,
                overwrite=False):
    # Output brat files
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        if dataset_type not in output_filepaths.keys():
            continue
        brat_output_folder = os.path.join(stats_graph_folder, 'brat',
                                          dataset_type)
        utils.create_folder_if_not_exists(brat_output_folder)
        conll_to_brat(output_filepaths[dataset_type],
                      output_filepaths[dataset_type],
                      dataset_brat_folders[dataset_type],
                      brat_output_folder,
                      overwrite=overwrite)
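A minimal usage sketch; the two dictionaries are hypothetical, but they mirror the keys the loop above iterates over (one entry per dataset split):

import os

# Hypothetical filepaths; only splits present in output_filepaths are converted.
output_filepaths = {'valid': os.path.join('output', 'valid.txt')}
dataset_brat_folders = {'valid': os.path.join('data', 'brat', 'valid')}
output_brat(output_filepaths, dataset_brat_folders,
            stats_graph_folder='stats', overwrite=True)
# -> brat .txt and .ann files under stats/brat/valid/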
Example #7
    def predict(self, text):
        """
        Predict

        Args:
            text (str): Description.
        """
        self.prediction_count += 1

        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder,
            'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)

        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Print and output result
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert (text == text2)
        return entities
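Judging from the get_entities_from_brat call at the end, the return value is a list of entity dictionaries with brat-style character offsets. A hypothetical call, where nn stands for a fitted instance of the surrounding class:

# Hypothetical usage; 'nn' is a trained instance of this class.
entities = nn.predict('John Smith flew to Paris.')
for entity in entities:
    # Each entity carries at least a label, the surface text, and
    # character offsets into the input string.
    print(entity['label'], entity['text'], entity['start'], entity['end'])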
Example #8
    def fit(self):
        """
        Fit the model.
        """
        parameters = self.parameters
        conf_parameters = self.conf_parameters
        dataset_filepaths = self.dataset_filepaths
        modeldata = self.modeldata
        dataset_brat_folders = self.dataset_brat_folders
        sess = self.sess
        model = self.model
        transition_params_trained = self.transition_params_trained
        stats_graph_folder, experiment_timestamp = self._create_stats_graph_folder(
            parameters)

        # Initialize and save execution details
        start_time = time.time()
        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        model_folder = os.path.join(stats_graph_folder, 'model')
        utils.create_folder_if_not_exists(model_folder)
        with open(os.path.join(model_folder, 'parameters.ini'),
                  'w') as parameters_file:
            conf_parameters.write(parameters_file)
        pickle.dump(modeldata,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

        tensorboard_log_folder = os.path.join(stats_graph_folder,
                                              'tensorboard_logs')
        utils.create_folder_if_not_exists(tensorboard_log_folder)
        tensorboard_log_folders = {}
        for dataset_type in dataset_filepaths.keys():
            tensorboard_log_folders[dataset_type] = os.path.join(
                stats_graph_folder, 'tensorboard_logs', dataset_type)
            utils.create_folder_if_not_exists(
                tensorboard_log_folders[dataset_type])

        # Instantiate the writers for TensorBoard
        writers = {}
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type] = tf.summary.FileWriter(
                tensorboard_log_folders[dataset_type], graph=sess.graph)

        # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
        embedding_writer = tf.summary.FileWriter(model_folder)

        embeddings_projector_config = projector.ProjectorConfig()
        tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
        token_list_file_path = os.path.join(model_folder,
                                            'tensorboard_metadata_tokens.tsv')
        tensorboard_token_embeddings.metadata_path = os.path.relpath(
            token_list_file_path, '.')

        tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
        tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
        character_list_file_path = os.path.join(
            model_folder, 'tensorboard_metadata_characters.tsv')
        tensorboard_character_embeddings.metadata_path = os.path.relpath(
            character_list_file_path, '.')

        projector.visualize_embeddings(embedding_writer,
                                       embeddings_projector_config)

        # Write metadata for TensorBoard embeddings
        token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
        for token_index in range(modeldata.vocabulary_size):
            token_list_file.write('{0}\n'.format(
                modeldata.index_to_token[token_index]))
        token_list_file.close()

        character_list_file = codecs.open(character_list_file_path, 'w',
                                          'UTF-8')
        for character_index in range(modeldata.alphabet_size):
            if character_index == modeldata.PADDING_CHARACTER_INDEX:
                character_list_file.write('PADDING\n')
            else:
                character_list_file.write('{0}\n'.format(
                    modeldata.index_to_character[character_index]))
        character_list_file.close()

        # Start the training + evaluation loop. Each iteration corresponds to one epoch.
        # bad_counter: number of epochs with no improvement on the validation set in terms of F1 score
        bad_counter = 0
        previous_best_valid_f1_score = 0
        epoch_number = -1
        try:
            while True:
                step = 0
                epoch_number += 1
                print('\nStarting epoch {0}'.format(epoch_number))

                epoch_start_time = time.time()

                if epoch_number != 0:
                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(modeldata.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, modeldata, sequence_number, model,
                            parameters)
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done'.format(
                                step / len(sequence_numbers) * 100),
                                  end='\r',
                                  flush=True)

                epoch_elapsed_training_time = time.time() - epoch_start_time
                print('Training completed in {0:.2f} seconds'.format(
                    epoch_elapsed_training_time),
                      flush=True)

                y_pred, y_true, output_filepaths = train.predict_labels(
                    sess, model, transition_params_trained, parameters,
                    modeldata, epoch_number, stats_graph_folder,
                    dataset_filepaths)

                # Evaluate model: save and plot results
                evaluate.evaluate_model(results, modeldata, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                if parameters['use_pretrained_model'] and not parameters[
                        'train_model']:
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder)
                    break

                # Save model
                model.saver.save(
                    sess,
                    os.path.join(model_folder,
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                # Save TensorBoard logs
                summary = sess.run(model.summary_op, feed_dict=None)
                writers['train'].add_summary(summary, epoch_number)
                writers['train'].flush()
                utils.copytree(writers['train'].get_logdir(), model_folder)

                # Early stop
                valid_f1_score = results['epoch'][epoch_number][0]['valid'][
                    'f1_score']['micro']
                if valid_f1_score > previous_best_valid_f1_score:
                    bad_counter = 0
                    previous_best_valid_f1_score = valid_f1_score
                    conll_to_brat.output_brat(output_filepaths,
                                              dataset_brat_folders,
                                              stats_graph_folder,
                                              overwrite=True)
                    self.transition_params_trained = transition_params_trained
                else:
                    bad_counter += 1
                print(
                    "The last {0} epochs have not shown improvements on the validation set."
                    .format(bad_counter))

                if bad_counter >= parameters['patience']:
                    print('Early Stop!')
                    results['execution_details']['early_stop'] = True
                    break

                if epoch_number >= parameters['maximum_number_of_epochs']:
                    break

        except KeyboardInterrupt:
            results['execution_details']['keyboard_interrupt'] = True
            print('Training interrupted')

        print('Finishing the experiment')
        end_time = time.time()
        results['execution_details']['train_duration'] = end_time - start_time
        results['execution_details']['train_end'] = end_time
        evaluate.save_results(results, stats_graph_folder)
        for dataset_type in dataset_filepaths.keys():
            writers[dataset_type].close()
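The loop above reads only a handful of entries from parameters directly; a hypothetical minimal dictionary covering just those keys might look like this (real runs load many more options from parameters.ini):

# Hypothetical values; only the keys fit() itself indexes are shown.
parameters = {
    'patience': 10,                   # epochs without valid F1 improvement before early stop
    'maximum_number_of_epochs': 100,  # hard cap on the number of epochs
    'use_pretrained_model': False,    # if True and train_model is False,
    'train_model': True,              #   fit() predicts once and stops
}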
Example #9
def conll_to_brat(conll_input_filepath,
                  conll_output_filepath,
                  brat_original_folder,
                  brat_output_folder,
                  overwrite=False):
    '''
    Converts the CoNLL file at conll_input_filepath to brat annotations and outputs them to brat_output_folder,
    with reference to the existing text files in brat_original_folder.
    If brat_original_folder does not exist or does not contain any text file, the text files are generated
    from the CoNLL file, and the CoNLL file is updated with filenames and token offsets accordingly.

    conll_input_filepath: path to the CoNLL file to convert to brat annotations
    conll_output_filepath: path to the output CoNLL file, with filenames and offsets that are compatible with the brat annotations
    brat_original_folder: folder containing the original brat-formatted .txt (and .ann) files.
                          The .txt files are used to check that the token offsets match and to generate the annotations from the CoNLL file.
    brat_output_folder: folder to output the text and brat annotations to.
                        The .txt files are copied from brat_original_folder to brat_output_folder.
    '''
    verbose = False
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    print("Formatting {0} set from CONLL to BRAT... ".format(dataset_type),
          end='')

    # If brat_original_folder does not exist or does not contain any text file
    if not os.path.exists(brat_original_folder) or len(
            glob.glob(os.path.join(brat_original_folder, '*.txt'))) == 0:
        assert (conll_input_filepath != conll_output_filepath)
        generate_reference_text_file_for_conll(conll_input_filepath,
                                               conll_output_filepath,
                                               brat_original_folder)

    utils.create_folder_if_not_exists(brat_output_folder)
    conll_file = codecs.open(conll_output_filepath, 'r', 'UTF-8')

    previous_token_label = 'O'
    previous_filename = ''
    text_filepath = ''
    text = ''
    entity_id = 1
    entities = []
    entity = {}
    for line in conll_file:
        line = line.strip().split(' ')
        # New sentence
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            # Add the last entity
            if entity != {}:
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue

        filename = str(line[1])
        # New file
        if filename != previous_filename:
            output_entities(brat_output_folder,
                            previous_filename,
                            entities,
                            text_filepath,
                            text,
                            overwrite=overwrite)
            text_filepath = os.path.join(brat_original_folder,
                                         '{0}.txt'.format(filename))
            with codecs.open(text_filepath, 'r', 'UTF-8') as f:
                text = f.read()
            previous_token_label = 'O'
            previous_filename = filename
            entity_id = 1
            entities = []
            entity = {}

        label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER
        if label == 'O':
            # Previous entity ended
            if previous_token_label != 'O':
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue

        token = {}
        token['text'] = str(line[0])
        token['start'] = int(line[2])
        token['end'] = int(line[3])
        # check that the token text matches the original
        if token['text'] != text[token['start']:token['end']].replace(
                ' ', '-'):
            print("Warning: conll and brat text do not match.")
            print("\tCONLL: {0}".format(token['text']))
            print("\tBRAT : {0}".format(text[token['start']:token['end']]))
        token['label'] = label[2:]

        if label[:2] == 'B-':
            if previous_token_label != 'O':
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
            # Start a new entity
            entity = token
        elif label[:2] == 'I-':
            # Entity continued
            if previous_token_label == token['label']:
                # if there is no newline between the entity and the token
                if '\n' not in text[entity['end']:token['start']]:
                    # Update entity
                    entity['text'] = entity['text'] + ' ' + token['text']
                    entity['end'] = token['end']
                else:  # newline between the entity and the token
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity_id += 1
                    # Start a new entity
                    entity = token
            elif previous_token_label != 'O':
                # TODO: count BI or II incompatibility
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                # Start new entity
                entity = token
            else:  # previous_token_label == 'O'
                # TODO: count OI incompatibility
                # Start new entity
                entity = token
        previous_token_label = token['label']
    output_entities(brat_output_folder,
                    previous_filename,
                    entities,
                    text_filepath,
                    text,
                    overwrite=overwrite)
    conll_file.close()
    print('Done.')
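A minimal call sketch; the paths are hypothetical. If brat_original_folder is missing or empty, reference text files are generated first and conll_output_filepath receives the offset-annotated lines the parsing loop above expects:

import os

# Hypothetical paths. conll_output_filepath ends up with single-space-separated
# lines of the form 'token filename start end ... BIO-label', e.g.
#   John valid_text_00000 0 4 B-PER
#   Smith valid_text_00000 5 10 I-PER
conll_to_brat('valid.txt',
              'valid_with_offsets.txt',
              brat_original_folder=os.path.join('data', 'brat', 'valid'),
              brat_output_folder=os.path.join('output', 'brat', 'valid'),
              overwrite=True)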