def xml_to_brat(input_folder, output_folder, overwrite=True):
    print('input_folder: {0}'.format(input_folder))
    start_time = time.time()
    if overwrite:
        shutil.rmtree(output_folder, ignore_errors=True)
    utils.create_folder_if_not_exists(output_folder)

    for input_filepath in sorted(glob.glob(os.path.join(input_folder, '*.xml'))):
        filename = utils.get_basename_without_extension(input_filepath)
        output_text_filepath = os.path.join(output_folder, '{0}.txt'.format(filename))
        xmldoc = xml.etree.ElementTree.parse(input_filepath).getroot()
        # Get text
        text = xmldoc.findtext('TEXT')
        with codecs.open(output_text_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Get PHI tags
        tags = xmldoc.findall('TAGS')[0] # [0] because there is only one <TAGS>...</TAGS>
        entities = []
        for tag in tags:
            entity = {}
            entity['label'] = tag.get('TYPE')
            entity['text'] = tag.get('text')
            entity['start'] = int(tag.get('start'))
            entity['end'] = int(tag.get('end'))
            entities.append(entity)
        output_entities(output_folder, filename, entities, output_text_filepath, text, overwrite=overwrite)

    time_spent = time.time() - start_time
    print("Time spent formatting: {0:.2f} seconds".format(time_spent))
Example No. 2
def generate_reference_text_file_for_conll(conll_input_filepath,
                                           conll_output_filepath, text_folder):
    '''
    Generates reference text files and adds the corresponding filename and token offsets to the CONLL file.

    conll_input_filepath: path to a CONLL-formatted file without filenames and token offsets
    conll_output_filepath: path to write the new CONLL file with filenames and token offsets to
    text_folder: folder to write the reference text files to
    '''
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    conll_file = codecs.open(conll_input_filepath, 'r', 'UTF-8')
    utils.create_folder_if_not_exists(text_folder)
    text = ''
    new_conll_string = ''
    character_index = 0
    document_count = 0
    text_base_filename = '{0}_text_{1}'.format(dataset_type,
                                               str(document_count).zfill(5))
    for line in conll_file:
        split_line = line.strip().split(' ')
        # New document
        if '-DOCSTART-' in split_line[0]:
            new_conll_string += line
            if len(text) != 0:
                with codecs.open(
                        os.path.join(text_folder,
                                     '{0}.txt'.format(text_base_filename)),
                        'w', 'UTF-8') as f:
                    f.write(text)
            text = ''
            character_index = 0
            document_count += 1
            text_base_filename = '{0}_text_{1}'.format(
                dataset_type,
                str(document_count).zfill(5))
            continue
        # New sentence
        elif len(split_line) == 0 or len(split_line[0]) == 0:
            new_conll_string += '\n'
            if text != '':
                text += '\n'
                character_index += 1
            continue
        token = split_line[0]
        start = character_index
        end = start + len(token)
        text += token + ' '
        character_index += len(token) + 1
        new_conll_string += ' '.join(
            [token, text_base_filename,
             str(start), str(end)] + split_line[1:]) + '\n'
    if len(text) != 0:
        with codecs.open(
                os.path.join(text_folder,
                             '{0}.txt'.format(text_base_filename)), 'w',
                'UTF-8') as f:
            f.write(text)
    conll_file.close()

    with codecs.open(conll_output_filepath, 'w', 'UTF-8') as f:
        f.write(new_conll_string)
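
To make the offset bookkeeping concrete, here is a tiny worked example of the scheme above: tokens are joined by single spaces, each output line gains the reference filename and [start, end) character offsets, and the trailing columns (the original CONLL fields, shown here as placeholder labels) are carried over unchanged:

tokens = ['John', 'lives', 'in', 'Paris']
labels = ['B-PER', 'O', 'O', 'B-LOC']  # stand-ins for the original CONLL columns
text, character_index, lines = '', 0, []
for token, label in zip(tokens, labels):
    start, end = character_index, character_index + len(token)
    lines.append(' '.join([token, 'train_text_00000', str(start), str(end), label]))
    text += token + ' '
    character_index += len(token) + 1

print(text)       # 'John lives in Paris '
print(lines[-1])  # 'Paris train_text_00000 14 19 B-LOC'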
Example No. 3
def format_for_ann_old(dataset_base_filename, split):
    '''
    version without any dictionary or feature
    '''
    print("Started formatting for ann")
    start_time = time.time()
    filepaths = utils_deid.get_original_dataset_filepaths(dataset_base_filename, split=split)
    output_folder = os.path.join('ann', 'data', dataset_base_filename, 'stanford', split)
    create_folder_if_not_exists(output_folder)
    
    for dataset_type in filepaths:
        output_filepath = os.path.join(output_folder, '{0}.txt'.format(dataset_type))
        open(output_filepath, 'w').close()
        for filepath in filepaths[dataset_type]:
            print("filepath: {0}".format(filepath))
            xmldoc = xml.etree.ElementTree.parse(filepath).getroot()
            # Get text
            text = xmldoc.findtext('TEXT')
            
            # Get stanford output
            stanford_output = get_stanford_annotations(text, annotators='tokenize,ssplit')
            
            # Get PHI tags
            tags = xmldoc.findall('TAGS')[0] # [0] because there is only one <TAGS>...</TAGS>
            phis = []
            for tag in tags:
                phi = {}
                phi['main_type'] = tag.tag
                phi['type'] = tag.get('TYPE')
                phi['text'] = tag.get('text')
                phi['start'] = int(tag.get('start'))
                phi['end'] = int(tag.get('end'))
                phis.append(phi)
            
            xml_filename = utils.get_basename_without_extension(filepath)
            
            convert_stanford_output_to_ann_txt_old(output_filepath, xml_filename, stanford_output, phis)
    time_spent = time.time() - start_time
    print("Time spent formatting for ann: {0:.2f} seconds".format(time_spent))
Example No. 4
def check_validity_of_conll_bioes(bioes_filepath):
    dataset_type = utils.get_basename_without_extension(bioes_filepath).split(
        '_')[0]
    print("Checking validity of CONLL BIOES format... ".format(dataset_type),
          end='')

    input_conll_file = codecs.open(bioes_filepath, 'r', 'UTF-8')
    labels_bioes = []
    labels_bio = []
    for line in input_conll_file:
        split_line = line.strip().split(' ')
        # New sentence
        if len(split_line) == 0 or len(
                split_line[0]) == 0 or '-DOCSTART-' in split_line[0]:
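            # Note: check_bio_bioes_compatibility is not shown in this snippet;
            # it is assumed to clear labels_bio and labels_bioes in place after
            # a successful check, so they restart empty for the next sentence.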
            if check_bio_bioes_compatibility(labels_bio, labels_bioes):
                continue
            return False
        label_bioes = split_line[-1]
        label_bio = split_line[-2]
        labels_bioes.append(label_bioes)
        labels_bio.append(label_bio)
    input_conll_file.close()
    if check_bio_bioes_compatibility(labels_bio, labels_bioes):
        print("Done.")
        return True
    return False
Example No. 5
def convert_conll_from_bio_to_bioes(input_conll_filepath, output_conll_filepath):
    if os.path.exists(output_conll_filepath):
        if check_validity_of_conll_bioes(output_conll_filepath):
            return
    dataset_type = utils.get_basename_without_extension(input_conll_filepath).split('_')[0]
    print("Converting CONLL from BIO to BIOES format... ".format(dataset_type), end='')
    input_conll_file = codecs.open(input_conll_filepath, 'r', 'UTF-8')
    output_conll_file = codecs.open(output_conll_filepath, 'w', 'UTF-8')

    labels = []
    split_lines = []
    for line in input_conll_file:
        split_line = line.strip().split(' ')
        # New sentence
        if len(split_line) == 0 or len(split_line[0]) == 0 or '-DOCSTART-' in split_line[0]:
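            # output_conll_lines_with_bioes is defined elsewhere; it is assumed
            # to write one BIOES-tagged line per buffered token and to clear
            # split_lines and labels in place for the next sentence.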
            output_conll_lines_with_bioes(split_lines, labels, output_conll_file)
            output_conll_file.write(line)
            continue
        label = split_line[-1]    
        labels.append(label)
        split_lines.append(split_line)
    output_conll_lines_with_bioes(split_lines, labels, output_conll_file)
    
    input_conll_file.close()
    output_conll_file.close()
    print("Done.")
Example No. 6
def predict(text):
    parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
    stats_graph_folder, _ = utils.create_stats_graph_folder(parameters)
    stats_graph_folder, _ = utils.create_stats_graph_folder(parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    ### Delete all deployment data
    for filepath in glob.glob(
            os.path.join(parameters['dataset_text_folder'],
                         '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(
        parameters['dataset_text_folder'], dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder,
        'temp_{0}.txt'.format(str(prediction_count).zfill(5)))
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)
    ### Update deploy filepaths
    dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(
        parameters, dataset_types=[dataset_type])
    ### Update the dataset for the new deploy set
    dataset.update_dataset(dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step(
        sess, dataset, dataset_type, model, transition_params_trained,
        stats_graph_folder, prediction_count, dataset_filepaths,
        parameters['tagging_format'], parameters['main_evaluation_mode'])
    _, _, output_filepaths[dataset_type] = prediction_output
    conll2brat.output_brat(output_filepaths,
                           dataset_brat_folders,
                           stats_graph_folder,
                           overwrite=True)

    # Print and output result
    text_filepath = os.path.join(
        stats_graph_folder, 'brat', 'deploy',
        os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(
        stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
            utils.get_basename_without_extension(
                dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(text_filepath,
                                                        annotation_filepath,
                                                        verbose=True)
    assert (text == text2)
    return entities
Example No. 7
def _create_stats_graph_folder(self, parameters):
    # Initialize stats_graph_folder
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
    model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
    utils.create_folder_if_not_exists(parameters['output_folder'])
    stats_graph_folder = os.path.join(parameters['output_folder'], model_name)  # Folder where to save graphs
    utils.create_folder_if_not_exists(stats_graph_folder)
    return stats_graph_folder, experiment_timestamp
Example No. 8
def check_compatibility_between_conll_and_brat_text(conll_filepath,
                                                    brat_folder):
    '''
    Check that the token offsets in the CONLL file match the brat .txt files.

    conll_filepath: path to the CONLL file
    brat_folder: folder containing the .txt (and .ann) files that are formatted according to brat
    '''
    dataset_type = utils.get_basename_without_extension(conll_filepath)
    print("Checking compatibility between CONLL and BRAT for {0} set ... ".
          format(dataset_type),
          end='')
    conll_file = codecs.open(conll_filepath, 'r', 'UTF-8')

    previous_filename = ''
    for line in conll_file:
        line = line.strip().split(' ')
        # New sentence
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            continue

        filename = str(line[1])
        # New file
        if filename != previous_filename:
            text_filepath = os.path.join(brat_folder,
                                         '{0}.txt'.format(filename))
            try:
                text = utils.read_file(text_filepath)
            except IOError:
                print('Could not read {0}, needed for line: {1}'.format(text_filepath, line))
                raise
            previous_filename = filename

        label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER

        token = {}
        token['text'] = str(line[0])
        token['start'] = int(line[2])
        token['end'] = int(line[3])

        # check that the token text matches the original
        if token['text'] != text[token['start']:token['end']]:
            print("Warning: conll and brat text do not match.")
            print("\tCONLL: {0}".format(token['text']))
            print("\tBRAT : {0}".format(text[token['start']:token['end']]))
            if token['text'] != text[token['start']:token['end']].replace(
                    ' ', '-'):
                raise AssertionError("CONLL and BRAT files are incompatible.")

    print("Done.")
Example No. 9
def _create_stats_graph_folder(self, parameters):
    '''
    Creates the output/en folder that holds the files generated during training.
    Returns
    (path to the output folder, timestamp of the run)
    '''
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
    model_name = '{0}_{1}'.format(dataset_name, experiment_timestamp)
    utils.create_folder_if_not_exists(parameters['output_folder'])
    stats_graph_folder = os.path.join(parameters['output_folder'], model_name)  # Folder where to save graphs
    utils.create_folder_if_not_exists(stats_graph_folder)
    return stats_graph_folder, experiment_timestamp
Example No. 10
def generate_reference_text_file_for_conll(conll_input_filepath, conll_output_filepath, text_folder):
    '''
    From the original CONLL file, regenerates the text files (placed in text_folder)
    and the reference CONLL file (conll_output_filepath).
    Structure of conll_output_filepath:
    - DOCSTART: start of a document
    - <token> <filename> <startIndex> <endIndex> <3 standard CONLL labels>
    '''
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)  # filename without its extension
    conll_file = codecs.open(conll_input_filepath, 'r', 'UTF-8')
    utils.create_folder_if_not_exists(text_folder)
    text = ''
    new_conll_string = ''
    character_index = 0
    document_count = 0
    text_base_filename = '{0}_text_{1}'.format(dataset_type, str(document_count).zfill(5))  # zfill pads the document counter to 5 digits
    for line in conll_file:
        split_line = line.strip().split(' ')
        # Start of a new document: the line begins with '-DOCSTART-'
        if '-DOCSTART-' in split_line[0]:
            new_conll_string += line
            if len(text) != 0:
                with codecs.open(os.path.join(text_folder, '{0}.txt'.format(text_base_filename)), 'w', 'UTF-8') as f:
                    f.write(text)
            text = ''
            character_index = 0
            document_count += 1
            text_base_filename = '{0}_text_{1}'.format(dataset_type, str(document_count).zfill(5))
            continue
        # Start of a new sentence
        elif len(split_line) == 0 or len(split_line[0]) == 0:
            new_conll_string += '\n'
            if text != '':
                text += '\n'
                character_index += 1
            continue
        token = split_line[0]
        start = character_index
        end = start + len(token)
        text += token + ' '
        character_index += len(token) + 1
        new_conll_string += ' '.join([token, text_base_filename, str(start), str(end)] + split_line[1:]) + '\n'
    if len(text) != 0:
        with codecs.open(os.path.join(text_folder, '{0}.txt'.format(text_base_filename)), 'w', 'UTF-8') as f:
            f.write(text)
    conll_file.close()

    with codecs.open(conll_output_filepath, 'w', 'UTF-8') as f:
        f.write(new_conll_string)
Example No. 11
def main():
    '''
    Collect results.json files from the output subfolders and export them to
    one CSV file per dataset and to a MySQL table.
    '''
    stats_graph_folder = os.path.join('..', 'output')

    # Get a sorted list of all subdirectories in the output directory. Not recursive.
    subfolders = sorted(os.listdir(stats_graph_folder), reverse=True)

    # Recursive
    #subfolders = [x[0] for x in os.walk(stats_graph_folder)][1:]

    # Parameters
    #metrics = ['accuracy_score', 'f1_score']
    metrics = ['f1_score', 'f1_conll']
    dataset_types = ['train', 'valid', 'test']
    execution_details = [
        'num_epochs', 'train_duration', 'keyboard_interrupt', 'early_stop'
    ]
    # It's good to put the important fields (for your experiments) first,
    # so that it appears right next to the test f1 score.

    fields_of_interest = '''dataset_text_filepath all_emb pre_emb char_dim  char_bidirect character_cnn_filter_height character_cnn_number_of_filters
    word_dim using_token_lstm word_lstm_dim word_bidirect experiment_name
    using_token_cnn token_cnn_filter_height token_cnn_number_of_filters using_character_lstm using_character_cnn patience  char_lstm_dim
    crf dropout lr_method training_set_size'''.replace('\n', '').split(' ')
    fields_of_interest = list(filter(None, fields_of_interest))  # drop empty strings; list() keeps it indexable

    result_tables = {}
    print('subfolders: {0}'.format(subfolders))
    # Define column_order, i.e. how the result table is presented
    column_order = ['dataset_name', 'time_stamp']
    for metric in metrics:
        for dataset_type in dataset_types:
            column_order.append('{0}_{1}'.format(dataset_type, metric))
        column_order.append('{0}_{1} (based on valid)'.format('test', metric))

    column_order.extend(fields_of_interest[:3])

    for metric in metrics:
        column_order.append(
            'best_epoch_for_{0} (based on valid)'.format(metric))
    column_order.extend(execution_details)

    column_order.extend(fields_of_interest[3:])

    print('fields_of_interest: {0}'.format(fields_of_interest))
    print('column_order: {0}'.format(column_order))

    for subfolder in subfolders:
        result_row = {}
        result_filepath = os.path.join(stats_graph_folder, subfolder,
                                       'results.json')
        if not os.path.isfile(result_filepath): continue
        print('result_filepath: {0}'.format(result_filepath))
        try:
            result_json = json.load(open(result_filepath, 'r'))
        except ValueError:
            print('This file is skipped since it is in use or corrupted.')
            continue

        # Include time stamp of the experiments
        result_row['time_stamp'] = result_json['execution_details'][
            'time_stamp']

        for field_of_interest in fields_of_interest:
            if field_of_interest in result_json['model_options']:
                if field_of_interest == 'pre_emb':
                    result_row[field_of_interest] = os.path.basename(
                        result_json['model_options'][field_of_interest])
                else:
                    result_row[field_of_interest] = result_json[
                        'model_options'][field_of_interest]

        for execution_detail in execution_details:
            try:
                result_row[execution_detail] = result_json[
                    'execution_details'][execution_detail]
            except KeyError:
                result_row[execution_detail] = 'NULL'

        for metric in metrics:
            for dataset_type in dataset_types:
                result_row['{0}_{1}'.format(
                    dataset_type, metric)] = result_json[dataset_type].get(
                        'best_{0}'.format(metric), 'NULL')
                if dataset_type == 'test':
                    result_row['{0}_{1} (based on valid)'.format(
                        dataset_type, metric)] = result_json[dataset_type].get(
                            'best_{0}_based_on_valid'.format(metric), 'NULL')
                elif dataset_type == 'valid':
                    result_row['best_epoch_for_{0} (based on valid)'.format(
                        metric)] = result_json[dataset_type].get(
                            'epoch_for_best_{0}'.format(metric), 'NULL')

        # Save row in table: one table per data set
        dataset_name = utils.get_basename_without_extension(
            result_row['dataset_text_filepath'])
        result_row['dataset_name'] = dataset_name
        if dataset_name not in result_tables:
            result_tables[dataset_name] = []

        result_row_ordered = []
        for column_name in column_order:
            if column_name in result_row:
                result_row_ordered.append(result_row[column_name])
            else:
                result_row_ordered.append('NULL')

        result_tables[dataset_name].append(result_row_ordered)

    print('result_tables: {0}'.format(result_tables))


    import MySQLdb as mdb
    connection = mdb.connect('128.52.165.241', 'tc',
                             open('database_password.txt', 'r').readline(),
                             'tc')
    cursor = connection.cursor()

    for dataset_name, dataset_results in result_tables.items():
        with open(
                os.path.join(stats_graph_folder,
                             'results_{0}.csv'.format(dataset_name)),
                'wb') as testfile:
            csv_writer = csv.writer(testfile)
            clean_column_names = list(map(clean_column_name, column_order))  # list() so .index() works below
            csv_writer.writerow(clean_column_names)
            for row in dataset_results:
                csv_writer.writerow(row)

                # Convert row values to some importable string
                values = ''
                for value_number, value in enumerate(row):
                    if value_number > 0:
                        values += ','
                    if isinstance(value, bool):
                        value = '1' if value else '0'
                    # http://stackoverflow.com/questions/3501382/checking-whether-a-variable-is-an-integer-or-not
                    if not isinstance(value, (int, long)) and value != 'NULL':  # long: this script targets Python 2
                        value = '"{0}"'.format(value)
                    else:
                        value = str(value)
                    values += value

                # A train_duration of 0 means the training was interrupted.
                # The skip logic below is intentionally disabled.
                train_duration = row[clean_column_names.index('train_duration')]
                keyboard_interrupt = row[clean_column_names.index('keyboard_interrupt')]
                '''
                if train_duration == 'NULL' and keyboard_interrupt == '0' or train_duration == '0':
                    print('The experiment has train_duration = {0}, so we skip it.'.format(train_duration))
                    continue
                '''

                # Make sure the experiment isn't already in the database
                time_stamp = row[clean_column_names.index('time_stamp')]
                sql = 'SELECT COUNT(*) FROM tc.results_neurodeid WHERE time_stamp = "{0}"'.format(time_stamp)
                cursor.execute(sql)
                existing_count = cursor.fetchone()  # renamed so the loop's row is not shadowed
                if existing_count[0] >= 1:
                    print(
                        'The experiment with timestamp {0} is already in the database, so we skip it.'
                        .format(time_stamp))
                    continue
                if time_stamp < '2016-08-17_18-20-05-836274':
                    print(
                        'The experiment with timestamp {0} is too old, so we skip it.'
                        .format(time_stamp))
                    continue

                sql = 'INSERT INTO tc.results_neurodeid ({0}) VALUES ({1})'.format(
                    ','.join(clean_column_names), values)

                print('sql: {0}'.format(sql))
                cursor.execute(sql)
                connection.commit()

    connection.commit()
    connection.close()
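
clean_column_name is referenced above but not defined in this snippet; a plausible sketch, assuming all it has to do is turn the display column names into valid MySQL identifiers:

def clean_column_name(column_name):
    cleaned = column_name.lower().replace(' ', '_')
    return ''.join(c for c in cleaned if c.isalnum() or c == '_')

print(clean_column_name('test_f1_score (based on valid)'))
# test_f1_score_based_on_valid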
Example No. 12
def conll_to_brat(conll_input_filepath, conll_output_filepath, brat_original_folder, brat_output_folder,
                  overwrite=False):
    '''
    Convert the CONLL file at conll_input_filepath to brat annotations and output them to brat_output_folder,
    with reference to the existing text files in brat_original_folder.
    If brat_original_folder does not exist or contains no text file, the text files are generated from the
    CONLL file, and the CONLL file is updated with filenames and token offsets accordingly.

    conll_input_filepath: path to the CONLL file to convert to brat annotations
    conll_output_filepath: path to the output CONLL file with filenames and offsets that are compatible with the brat annotations
    brat_original_folder: folder containing the original .txt (and .ann) files formatted according to brat.
                          The .txt files are used to check that the token offsets match and to generate the annotations from CONLL.
    brat_output_folder: folder to output the text and brat annotations to.
                        The .txt files are copied from brat_original_folder to brat_output_folder.
    '''
    verbose = False
    dataset_type = utils.get_basename_without_extension(conll_input_filepath)
    print("Formatting {0} set from CONLL to BRAT... ".format(dataset_type), end='')

    # If brat_original_folder does not exist or contains no text file
    if not os.path.exists(brat_original_folder) or len(glob.glob(os.path.join(brat_original_folder, '*.txt'))) == 0:
        assert (conll_input_filepath != conll_output_filepath)
        generate_reference_text_file_for_conll(conll_input_filepath, conll_output_filepath, brat_original_folder)

    utils.create_folder_if_not_exists(brat_output_folder)
    conll_file = codecs.open(conll_output_filepath, 'r', 'latin-1', errors='replace')

    previous_token_label = 'O'
    previous_filename = ''
    text_filepath = ''
    text = ''
    entity_id = 1
    entities = []
    entity = {}
    line_count = 0
    for line in conll_file:
        line = line.strip().split(' ')
        # New sentence
        if len(line) == 0 or len(line[0]) == 0 or '-DOCSTART-' in line[0]:
            # Add the last entity 
            if entity != {}:
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue

        filename = str(line[1])
        # New file
        if filename != previous_filename:
            output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=overwrite)
            text_filepath = os.path.join(brat_original_folder, '{0}.txt'.format(filename))
            with codecs.open(text_filepath, 'r', 'latin-1', errors='replace') as f:
                text = f.read()
            previous_token_label = 'O'
            previous_filename = filename
            entity_id = 1
            entities = []
            entity = {}

        label = str(line[-1]).replace('_', '-')  # For LOCATION-OTHER
        if label == 'O':
            # Previous entity ended
            if previous_token_label != 'O':
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                entity = {}
            previous_token_label = 'O'
            continue

        token = {}
        token['text'] = str(line[0])
        token['start'] = int(line[2])
        token['end'] = int(line[3])
        # check that the token text matches the original
        if token['text'] != text[token['start']:token['end']].replace(' ', '-'):
            print("Warning: conll and brat text do not match.")
            print("\tCONLL: {0}".format(token['text']))
            print("\tBRAT : {0}".format(text[token['start']:token['end']]))
        token['label'] = label[2:]

        if label[:2] == 'B-':
            if previous_token_label != 'O':
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
            # Start a new entity
            entity = token
        elif label[:2] == 'I-':
            # Entity continued
            if previous_token_label == token['label']:
                # if there is no newline between the entity and the token
                if '\n' not in text[entity['end']:token['start']]:
                    # Update entity 
                    entity['text'] = entity['text'] + ' ' + token['text']
                    entity['end'] = token['end']
                else:  # newline between the entity and the token
                    # End the previous entity
                    if verbose: print("entity: {0}".format(entity))
                    entities.append(entity)
                    entity_id += 1
                    # Start a new entity
                    entity = token
            elif previous_token_label != 'O':
                # TODO: count BI or II incompatibility
                # End the previous entity
                if verbose: print("entity: {0}".format(entity))
                entities.append(entity)
                entity_id += 1
                # Start new entity
                entity = token
            else:  # previous_token_label == 'O'
                # TODO: count  OI incompatibility
                # Start new entity
                entity = token
        previous_token_label = token['label']
    output_entities(brat_output_folder, previous_filename, entities, text_filepath, text, overwrite=overwrite)
    conll_file.close()
    print('Done.')
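
output_entities is defined elsewhere in this module; here is a minimal sketch of the brat standoff lines it is assumed to write from the entity dicts built above (one text-bound annotation per entity: id, label, offsets, surface text, separated by tabs):

def write_ann_lines(entities, ann_file):
    for entity_id, entity in enumerate(entities, start=1):
        ann_file.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
            entity_id, entity['label'], entity['start'], entity['end'], entity['text']))

# {'label': 'PATIENT', 'start': 34, 'end': 42, 'text': 'John Doe'}
# becomes the .ann line: T1<TAB>PATIENT 34 42<TAB>John Doe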
Example No. 13
def main():

    parameters, conf_parameters = load_parameters()
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True, # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
            )

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder=os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder=os.path.join(output_folder, model_name) # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                conf_parameters.write(parameters_file)
            tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in dataset_filepaths.keys():
                tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
            pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in dataset_filepaths.keys():
                writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(model_folder) # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
            tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = codecs.open(token_list_file_path,'w', 'UTF-8')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = codecs.open(character_list_file_path,'w', 'UTF-8')
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
            character_list_file.close()


            # Initialize the model
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            bad_counter = 0 # number of epochs with no improvement on the validation test in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
            model_saver = tf.train.Saver(max_to_keep=parameters['maximum_number_of_epochs'])  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    step = 0
                    epoch_number += 1
                    print('\nStarting epoch {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    if parameters['use_pretrained_model'] and epoch_number == 0:
                        # Restore pretrained model parameters
                        transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)
                    elif epoch_number != 0:
                        # Train model: loop over all sequences of training set with shuffling
                        sequence_numbers=list(range(len(dataset.token_indices['train'])))
                        random.shuffle(sequence_numbers)
                        for sequence_number in sequence_numbers:
                            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, transition_params_trained, parameters)
                            step += 1
                            if step % 10 == 0:
                                print('Training {0:.2f}% done'.format(step/len(sequence_numbers)*100), end='\r', flush=True)

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained, parameters, dataset, epoch_number, stats_graph_folder, dataset_filepaths)

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number, epoch_start_time, output_filepaths, parameters)

                    if parameters['use_pretrained_model'] and not parameters['train_model']:
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
                        break

                    # Save model
                    model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)
                    writers['train'].flush()
                    utils.copytree(writers['train'].get_logdir(), model_folder)


                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if  valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                        conll_to_brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)
                    else:
                        bad_counter += 1
                    print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))

                    if bad_counter >= parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number >= parameters['maximum_number_of_epochs']: break


            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)
        #sess.close()  # release the session's resources
Example No. 14
def main():
    file_params = 'parameters_yelp_50k.ini'
    if len(sys.argv) > 1 and '.ini' in sys.argv[1]:
        file_params = sys.argv[1]

    # Load config
    parameters, conf_parameters = load_parameters(
        parameters_filepath=os.path.join('.', file_params))
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    #check_parameter_compatiblity(parameters, dataset_filepaths)

    if parameters['seed'] != -1:
        random.seed(parameters['seed'])

    # Create annotator
    annotator = stanford_corenlp_pywrapper.CoreNLP(
        configdict={
            'annotators': 'tokenize, ssplit',
            'ssplit.eolonly': True
        },
        corenlp_jars=[parameters['stanford_folder'] + '/*'])
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters, annotator)

    # Adapt train/valid/test to be multiple of batch_size
    for size in ['train_size', 'valid_size', 'test_size']:
        if parameters[size] % parameters['batch_size'] != 0:
            parameters[size] = int(
                parameters[size] /
                parameters['batch_size']) * parameters['batch_size']
            print('Changed {}'.format(size))

    # Set GPU device if more GPUs are specified
    if parameters['number_of_gpus'] > 1 and parameters['gpu_device'] != -1:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = parameters['gpu_device']

    # GPUs
    print(device_lib.list_local_devices())
    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={
                'CPU': 1,
                'GPU': parameters['number_of_gpus']
            },
            allow_soft_placement=
            True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            if parameters['seed'] != -1:
                tf.set_random_seed(parameters['seed'])
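                # Note: the rest of this function stays nested under the seed
                # check above, so nothing runs when seed == -1; this looks like
                # an indentation slip rather than intent.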

                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()

                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_folder'])
                model_name = '{0}_{1}'.format(
                    dataset_name, results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)

                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)

                with open(os.path.join(model_folder, file_params),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)

                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = SelfSent(dataset, parameters)

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                bad_counter = 0  # number of epochs with no improvement on the validation test
                previous_best_valid_accuracy = 0
                previous_best_test_accuracy = 0
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['maximum_number_of_epochs']
                )  # defaults to saving all variables
                epoch_number = -1
                try:
                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 0:
                            # Restore pretrained model parameters
                            dataset = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model_saver)
                            dataset.load_deploy(
                                os.path.join(parameters['dataset_folder'],
                                             '{0}.json'.format('deploy')),
                                parameters, annotator)
                            y_pred, y_true, output_filepaths, attentions = train.predict_labels(
                                sess,
                                model,
                                parameters,
                                dataset,
                                epoch_number,
                                stats_graph_folder,
                                dataset_filepaths,
                                only_deploy=True)
                            y_pred = y_pred['deploy']

                            with open(
                                    output_filepaths['deploy']
                                [:output_filepaths['deploy'].rfind('/') + 1] +
                                    'attention.txt',
                                    'w',
                                    encoding='utf-8') as fp:
                                # Compute attention
                                tokens_with_attentions = []
                                for sample_id in range(len(y_pred)):
                                    attention = attentions[int(
                                        sample_id / parameters['batch_size'])][
                                            sample_id %
                                            parameters['batch_size']]
                                    # Remove padded dimension
                                    attention = attention[:dataset.
                                                          token_lengths[
                                                              'deploy']
                                                          [sample_id]]

                                    # Save current attention
                                    fp.write("{}\t{:05.2f}\t".format(
                                        y_pred[sample_id][0],
                                        y_pred[sample_id][1]))
                                    fp.write(' '.join(dataset.tokens['deploy']
                                                      [sample_id]) + '\t')
                                    fp.write(' '.join(
                                        [str(a)
                                         for a in attention.flatten()]) + '\n')

                                    # Sum over columns (we combine all the annotation vectors)
                                    attention = np.sum(attention, axis=1)
                                    # Normalize to sum at 1
                                    attention = attention / np.linalg.norm(
                                        attention)

                                    # Keep only high confidence
                                    if y_pred[sample_id][1] >= parameters[
                                            'attention_visualization_conf']:
                                        tokens_with_attentions.append(
                                            (y_pred[sample_id][0],
                                             y_pred[sample_id][1],
                                             dataset.tokens['deploy']
                                             [sample_id], attention))

                            # Plot attention
                            utils_plots.visualize_attention(
                                tokens_with_attentions, dataset.unique_labels,
                                output_filepaths['deploy']
                                [:output_filepaths['deploy'].rfind('/') + 1],
                                parameters['attention_visualization_conf'])
                            break
                        elif epoch_number != 0:
                            total_loss, total_accuracy = train.train_step(
                                sess, dataset, model, parameters)
                            print('Mean loss: {:.2f}\tMean accuracy: {:.2f}'.
                                  format(np.mean(total_loss),
                                         100.0 * np.mean(total_accuracy)),
                                  flush=True)

                        epoch_elapsed_training_time = time.time(
                        ) - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths, _ = train.predict_labels(
                            sess, model, parameters, dataset, epoch_number,
                            stats_graph_folder, dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Early stop
                        valid_accuracy = results['epoch'][epoch_number][0][
                            'valid']['accuracy_score']
                        if valid_accuracy > previous_best_valid_accuracy:
                            bad_counter = 0
                            previous_best_valid_accuracy = valid_accuracy
                            previous_best_test_accuracy = results['epoch'][
                                epoch_number][0]['test']['accuracy_score']
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))
                        print("Best valid with test performances in epoch " +
                              str(epoch_number - bad_counter) +
                              ": {:05.2f}%\t{:05.2f}%".format(
                                  previous_best_valid_accuracy,
                                  previous_best_test_accuracy))
                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)

            sess.close()  # release the session's resources
Example No. 15
def main(argv=sys.argv):

    arguments = parse_arguments(argv[1:])

    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
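                # The Dataset object is pickled next to the model so the exact token,
                # character, and label vocabularies can be reloaded at prediction time.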
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                model = EntityLSTM(dataset, parameters)

                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(model_folder)

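                # Link each embedding tensor to a TSV metadata file so TensorBoard's
                # embedding projector can label the embedded points; metadata paths
                # are resolved relative to the log directory.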
                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                patience_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
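                # Random initial CRF transition scores; the two extra rows/columns
                # account for the artificial start and end states around each sequence.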
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])  # without var_list, Saver saves all variables
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters['use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_corrector']:
                                parameters['use_corrector'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_corrector, y_true_for_corrector, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_corrector = True)
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(len(all_pred_labels[dataset_type])):
                                        indices = [
                                            dataset.label_corrector_to_index[label]
                                            for label in all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(indices)

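                                # LabelBinarizer one-hot encodes label indices: after
                                # fit(range(n)), transform([0, 2]) yields rows like
                                # [1, 0, 0, ...] and [0, 0, 1, ...].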
                                label_binarizer_corrector = sklearn.preprocessing.LabelBinarizer()
                                label_binarizer_corrector.fit(
                                    range(max(dataset.index_to_label_corrector.keys()) + 1))
                                predicted_label_corrector_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_corrector_vector_indices[dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[dataset_type]:
                                        predicted_label_corrector_vector_indices[dataset_type].append(
                                            label_binarizer_corrector.transform(label_indices_sequence))
                                parameters['use_corrector'] = True

                            transition_params_trained, model, glo_step = \
                                train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess, model, model_saver)

                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)
                            # tf.initialize_variables is deprecated; use variables_initializer
                            init_new_vars_op = tf.variables_initializer([glo_step])
                            sess.run(init_new_vars_op)

                        elif epoch_number != 0:
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1

                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time), flush=False)
                        if parameters['use_corrector']:
                            original_label_corrector_vector_indices = dataset.label_corrector_vector_indices
                            dataset.label_corrector_vector_indices = predicted_label_corrector_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_corrector_vector_indices = original_label_corrector_vector_indices
                        else:
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['micro']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['micro']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print('Updated best model at epoch {:d}'.format(
                                epoch_number))
                            print('The model is saved in: {:s}'.format(
                                model_folder))
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is: {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and parameters[
                                    'refine_with_crf']:
                            model = train.refine_with_crf(
                                parameters, sess, model, model_saver)
                            print('refine model with CRF ...')

                            for additional_epoch in range(
                                    parameters['additional_epochs_with_crf']):
                                print('Additional epoch {:d}'.format(
                                    additional_epoch))
                                sequence_numbers = list(
                                    range(len(dataset.token_indices['train'])))
                                random.shuffle(sequence_numbers)
                                for sequence_number in sequence_numbers:
                                    transition_params_trained, W_before_crf = train.train_step(
                                        sess, dataset, sequence_number, model,
                                        transition_params_trained, parameters)
                                    step += 1
                                epoch_elapsed_training_time = time.time() - epoch_start_time
                                print(
                                    'Additional training completed in {0:.2f} seconds'
                                    .format(epoch_elapsed_training_time),
                                    flush=False)

                                y_pred, y_true, output_filepaths = train.predict_labels(
                                    sess, model, transition_params_trained,
                                    parameters, dataset, epoch_number,
                                    stats_graph_folder, dataset_filepaths)

                                evaluate.evaluate_model(
                                    results, dataset, y_pred, y_true,
                                    stats_graph_folder, epoch_number,
                                    epoch_start_time, output_filepaths,
                                    parameters)

                                summary = sess.run(model.summary_op,
                                                   feed_dict=None)
                                writers['train'].add_summary(
                                    summary, epoch_number)
                                writers['train'].flush()
                                utils.copytree(writers['train'].get_logdir(),
                                               model_folder)

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'
                    for k, l in f1_scores.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    sess.close()
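
# A condensed, hypothetical sketch (not from the original source) of the two-step
# "corrector" hand-off in the epoch-0 branch above: step-1 predictions are mapped
# to indices and one-hot encoded before being fed to the step-2 model. Only the
# index-mapping and binarization steps are shown, with a dummy label inventory in
# place of dataset.label_corrector_to_index:
import sklearn.preprocessing

label_corrector_to_index = {'O': 0, 'B-ENT': 1, 'I-ENT': 2}  # dummy 3-label inventory
predicted_labels = ['O', 'B-ENT', 'I-ENT', 'O']              # one predicted sequence
indices = [label_corrector_to_index[label] for label in predicted_labels]

binarizer = sklearn.preprocessing.LabelBinarizer()
binarizer.fit(range(max(label_corrector_to_index.values()) + 1))
one_hot = binarizer.transform(indices)  # shape (4, 3): one indicator row per token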
Example #16
def main(languages):
    #embeddings_type = ['polyglot', 'fasttext']
    #embeddings_type = ['fasttext', 'fasttext_noOOV']
    embeddings_type = ['fasttext_noOOV']
    character_lstm = [True]
    embedding_language = ['target', 'source']
    combination = product(languages, embeddings_type, embedding_language, character_lstm)
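    # product() enumerates every (language, embeddings_type, embedding_language,
    # character_lstm) combination, e.g. ('es', 'fasttext_noOOV', 'target', True),
    # ('es', 'fasttext_noOOV', 'source', True), ...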
    create_folder_if_not_exists(os.path.join("..", "log"))
    experiment_timestamp = utils.get_current_time_in_miliseconds()
    log_file = os.path.join("..", "log", "experiment-{}.log".format(experiment_timestamp))

    for language, emb_type, emb_language, char_lstm in combination:
        conf_parameters = load_parameters()
        conf_parameters = set_datasets(conf_parameters, language)
        conf_parameters.set('ann','use_character_lstm', str(char_lstm))
        conf_parameters.set('ann','embedding_type', emb_type)
        conf_parameters.set('ann','embedding_language', emb_language)
        if emb_type == 'polyglot':
            conf_parameters.set('ann', 'embedding_dimension', str(64))
        elif 'fasttext' in emb_type:
            conf_parameters.set('ann', 'embedding_dimension', str(300))
        else:
            raise ValueError("Unknown embedding type")
        if emb_language == 'source':
            conf_parameters.set('dataset', 'language', constants.MAPPING_LANGUAGE[language])
        else:
            conf_parameters.set('dataset', 'language', language)
        parameters, conf_parameters = parse_parameters(conf_parameters)

        start_time = time.time()
        experiment_timestamp = utils.get_current_time_in_miliseconds()

        results = {}
        results['epoch'] = {}
        results['execution_details'] = {}
        results['execution_details']['train_start'] = start_time
        results['execution_details']['time_stamp'] = experiment_timestamp
        results['execution_details']['early_stop'] = False
        results['execution_details']['keyboard_interrupt'] = False
        results['execution_details']['num_epochs'] = 0
        results['model_options'] = copy.copy(parameters)

        dataset_name = utils.get_basename_without_extension(parameters['dataset_train'])
        model_name = '{0}_{1}_{2}_{3}_{4}'.format(language, emb_type, char_lstm, emb_language,
                                                  results['execution_details']['time_stamp'])

        sys.stdout = open(os.path.join("..", "log", model_name), "w")
        print(language, emb_type, char_lstm, emb_language)

        with open(log_file, "a") as file:
            file.write("Experiment: {}\n".format(model_name))
            file.write("Start time: {}\n".format(experiment_timestamp))
            file.write("-------------------------------------\n\n")
        pprint(parameters)
        dataset_filepaths = get_valid_dataset_filepaths(parameters)
        check_parameter_compatiblity(parameters, dataset_filepaths)
        previous_best_valid_epoch = -1

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
                device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False
            )

            session_conf.gpu_options.allow_growth = True
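            # allow_growth makes TensorFlow claim GPU memory on demand instead of
            # reserving all of it up front.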

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details

                print(model_name)
                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'), 'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder, 'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(stats_graph_folder, 'tensorboard_logs',
                                                                         dataset_type)
                    utils.create_folder_if_not_exists(tensorboard_log_folders[dataset_type])
                # del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))
                # dataset.load_pretrained_word_embeddings(parameters)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(tensorboard_log_folders[dataset_type],
                                                                  graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder)  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer, embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(dataset.index_to_token[token_index]))
                token_list_file.close()

                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path, 'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
                    previous_best_valid_f1_score = -1
                    transition_params_trained = np.random.rand(len(dataset.unique_labels), len(
                        dataset.unique_labels))  # TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None)  # keep all checkpoints; without var_list, Saver saves all variables
                    epoch_number = 0

                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters['use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(parameters,
                                                                                                             dataset,
                                                                                                             sess,
                                                                                                             model,
                                                                                                             model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers), parameters['batch_size']), "Training epoch {}".format(epoch_number),
                                          mininterval=1):
                                data_counter += parameters['batch_size']
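                                # Roughly every 20000 training sequences, run an intermediate
                                # evaluation; sub_id yields fractional epoch ids (1.001, 1.002, ...)
                                # used below to name the corresponding checkpoints.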
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ", sub_id)
                                    epoch_elapsed_training_time = time.time() - epoch_start_time
                                    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time),
                                          flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(sess, model,
                                                                                            transition_params_trained,
                                                                                            parameters, dataset,
                                                                                            epoch_number + sub_id,
                                                                                            stats_graph_folder,
                                                                                            dataset_filepaths)
                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder,
                                                            epoch_number, epoch_start_time, output_filepaths,
                                                            parameters)
                                    # Save model
                                    model_saver.save(sess, os.path.join(model_folder,
                                                                        'model_{0:07.3f}.ckpt'.format(
                                                                            epoch_number + sub_id)))
                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op, feed_dict=None)
                                    writers['train'].add_summary(summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(writers['train'].get_logdir(), model_folder)
                                    # Early stop
                                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[i: i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(sess, dataset, sequence_number,
                                                                                   model, transition_params_trained,
                                                                                   parameters)
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(sess, model, transition_params_trained,
                                                                                parameters, dataset, epoch_number,
                                                                                stats_graph_folder, dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number,
                                                epoch_start_time, output_filepaths, parameters)

                        # Save model
                        model_saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(), model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            previous_best_valid_epoch = epoch_number
                        else:
                            bad_counter += 1
                        print("The last {0} epochs have not shown improvements on the validation set.".format(
                            bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters['maximum_number_of_epochs']:
                            break

                    keep_only_best_model(model_folder, previous_best_valid_epoch,
                                         parameters['maximum_number_of_epochs'] + 1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details']['train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                    sys.stdout.close()
                except Exception:
                    logging.exception("")
                    remove_experiment = input("Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    sys.stdout.close()

            sess.close()  # release the session's resources
            sys.stdout.close()
Example #17
    def _get_valid_dataset_filepaths(self, parameters, dataset_types=['train', 'valid', 'test', 'deploy']):
        '''
        Preprocess the input dataset; if the data is in standard CoNLL format, convert it to brat.
        Args:
            - parameters: the parameters of the whole program
        Returns:
        (
            {   // dataset_filepaths: the fields below are optional, not all 4 need to be present
                "train": "data_text_folder/train[_compatible_with_brat][_bioes].txt",
                "valid": "data_text_folder/valid[_compatible_with_brat][_bioes].txt",
                "test": "data_text_folder/test[_compatible_with_brat][_bioes].txt",
                "deploy": "data_text_folder/deploy[_compatible_with_brat][_bioes].txt"
            },
            {   // dataset_brat_folders: the fields below are optional, not all 4 need to be present
                "train": "data_text_folder/train",
                "valid": "data_text_folder/valid",
                "test": "data_text_folder/test",
                "deploy": "data_text_folder/deploy"
            }
        )
        '''
        dataset_filepaths = {}
        dataset_brat_folders = {}
        for dataset_type in dataset_types:
            dataset_filepaths[dataset_type] = os.path.join(parameters['dataset_text_folder'], '{0}.txt'.format(dataset_type))
            dataset_brat_folders[dataset_type] = os.path.join(parameters['dataset_text_folder'], dataset_type)
            dataset_compatible_with_brat_filepath = os.path.join(parameters['dataset_text_folder'], '{0}_compatible_with_brat.txt'.format(dataset_type))

            # Conll file exists
            if os.path.isfile(dataset_filepaths[dataset_type]) and os.path.getsize(dataset_filepaths[dataset_type]) > 0:
                # Brat text files exist
                if os.path.exists(dataset_brat_folders[dataset_type]) and len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:

                    # Check compatibility between conll and brat files
                    brat_to_conll.check_brat_annotation_and_text_compatibility(dataset_brat_folders[dataset_type])
                    if os.path.exists(dataset_compatible_with_brat_filepath):
                        dataset_filepaths[dataset_type] = dataset_compatible_with_brat_filepath
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(dataset_filepaths[dataset_type], dataset_brat_folders[dataset_type])

                # Brat text files do not exist
                else:

                    # Populate brat text and annotation files based on conll file
                    conll_to_brat.conll_to_brat(dataset_filepaths[dataset_type], dataset_compatible_with_brat_filepath, dataset_brat_folders[dataset_type], dataset_brat_folders[dataset_type])
                    dataset_filepaths[dataset_type] = dataset_compatible_with_brat_filepath

            # Conll file does not exist
            else:
                # Brat text files exist
                if os.path.exists(dataset_brat_folders[dataset_type]) and len(glob.glob(os.path.join(dataset_brat_folders[dataset_type], '*.txt'))) > 0:
                    dataset_filepath_for_tokenizer = os.path.join(parameters['dataset_text_folder'], '{0}_{1}.txt'.format(dataset_type, parameters['tokenizer']))
                    if os.path.exists(dataset_filepath_for_tokenizer):
                        conll_to_brat.check_compatibility_between_conll_and_brat_text(dataset_filepath_for_tokenizer, dataset_brat_folders[dataset_type])
                    else:
                        # Populate conll file based on brat files
                        brat_to_conll.brat_to_conll(dataset_brat_folders[dataset_type], dataset_filepath_for_tokenizer, parameters['tokenizer'], parameters['spacylanguage'])
                    dataset_filepaths[dataset_type] = dataset_filepath_for_tokenizer

                # Brat text files do not exist
                else:
                    del dataset_filepaths[dataset_type]
                    del dataset_brat_folders[dataset_type]
                    continue

            if parameters['tagging_format'] == 'bioes':
                # Generate conll file with BIOES format
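                # e.g. BIO tags "B-PER I-PER I-PER" become "B-PER I-PER E-PER" in BIOES,
                # and a single-token "B-LOC" becomes "S-LOC".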
                bioes_filepath = os.path.join(parameters['dataset_text_folder'], '{0}_bioes.txt'.format(utils.get_basename_without_extension(dataset_filepaths[dataset_type])))
                utils_nlp.convert_conll_from_bio_to_bioes(dataset_filepaths[dataset_type], bioes_filepath)
                dataset_filepaths[dataset_type] = bioes_filepath

        return dataset_filepaths, dataset_brat_folders
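
# A hypothetical usage sketch for the method above; the folder layout, parameter
# values, and the `neuroner` instance are illustrative, not from the original source.
import os

parameters = {
    'dataset_text_folder': os.path.join('..', 'data', 'example_dataset'),
    'tokenizer': 'spacy',
    'spacylanguage': 'en',
    'tagging_format': 'bioes',
}
# dataset_filepaths, dataset_brat_folders = neuroner._get_valid_dataset_filepaths(
#     parameters, dataset_types=['train', 'valid', 'test'])
# dataset_filepaths['train'] would then point at e.g. '<folder>/train_bioes.txt'.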
Example #18
def main(argv=sys.argv):
    ''' NeuroNER main method

    Args:
        parameters_filepath: the path to the parameters file
        output_folder: the path to the output folder
    '''
    arguments = parse_arguments(argv[1:])
    parameters, conf_parameters = load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = get_valid_dataset_filepaths(
        parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'],
                         debug=parameters['debug'])
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.device('/gpu:0'):
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                allow_soft_placement=True,
                # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
                log_device_placement=False)

            sess = tf.Session(config=session_conf)

            with sess.as_default():

                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_text_folder'])
                model_name = dataset_name
                utils.create_folder_if_not_exists(parameters['output_folder'])
                stats_graph_folder = os.path.join(
                    parameters['output_folder'],
                    model_name)  # Folder where to save graphs
                final_weights_folder = os.path.join(
                    parameters['output_folder'], 'weights')
                utils.create_folder_if_not_exists(stats_graph_folder)
                utils.create_folder_if_not_exists(final_weights_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                # Save the parameter settings to the output model dir, so training can be resumed later
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                pickle.dump(
                    dataset,
                    open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                embedding_writer = tf.summary.FileWriter(model_folder)

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                character_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_characters.tsv')
                tensorboard_character_embeddings.metadata_path = os.path.relpath(
                    character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'latin-1')
                for token_index in range(dataset.vocabulary_size):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                character_list_file = codecs.open(character_list_file_path,
                                                  'w', 'latin-1')
                for character_index in range(dataset.alphabet_size):
                    if character_index == dataset.PADDING_CHARACTER_INDEX:
                        character_list_file.write('PADDING\n')
                    else:
                        character_list_file.write('{0}\n'.format(
                            dataset.index_to_character[character_index]))
                character_list_file.close()

                # Initialize the model
                sess.run(tf.global_variables_initializer())
                if not parameters['use_pretrained_model']:
                    model.load_pretrained_token_embeddings(
                        sess, dataset, parameters)

                # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                patience_counter = 0
                f1_score_best = 0
                f1_scores = {'train-F1': [], 'valid-F1': [], 'test-F1': []}
                f1_scores_conll = {
                    'train-F1': [],
                    'valid-F1': [],
                    'test-F1': []
                }
                transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
                model_saver = tf.train.Saver(
                    max_to_keep=parameters['num_of_model_to_keep'])
                epoch_number = -1
                try:
                    while True:
                        step = 0
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        # use pre-trained model and epoch_number = 0
                        if parameters['use_pretrained_model'] and epoch_number == 0:

                            if parameters['use_adapter']:
                                parameters['use_adapter'] = False
                                transition_params_trained = train.restore_pretrained_model(
                                    parameters, dataset, sess, model,
                                    model_saver)
                                print(
                                    'Getting the 3-label predictions from the step1 model.'
                                )
                                all_pred_labels, y_pred_for_adapter, y_true_for_adapter, \
                                output_filepaths = train.predict_labels(sess, model,
                                                                        transition_params_trained,
                                                                        parameters, dataset,
                                                                        epoch_number,
                                                                        stats_graph_folder,
                                                                        dataset_filepaths,
                                                                        for_adapter=True)
                                # use the label2idx mapping (for adapter) in the dataset to transform all_pred_labels
                                all_pred_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    all_pred_indices[dataset_type] = []
                                    for i in range(len(all_pred_labels[dataset_type])):
                                        indices = [
                                            dataset.label_adapter_to_index[label]
                                            for label in all_pred_labels[dataset_type][i]
                                        ]
                                        all_pred_indices[dataset_type].append(indices)

                                # and use binarizer to transform to ndarray
                                label_binarizer_adapter = sklearn.preprocessing.LabelBinarizer()
                                label_binarizer_adapter.fit(
                                    range(max(dataset.index_to_label_adapter.keys()) + 1))
                                predicted_label_adapter_vector_indices = {}
                                for dataset_type in dataset_filepaths.keys():
                                    predicted_label_adapter_vector_indices[dataset_type] = []
                                    for label_indices_sequence in all_pred_indices[dataset_type]:
                                        predicted_label_adapter_vector_indices[dataset_type].append(
                                            label_binarizer_adapter.transform(label_indices_sequence))
                                parameters['use_adapter'] = True

                            if parameters['train_model'] and parameters[
                                    'add_class']:
                                transition_params_trained, model, glo_step = \
                                    train.restore_model_parameters_from_pretrained_model(parameters, dataset, sess,
                                                                                         model, model_saver)
                                # tf.initialize_variables is deprecated; use variables_initializer
                                init_new_vars_op = tf.variables_initializer([glo_step])
                                sess.run(init_new_vars_op)
                            else:
                                transition_params_trained = \
                                    train.restore_pretrained_model(parameters, dataset, sess, model, model_saver)

                            for dataset_type in dataset_filepaths.keys():
                                writers[dataset_type] = tf.summary.FileWriter(
                                    tensorboard_log_folders[dataset_type],
                                    graph=sess.graph)
                                # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings
                                embedding_writer = tf.summary.FileWriter(
                                    model_folder)

                        # epoch_number != 0, no matter use or not use pre-trained model
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            for sequence_number in sequence_numbers:
                                transition_params_trained, W_before_crf = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                                step += 1
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=False)

                        if parameters['use_adapter']:  # model evaluation, using adapter
                            # pass the pred_for_adapter as label_indices vector
                            original_label_adapter_vector_indices = dataset.label_adapter_vector_indices
                            dataset.label_adapter_vector_indices = predicted_label_adapter_vector_indices
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)
                            dataset.label_adapter_vector_indices = original_label_adapter_vector_indices

                        else:  # model evaluation, not using adapter
                            y_pred, y_true, output_filepaths = train.predict_labels(
                                sess, model, transition_params_trained,
                                parameters, dataset, epoch_number,
                                stats_graph_folder, dataset_filepaths)

                            # Evaluate model: save and plot results
                            evaluate.evaluate_model(results, dataset, y_pred,
                                                    y_true, stats_graph_folder,
                                                    epoch_number,
                                                    epoch_start_time,
                                                    output_filepaths,
                                                    parameters)

                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stopping
                        train_f1_score = results['epoch'][epoch_number][0][
                            'train']['f1_score']['weighted']
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['weighted']
                        test_f1_score = results['epoch'][epoch_number][0][
                            'test']['f1_score']['weighted']
                        f1_scores['train-F1'].append(train_f1_score)
                        f1_scores['valid-F1'].append(valid_f1_score)
                        f1_scores['test-F1'].append(test_f1_score)

                        train_f1_score_conll = results['epoch'][epoch_number][
                            0]['train']['f1_conll']['micro']
                        valid_f1_score_conll = results['epoch'][epoch_number][
                            0]['valid']['f1_conll']['micro']
                        test_f1_score_conll = results['epoch'][epoch_number][
                            0]['test']['f1_conll']['micro']
                        f1_scores_conll['train-F1'].append(
                            train_f1_score_conll)
                        f1_scores_conll['valid-F1'].append(
                            valid_f1_score_conll)
                        f1_scores_conll['test-F1'].append(test_f1_score_conll)

                        if valid_f1_score > f1_score_best:
                            patience_counter = 0
                            f1_score_best = valid_f1_score
                            # Save the best model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'best_model.ckpt'))
                            print('Best model updated at epoch {:d}'.format(
                                epoch_number))
                            print('The model is saved in: {:s}'.format(
                                model_folder))
                        else:
                            patience_counter += 1
                        print("In epoch {:d}, the valid F1 is : {:f}".format(
                            epoch_number, valid_f1_score))
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(patience_counter))

                        if patience_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            # Save the last model
                            model_saver.save(
                                sess,
                                os.path.join(model_folder, 'last_model.ckpt'))
                            print('The last model is saved in: {:s}'.format(
                                model_folder))

                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs'] and not parameters[
                                    'refine_with_crf']:
                            break
                    if not parameters['use_pretrained_model']:
                        plot_name = 'F1-summary-step1.svg'
                    else:
                        plot_name = 'F1-summary-step2.svg'

                    print('Sklearn result:')
                    for k, l in f1_scores.items():
                        print(k, l)

                    print('Conll result:')
                    for k, l in f1_scores_conll.items():
                        print(k, l)
                    utils_plots.plot_f1(
                        f1_scores,
                        os.path.join(stats_graph_folder, '..', plot_name),
                        'F1 score summary')

                    # TODO: in step 1, for task a, add the best deploy data to step 2 train set, and call script
                    print('(sklearn micro) test F1:')
                    micro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['micro']) for ep in range(epoch_number + 1)
                    ])
                    print(micro_f1)
                    print('(sklearn macro) test F1:')
                    macro_f1 = ','.join([
                        str(results['epoch'][ep][0]['test']['f1_score']
                            ['macro']) for ep in range(epoch_number + 1)
                    ])
                    print(macro_f1)

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')

                print('Finishing the experiment')
                end_time = time.time()
                results['execution_details'][
                    'train_duration'] = end_time - start_time
                results['execution_details']['train_end'] = end_time
                evaluate.save_results(results, stats_graph_folder)
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type].close()

    sess.close()  # release the session's resources
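
The training loop in the example above relies on patience-based early stopping: training halts once the validation F1 has failed to improve for a fixed number of consecutive epochs. A minimal standalone sketch of that pattern, assuming a hypothetical evaluate_epoch callback that runs one epoch of training plus evaluation and returns the validation F1:

def train_with_early_stopping(evaluate_epoch, patience, max_epochs):
    # Sketch of the patience-based early stopping used above.
    # evaluate_epoch(epoch) is a hypothetical stand-in that trains for
    # one epoch and returns the validation F1 score.
    best_f1 = 0.0
    patience_counter = 0
    for epoch in range(max_epochs):
        valid_f1 = evaluate_epoch(epoch)
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            patience_counter = 0  # improvement: reset the counter
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print('Early Stop!')
            break
    return best_f1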
Example No. 19
def main():

    parameters, dataset_filepaths = load_parameters()

    # Load dataset
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    # Create graph and session
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            # Automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist.
            allow_soft_placement=True,
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)
            tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                  'tensorboard_logs')
            utils.create_folder_if_not_exists(tensorboard_log_folder)
            tensorboard_log_folders = {}
            for dataset_type in ['train', 'valid', 'test']:
                tensorboard_log_folders[dataset_type] = os.path.join(
                    stats_graph_folder, 'tensorboard_logs', dataset_type)
                utils.create_folder_if_not_exists(
                    tensorboard_log_folders[dataset_type])

            pickle.dump(
                dataset,
                open(os.path.join(stats_graph_folder, 'dataset.pickle'), 'wb'))

            # Instantiate the model
            # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
            model = EntityLSTM(dataset, parameters)

            # Instantiate the writers for TensorBoard
            writers = {}
            for dataset_type in ['train', 'valid', 'test']:
                writers[dataset_type] = tf.summary.FileWriter(
                    tensorboard_log_folders[dataset_type], graph=sess.graph)
            embedding_writer = tf.summary.FileWriter(
                model_folder
            )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

            embeddings_projector_config = projector.ProjectorConfig()
            tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
            token_list_file_path = os.path.join(
                model_folder, 'tensorboard_metadata_tokens.tsv')
            tensorboard_token_embeddings.metadata_path = os.path.relpath(
                token_list_file_path, '..')

            tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
            tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
            character_list_file_path = os.path.join(
                model_folder,
                'tensorboard_metadata_characters.tsv')  #  'metadata.tsv'
            tensorboard_character_embeddings.metadata_path = os.path.relpath(
                character_list_file_path, '..')

            projector.visualize_embeddings(embedding_writer,
                                           embeddings_projector_config)

            # Write metadata for TensorBoard embeddings
            token_list_file = codecs.open(token_list_file_path, 'w', 'UTF-8')
            for token_index in range(dataset.vocabulary_size):
                token_list_file.write('{0}\n'.format(
                    dataset.index_to_token[token_index]))
            token_list_file.close()

            character_list_file = codecs.open(character_list_file_path, 'w',
                                              'UTF-8')
            print('len(dataset.character_to_index): {0}'.format(
                len(dataset.character_to_index)))
            print('len(dataset.index_to_character): {0}'.format(
                len(dataset.index_to_character)))
            for character_index in range(dataset.alphabet_size):
                if character_index == dataset.PADDING_CHARACTER_INDEX:
                    character_list_file.write('PADDING\n')
                else:
                    character_list_file.write('{0}\n'.format(
                        dataset.index_to_character[character_index]))
            character_list_file.close()

            # Initialize the model
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
            step = 0
            bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            model_saver = tf.train.Saver(
                max_to_keep=parameters['maximum_number_of_epochs']
            )  # defaults to saving all variables
            epoch_number = -1
            try:
                while True:
                    epoch_number += 1
                    #epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    y_pred = {}
                    y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type:     {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        y_pred[dataset_type], y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
#                         model_options = None

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, y_pred, y_true,
                                            stats_graph_folder, epoch_number,
                                            epoch_start_time, output_filepaths,
                                            parameters)

                    # Save model
                    model_saver.save(
                        sess,
                        os.path.join(model_folder,
                                     'model_{0:05d}.ckpt'.format(epoch_number))
                    )  #, global_step, latest_filename, meta_graph_suffix, write_meta_graph, write_state)

                    # Save TensorBoard logs
                    summary = sess.run(model.summary_op, feed_dict=None)
                    writers['train'].add_summary(summary, epoch_number)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1

                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break


#                     break # debugging

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources
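
The example above registers token and character embeddings with the TensorBoard projector by writing one metadata row per embedding row and pointing the projector config at the folder the FileWriter logs to. A minimal sketch of that wiring, assuming the TensorFlow 1.x API used throughout this document (log_dir, embedding_var and vocabulary are illustrative placeholders):

import os

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector


def register_embedding(log_dir, embedding_var, vocabulary):
    # Write one metadata row per embedding row, in index order.
    metadata_path = os.path.join(log_dir, 'metadata.tsv')
    with open(metadata_path, 'w') as f:
        for token in vocabulary:
            f.write('{0}\n'.format(token))
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    embedding.metadata_path = metadata_path
    # The writer must log into the same folder as the model checkpoint,
    # otherwise TensorBoard cannot display the embeddings.
    writer = tf.summary.FileWriter(log_dir)
    projector.visualize_embeddings(writer, config)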
Example No. 20
def main():

    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.', 'parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(
        conf_parameters)
    parameters = {}
    for k, v in nested_parameters.items():
        parameters.update(v)
    for k, v in parameters.items():
        if k in [
                'remove_unknown_tokens', 'character_embedding_dimension',
                'character_lstm_hidden_state_dimension',
                'token_embedding_dimension',
                'token_lstm_hidden_state_dimension', 'patience',
                'maximum_number_of_epochs', 'maximum_training_time',
                'number_of_cpu_threads', 'number_of_gpus'
        ]:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in [
                'use_character_lstm', 'is_character_lstm_bidirect',
                'is_token_lstm_bidirect', 'use_crf'
        ]:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(
        parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(
        parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test'] = os.path.join(parameters['dataset_text_folder'],
                                             'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={
                'CPU': 1,
                'GPU': 1
            },
            # Automatically choose an existing and supported device to run
            # the operations in case the specified one doesn't exist.
            allow_soft_placement=True,
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Instantiate model
            model = EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            # Initialize and save execution details
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(
                parameters['dataset_text_folder']
            )  #opts.train.replace('/', '_').split('.')[0] # 'conll2003en'
            model_name = '{0}_{1}'.format(
                dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            utils.create_folder_if_not_exists(output_folder)
            stats_graph_folder = os.path.join(
                output_folder, model_name)  # Folder where to save graphs
            #print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            #             model_folder = os.path.join(stats_graph_folder, 'model')
            #             utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
            transition_params_trained = np.random.rand(
                len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    epoch_number = math.floor(
                        step / len(dataset.token_indices['train']))
                    print('\nStarting epoch {0}'.format(epoch_number), end='')

                    epoch_start_time = time.time()
                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train.train_step(
                            sess, dataset, sequence_number, model,
                            transition_params_trained, parameters)
                        step += 1
                        if step % 100 == 0:
                            print('.', end='', flush=True)
                            #break
                    print('.', flush=True)
                    #print('step: {0}'.format(step))

                    # Predict labels using trained model
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        #print('dataset_type:     {0}'.format(dataset_type))
                        prediction_output = train.prediction_step(
                            sess, dataset, dataset_type, model,
                            transition_params_trained, step,
                            stats_graph_folder, epoch_number, parameters)
                        all_predictions[dataset_type], all_y_true[
                            dataset_type], output_filepaths[
                                dataset_type] = prediction_output
#                         model_options = None

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print(
                        'epoch_elapsed_training_time: {0:.2f} seconds'.format(
                            epoch_elapsed_training_time))

                    results['execution_details']['num_epochs'] = epoch_number

                    # Evaluate model: save and plot results
                    evaluate.evaluate_model(results, dataset, all_predictions,
                                            all_y_true, stats_graph_folder,
                                            epoch_number, epoch_start_time,
                                            output_filepaths)

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0][
                        'valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1

                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break


#                     break # debugging

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                #         assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details'][
                'train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            evaluate.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources
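
The parameter-loading block at the top of the example above flattens a two-level configparser structure into a single dict and casts selected values to their expected types. The same pattern as a compact, reusable sketch (the key lists are illustrative arguments, not names from the original code):

import configparser
import distutils.util


def load_flat_parameters(ini_path, int_keys=(), float_keys=(), bool_keys=()):
    # Merge all INI sections into one flat dict; later sections win on
    # duplicate keys, mirroring the flattening used in the examples above.
    conf = configparser.ConfigParser()
    conf.read(ini_path)
    parameters = {}
    for section in conf.sections():
        parameters.update(dict(conf[section]))
    # Cast the string values read from the INI file to their real types.
    for k, v in parameters.items():
        if k in int_keys:
            parameters[k] = int(v)
        elif k in float_keys:
            parameters[k] = float(v)
        elif k in bool_keys:
            parameters[k] = bool(distutils.util.strtobool(v))
    return parameters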
Example No. 21
def main():

    parameters, conf_parameters = load_parameters()
    pprint(parameters)
    dataset_filepaths = get_valid_dataset_filepaths(parameters)
    check_parameter_compatiblity(parameters, dataset_filepaths)

    cross_validation = parameters[
        'cross_validation'] if 'cross_validation' in parameters else 1
    valid_fscores = []
    valid_precisions = []
    valid_recalls = []
    for cv in range(0, cross_validation):
        if "als" in dataset_filepaths['train'] and cross_validation > 1:
            train_files = list(range(0, cv)) + list(
                range(cv + 1, cross_validation))
            test_file = cv
            file_train = "tmp_combined.train"
            file_valid = "tmp_combined.test"
            output = []
            for i in train_files:
                with open(dataset_filepaths['train'] + "_" + str(i),
                          "r",
                          encoding="utf-8") as file:
                    output.append(file.read())
            with open(file_train, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            output = []
            with open(dataset_filepaths['train'] + "_" + str(test_file),
                      "r",
                      encoding="utf-8") as file:
                output.append(file.read())
            with open(file_valid, "w", encoding="utf-8") as file:
                file.write("\n\n".join(output))
            dataset_filepaths['train'] = file_train
            dataset_filepaths['valid'] = file_valid
        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        dataset.load_vocab_word_embeddings(parameters)
        dataset.load_dataset(dataset_filepaths, parameters)

        # Create graph and session
        with tf.Graph().as_default():
            session_conf = tf.ConfigProto(
                intra_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                inter_op_parallelism_threads=parameters[
                    'number_of_cpu_threads'],
                device_count={
                    'CPU': 1,
                    'GPU': parameters['number_of_gpus']
                },
                # Automatically choose an existing and supported device to run
                # the operations in case the specified one doesn't exist.
                allow_soft_placement=True,
                log_device_placement=False)

            session_conf.gpu_options.allow_growth = True

            sess = tf.Session(config=session_conf)

            with sess.as_default():
                # Initialize and save execution details
                start_time = time.time()
                experiment_timestamp = utils.get_current_time_in_miliseconds()
                results = {}
                results['epoch'] = {}
                results['execution_details'] = {}
                results['execution_details']['train_start'] = start_time
                results['execution_details'][
                    'time_stamp'] = experiment_timestamp
                results['execution_details']['early_stop'] = False
                results['execution_details']['keyboard_interrupt'] = False
                results['execution_details']['num_epochs'] = 0
                results['model_options'] = copy.copy(parameters)

                dataset_name = utils.get_basename_without_extension(
                    parameters['dataset_train'])
                if 'data_to_use' in parameters:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name + "_small",
                        results['execution_details']['time_stamp'])
                else:
                    model_name = '{0}_{1}'.format(
                        parameters['language'] + "_" + dataset_name,
                        results['execution_details']['time_stamp'])

                output_folder = os.path.join('..', 'output')
                utils.create_folder_if_not_exists(output_folder)
                stats_graph_folder = os.path.join(
                    output_folder, model_name)  # Folder where to save graphs
                utils.create_folder_if_not_exists(stats_graph_folder)
                model_folder = os.path.join(stats_graph_folder, 'model')
                utils.create_folder_if_not_exists(model_folder)
                with open(os.path.join(model_folder, 'parameters.ini'),
                          'w') as parameters_file:
                    conf_parameters.write(parameters_file)
                tensorboard_log_folder = os.path.join(stats_graph_folder,
                                                      'tensorboard_logs')
                utils.create_folder_if_not_exists(tensorboard_log_folder)
                tensorboard_log_folders = {}
                for dataset_type in dataset_filepaths.keys():
                    tensorboard_log_folders[dataset_type] = os.path.join(
                        stats_graph_folder, 'tensorboard_logs', dataset_type)
                    utils.create_folder_if_not_exists(
                        tensorboard_log_folders[dataset_type])
                #del dataset.embeddings_matrix
                if not parameters['use_pretrained_model']:
                    pickle.dump(
                        dataset,
                        open(os.path.join(model_folder, 'dataset.pickle'),
                             'wb'))
                #dataset.load_pretrained_word_embeddings(parameters)
                # Instantiate the model
                # graph initialization should be before FileWriter, otherwise the graph will not appear in TensorBoard
                model = EntityLSTM(dataset, parameters)

                # Instantiate the writers for TensorBoard
                writers = {}
                for dataset_type in dataset_filepaths.keys():
                    writers[dataset_type] = tf.summary.FileWriter(
                        tensorboard_log_folders[dataset_type],
                        graph=sess.graph)
                embedding_writer = tf.summary.FileWriter(
                    model_folder
                )  # embedding_writer has to write in model_folder, otherwise TensorBoard won't be able to view embeddings

                embeddings_projector_config = projector.ProjectorConfig()
                tensorboard_token_embeddings = embeddings_projector_config.embeddings.add()
                tensorboard_token_embeddings.tensor_name = model.token_embedding_weights.name
                token_list_file_path = os.path.join(
                    model_folder, 'tensorboard_metadata_tokens.tsv')
                tensorboard_token_embeddings.metadata_path = os.path.relpath(
                    token_list_file_path, '..')

                if parameters['use_character_lstm']:
                    tensorboard_character_embeddings = embeddings_projector_config.embeddings.add()
                    tensorboard_character_embeddings.tensor_name = model.character_embedding_weights.name
                    character_list_file_path = os.path.join(
                        model_folder, 'tensorboard_metadata_characters.tsv')
                    tensorboard_character_embeddings.metadata_path = os.path.relpath(
                        character_list_file_path, '..')

                projector.visualize_embeddings(embedding_writer,
                                               embeddings_projector_config)

                # Write metadata for TensorBoard embeddings
                token_list_file = codecs.open(token_list_file_path, 'w',
                                              'UTF-8')
                for token_index in range(len(dataset.index_to_token)):
                    token_list_file.write('{0}\n'.format(
                        dataset.index_to_token[token_index]))
                token_list_file.close()

                if parameters['use_character_lstm']:
                    character_list_file = codecs.open(character_list_file_path,
                                                      'w', 'UTF-8')
                    for character_index in range(dataset.alphabet_size):
                        if character_index == dataset.PADDING_CHARACTER_INDEX:
                            character_list_file.write('PADDING\n')
                        else:
                            character_list_file.write('{0}\n'.format(
                                dataset.index_to_character[character_index]))
                    character_list_file.close()

                try:
                    # Initialize the model
                    sess.run(tf.global_variables_initializer())
                    if not parameters['use_pretrained_model']:
                        model.load_pretrained_token_embeddings(
                            sess, dataset, parameters)

                    # Start training + evaluation loop. Each iteration corresponds to 1 epoch.
                    bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
                    previous_best_valid_f1_score = 0
                    transition_params_trained = np.random.rand(
                        len(dataset.unique_labels), len(dataset.unique_labels)
                    )  #TODO np.random.rand(len(dataset.unique_labels)+2,len(dataset.unique_labels)+2)
                    model_saver = tf.train.Saver(
                        max_to_keep=None
                    )  #parameters['maximum_number_of_epochs'])  # defaults to saving all variables
                    epoch_number = 0

                    while True:
                        epoch_number += 1
                        print('\nStarting epoch {0}'.format(epoch_number))

                        epoch_start_time = time.time()

                        if parameters[
                                'use_pretrained_model'] and epoch_number == 1:
                            # Restore pretrained model parameters
                            transition_params_trained = train.restore_model_parameters_from_pretrained_model(
                                parameters, dataset, sess, model, model_saver)
                        elif epoch_number != 0:
                            # Train model: loop over all sequences of training set with shuffling
                            sequence_numbers = list(
                                range(len(dataset.token_indices['train'])))
                            random.shuffle(sequence_numbers)
                            data_counter = 0
                            sub_id = 0
                            for i in tqdm(range(0, len(sequence_numbers),
                                                parameters['batch_size']),
                                          "Training",
                                          mininterval=1):
                                data_counter += parameters['batch_size']
                                if data_counter >= 20000:
                                    data_counter = 0
                                    sub_id += 0.001
                                    print("Intermediate evaluation number: ",
                                          sub_id)

                                    #model_saver.save(sess,
                                    #                 os.path.join(model_folder, 'model_{0:05d}_{1}.ckpt'.format(epoch_number, len(sequence_numbers)/4/len(sequence_numbers))))
                                    epoch_elapsed_training_time = (
                                        time.time() - epoch_start_time)
                                    print(
                                        'Training completed in {0:.2f} seconds'
                                        .format(epoch_elapsed_training_time),
                                        flush=True)

                                    y_pred, y_true, output_filepaths = train.predict_labels(
                                        sess, model, transition_params_trained,
                                        parameters, dataset,
                                        epoch_number + sub_id,
                                        stats_graph_folder, dataset_filepaths)

                                    # Evaluate model: save and plot results
                                    evaluate.evaluate_model(
                                        results, dataset, y_pred, y_true,
                                        stats_graph_folder, epoch_number,
                                        epoch_start_time, output_filepaths,
                                        parameters)

                                    # Save model
                                    model_saver.save(
                                        sess,
                                        os.path.join(
                                            model_folder,
                                            'model_{0:07.3f}.ckpt'.format(
                                                epoch_number + sub_id)))

                                    # Save TensorBoard logs
                                    summary = sess.run(model.summary_op,
                                                       feed_dict=None)
                                    writers['train'].add_summary(
                                        summary, epoch_number)
                                    writers['train'].flush()
                                    utils.copytree(
                                        writers['train'].get_logdir(),
                                        model_folder)

                                    # Early stop
                                    valid_f1_score = results['epoch'][
                                        epoch_number][0]['valid']['f1_score'][
                                            'micro']
                                    # valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                                    # valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']

                                    # valid_fscores.append(valid_f1_score)
                                    if valid_f1_score > previous_best_valid_f1_score:
                                        bad_counter = 0
                                        previous_best_valid_f1_score = valid_f1_score
                                        # previous_best_valid_precision = valid_precision
                                        # previous_best_valid_recall = valid_recall
                                    else:
                                        bad_counter += 1

                                sequence_number = sequence_numbers[
                                    i:i + parameters['batch_size']]
                                transition_params_trained, loss = train.train_step(
                                    sess, dataset, sequence_number, model,
                                    transition_params_trained, parameters)
                        epoch_elapsed_training_time = time.time() - epoch_start_time
                        print('Training completed in {0:.2f} seconds'.format(
                            epoch_elapsed_training_time),
                              flush=True)

                        y_pred, y_true, output_filepaths = train.predict_labels(
                            sess, model, transition_params_trained, parameters,
                            dataset, epoch_number, stats_graph_folder,
                            dataset_filepaths)

                        # Evaluate model: save and plot results
                        evaluate.evaluate_model(results, dataset, y_pred,
                                                y_true, stats_graph_folder,
                                                epoch_number, epoch_start_time,
                                                output_filepaths, parameters)

                        # Save model
                        model_saver.save(
                            sess,
                            os.path.join(
                                model_folder,
                                'model_{0:05d}.ckpt'.format(epoch_number)))

                        # Save TensorBoard logs
                        summary = sess.run(model.summary_op, feed_dict=None)
                        writers['train'].add_summary(summary, epoch_number)
                        writers['train'].flush()
                        utils.copytree(writers['train'].get_logdir(),
                                       model_folder)

                        # Early stop
                        valid_f1_score = results['epoch'][epoch_number][0][
                            'valid']['f1_score']['micro']
                        #valid_precision = results['epoch'][epoch_number][0]['valid']['precision']['micro']
                        #valid_recall = results['epoch'][epoch_number][0]['valid']['recall']['micro']

                        #valid_fscores.append(valid_f1_score)
                        if valid_f1_score > previous_best_valid_f1_score:
                            bad_counter = 0
                            previous_best_valid_f1_score = valid_f1_score
                            #previous_best_valid_precision = valid_precision
                            #previous_best_valid_recall = valid_recall
                        else:
                            bad_counter += 1
                        print(
                            "The last {0} epochs have not shown improvements on the validation set."
                            .format(bad_counter))

                        if bad_counter >= parameters['patience']:
                            print('Early Stop!')
                            results['execution_details']['early_stop'] = True
                            break

                        if epoch_number >= parameters[
                                'maximum_number_of_epochs']:
                            break

                except KeyboardInterrupt:
                    results['execution_details']['keyboard_interrupt'] = True
                    print('Training interrupted')
                    # remove the experiment
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")
                    else:
                        print('Finishing the experiment')
                        end_time = time.time()
                        results['execution_details'][
                            'train_duration'] = end_time - start_time
                        results['execution_details']['train_end'] = end_time
                        evaluate.save_results(results, stats_graph_folder)
                except Exception:
                    logging.exception("")
                    remove_experiment = input(
                        "Do you want to remove the experiment? (yes/y/Yes)")
                    if remove_experiment in ["Yes", "yes", "y"]:
                        shutil.rmtree(stats_graph_folder)
                        print("Folder removed")

        sess.close()  # release the session's resources
        if 'cross_validation' in parameters and parameters[
                'cross_validation'] > 1:
            valid_fscores.append(previous_best_valid_f1_score)
            #valid_precisions.append(previous_best_valid_precision)
            #valid_recalls.append(previous_best_valid_recall)
    if 'cross_validation' in parameters and parameters['cross_validation'] > 1:
        print("mean f1score:", np.mean(valid_fscores))
        #print("mean precision:", np.mean(valid_precisions))
        #print("mean recall:", np.mean(valid_recalls))
        with codecs.open(os.path.join(stats_graph_folder, "result_cv.txt"),
                         "w") as file:
            file.write("F1score " + ", ".join(map(str, valid_fscores)))
            # file.write("Precision " + valid_precisions)
            # file.write("Recall " + valid_recalls)
            file.write("Mean F1score " + str(np.mean(valid_fscores)))
Example No. 22
    def predict(self, text):
        """
        Predict

        Args:
            text (str): Description.
        """
        self.prediction_count += 1

        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join(
                '.', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(
                self.parameters)

        # Update the deploy folder, file, and modeldata
        dataset_type = 'deploy'

        # Delete all deployment data
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)

        # Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder,
            'temp_{0}.txt'.format(str(self.prediction_count).zfill(5)))
        #self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
        # print('over here: ',dataset_brat_deploy_filepath)
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)

        # Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)

        # Update the dataset for the new deploy set
        self.modeldata.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.modeldata, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)

        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths,
                                  self.dataset_brat_folders,
                                  self.stats_graph_folder,
                                  overwrite=True)

        # Print and output result
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert (text == text2)
        return entities
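
Before each prediction, the deploy data from the previous call is wiped by globbing for anything whose name starts with the dataset type. That cleanup step, isolated as a small sketch of the pattern used at the start of both predict() variants:

import glob
import os
import shutil


def clear_deploy_data(dataset_text_folder, dataset_type='deploy'):
    # Remove every file or directory matching '<dataset_type>*', as done
    # before writing the new deploy text file.
    pattern = os.path.join(dataset_text_folder, '{0}*'.format(dataset_type))
    for filepath in glob.glob(pattern):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)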
Example No. 23
    def predict(self, test_file_path):
        # Not used
        text = ''
        with open(test_file_path, "r") as f:
            text = f.read()
        test_file_path = os.path.basename(test_file_path)
        self.prediction_count += 1

        if self.prediction_count == 1:
            self.parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
            self.stats_graph_folder, _ = self._create_stats_graph_folder(self.parameters)

        # Update the deploy folder, file, and dataset
        dataset_type = 'deploy'
        ### Delete all deployment data
        for filepath in glob.glob(
                os.path.join(self.parameters['dataset_text_folder'],
                             '{0}*'.format(dataset_type))):
            if os.path.isdir(filepath):
                shutil.rmtree(filepath)
            else:
                os.remove(filepath)
        ### Create brat folder and file
        dataset_brat_deploy_folder = os.path.join(
            self.parameters['dataset_text_folder'], dataset_type)
        utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
        dataset_brat_deploy_filepath = os.path.join(
            dataset_brat_deploy_folder,
            test_file_path.format(str(self.prediction_count).zfill(5)))
        # self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
        with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
            f.write(text)
        ### Update deploy filepaths
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            self.parameters, dataset_types=[dataset_type])
        self.dataset_filepaths.update(dataset_filepaths)
        self.dataset_brat_folders.update(dataset_brat_folders)
        ### Update the dataset for the new deploy set
        self.dataset.update_dataset(self.dataset_filepaths, [dataset_type])

        # Predict labels and output brat
        output_filepaths = {}
        prediction_output = train.prediction_step(
            self.sess, self.dataset, dataset_type, self.model,
            self.transition_params_trained, self.stats_graph_folder,
            self.prediction_count, self.parameters, self.dataset_filepaths)
        _, _, output_filepaths[dataset_type] = prediction_output
        conll_to_brat.output_brat(output_filepaths, self.dataset_brat_folders,
                                  self.stats_graph_folder, overwrite=True)

        # Print and output result
        text_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy',
            os.path.basename(dataset_brat_deploy_filepath))
        annotation_filepath = os.path.join(
            self.stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
                utils.get_basename_without_extension(
                    dataset_brat_deploy_filepath)))
        text2, entities = brat_to_conll.get_entities_from_brat(
            text_filepath, annotation_filepath, verbose=True)
        assert(text == text2)
        #print (entities)
        os.rename(self.stats_graph_folder,
                  "../data/" + self.stats_graph_folder.split('/')[-1])
        print("Use the brat tool to see the result at",
              "../data/" + self.stats_graph_folder.split('/')[-1])
Example No. 24
def get_valid_dataset_filepaths(parameters):
    dataset_filepaths = {}
    dataset_brat_folders = {}
    for dataset_type in ['train', 'valid', 'test', 'deploy']:
        dataset_filepaths[dataset_type] = os.path.join(
            parameters['dataset_text_folder'], '{0}.txt'.format(dataset_type))
        dataset_brat_folders[dataset_type] = os.path.join(
            parameters['dataset_text_folder'], dataset_type)
        dataset_compatible_with_brat_filepath = os.path.join(
            parameters['dataset_text_folder'],
            '{0}_compatible_with_brat.txt'.format(dataset_type))

        # Conll file exists
        if os.path.isfile(dataset_filepaths[dataset_type]) and os.path.getsize(
                dataset_filepaths[dataset_type]) > 0:
            # Brat text files exist
            if os.path.exists(dataset_brat_folders[dataset_type]) and len(
                    glob.glob(
                        os.path.join(dataset_brat_folders[dataset_type],
                                     '*.txt'))) > 0:

                # Check compatibility between conll and brat files
                brat_to_conll.check_brat_annotation_and_text_compatibility(
                    dataset_brat_folders[dataset_type])
                if os.path.exists(dataset_compatible_with_brat_filepath):
                    dataset_filepaths[
                        dataset_type] = dataset_compatible_with_brat_filepath
                conll_to_brat.check_compatibility_between_conll_and_brat_text(
                    dataset_filepaths[dataset_type],
                    dataset_brat_folders[dataset_type])

            # Brat text files do not exist
            else:

                # Populate brat text and annotation files based on conll file
                conll_to_brat.conll_to_brat(
                    dataset_filepaths[dataset_type],
                    dataset_compatible_with_brat_filepath,
                    dataset_brat_folders[dataset_type],
                    dataset_brat_folders[dataset_type])
                dataset_filepaths[
                    dataset_type] = dataset_compatible_with_brat_filepath

        # Conll file does not exist
        else:
            # Brat text files exist
            if os.path.exists(dataset_brat_folders[dataset_type]) and len(
                    glob.glob(
                        os.path.join(dataset_brat_folders[dataset_type],
                                     '*.txt'))) > 0:
                dataset_filepath_for_tokenizer = os.path.join(
                    parameters['dataset_text_folder'],
                    '{0}_{1}.txt'.format(dataset_type,
                                         parameters['tokenizer']))
                if os.path.exists(dataset_filepath_for_tokenizer):
                    conll_to_brat.check_compatibility_between_conll_and_brat_text(
                        dataset_filepath_for_tokenizer,
                        dataset_brat_folders[dataset_type])
                else:
                    # Populate conll file based on brat files
                    brat_to_conll.brat_to_conll(
                        dataset_brat_folders[dataset_type],
                        dataset_filepath_for_tokenizer,
                        parameters['tokenizer'], parameters['spacylanguage'])
                dataset_filepaths[
                    dataset_type] = dataset_filepath_for_tokenizer

            # Brat text files do not exist
            else:
                del dataset_filepaths[dataset_type]
                del dataset_brat_folders[dataset_type]
                continue

        if parameters['tagging_format'] == 'bioes':
            # Generate conll file with BIOES format
            bioes_filepath = os.path.join(
                parameters['dataset_text_folder'], '{0}_bioes.txt'.format(
                    utils.get_basename_without_extension(
                        dataset_filepaths[dataset_type])))
            utils_nlp.convert_conll_from_bio_to_bioes(
                dataset_filepaths[dataset_type], bioes_filepath)
            dataset_filepaths[dataset_type] = bioes_filepath

    return dataset_filepaths, dataset_brat_folders
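
When tagging_format is 'bioes', the function above rewrites each CoNLL file from BIO to BIOES via utils_nlp.convert_conll_from_bio_to_bioes. A tag-level sketch of that conversion for a single sentence (the real helper operates on whole CoNLL files, not tag lists):

def bio_to_bioes(tags):
    # Convert one sentence's BIO tags to BIOES: S- marks single-token
    # entities and E- marks the final token of a multi-token span.
    bioes = []
    for i, tag in enumerate(tags):
        next_tag = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag == 'O':
            bioes.append('O')
        elif tag.startswith('B-'):
            if next_tag == 'I-' + tag[2:]:
                bioes.append(tag)             # entity continues
            else:
                bioes.append('S-' + tag[2:])  # single-token entity
        elif tag.startswith('I-'):
            if next_tag == 'I-' + tag[2:]:
                bioes.append(tag)             # span continues
            else:
                bioes.append('E-' + tag[2:])  # span ends here
    return bioes


# Example: ['B-PER', 'I-PER', 'O'] -> ['B-PER', 'E-PER', 'O']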
Example No. 25
def main():


    #### Parameters - start
    conf_parameters = configparser.ConfigParser()
    conf_parameters.read(os.path.join('.','parameters.ini'))
    nested_parameters = utils.convert_configparser_to_dictionary(conf_parameters)
    parameters = {}
    for k,v in nested_parameters.items():
        parameters.update(v)
    for k,v in parameters.items():
        if k in ['remove_unknown_tokens','character_embedding_dimension','character_lstm_hidden_state_dimension','token_embedding_dimension','token_lstm_hidden_state_dimension',
                 'patience','maximum_number_of_epochs','maximum_training_time','number_of_cpu_threads','number_of_gpus']:
            parameters[k] = int(v)
        if k in ['dropout_rate']:
            parameters[k] = float(v)
        if k in ['use_character_lstm','is_character_lstm_bidirect','is_token_lstm_bidirect','use_crf']:
            parameters[k] = distutils.util.strtobool(v)
    pprint(parameters)

    # Load dataset
    dataset_filepaths = {}
    dataset_filepaths['train'] = os.path.join(parameters['dataset_text_folder'], 'train.txt')
    dataset_filepaths['valid'] = os.path.join(parameters['dataset_text_folder'], 'valid.txt')
    dataset_filepaths['test']  = os.path.join(parameters['dataset_text_folder'], 'test.txt')
    dataset = ds.Dataset()
    dataset.load_dataset(dataset_filepaths, parameters)


    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            device_count={'CPU': 1, 'GPU': 1},
            allow_soft_placement=True,  # automatically fall back to an existing, supported device if the specified one is unavailable
            log_device_placement=False)

        sess = tf.Session(config=session_conf)

        with sess.as_default():
            model = EntityLSTM(dataset, parameters)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            if parameters['optimizer'] == 'adam':
                optimizer = tf.train.AdamOptimizer(1e-3)
            elif parameters['optimizer'] == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(0.005)
            else:
                raise ValueError("The lr_method parameter must be either adam or sgd.")

            # https://github.com/google/prettytensor/issues/6
            # https://www.tensorflow.org/api_docs/python/framework/graph_collections

            #print('tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ))
            #print('tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) : {0}'.format(tf.get_collection(tf.GraphKeys.MODEL_VARIABLES) ))

            # https://github.com/blei-lab/edward/issues/286#ref-pullrequest-181330211 : utility function to get all tensorflow variables a node depends on


            grads_and_vars = optimizer.compute_gradients(model.loss)

            # By defining a global_step variable and passing it to the optimizer, we let TensorFlow handle the counting of training steps for us.
            # The global step will be automatically incremented by one every time you execute train_op.
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)


            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Load pretrained token embeddings
            if parameters['token_pretrained_embedding_filepath'] != '':
                load_token_embeddings(sess, model.W, dataset, parameters)
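                # load_token_embeddings is defined elsewhere; a typical TF1
                # implementation builds an initial matrix from the embedding
                # file and assigns it to the variable, roughly:
                #   sess.run(model.W.assign(initial_weights))
                # (sketch only, under that assumption; the real call is above)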


            estop = False  # early stop
            start_time = time.time()
            experiment_timestamp = utils.get_current_time_in_miliseconds()
            results = {}
            #results['model_options'] = copy.copy(model_options)
            #results['model_options'].pop('optimizer', None)
            results['epoch'] = {}
            # save/initialize execution details
            results['execution_details'] = {}
            results['execution_details']['train_start'] = start_time
            results['execution_details']['time_stamp'] = experiment_timestamp
            results['execution_details']['early_stop'] = False
            results['execution_details']['keyboard_interrupt'] = False
            results['execution_details']['num_epochs'] = 0
            results['model_options'] = copy.copy(parameters)

            dataset_name = utils.get_basename_without_extension(parameters['dataset_text_folder'])
            model_name = '{0}_{1}'.format(dataset_name, results['execution_details']['time_stamp'])

            output_folder = os.path.join('..', 'output')
            stats_graph_folder = os.path.join(output_folder, model_name)  # folder where graphs are saved
            utils.create_folder_if_not_exists(output_folder)
            print('stats_graph_folder: {0}'.format(stats_graph_folder))
            utils.create_folder_if_not_exists(stats_graph_folder)
            model_folder = os.path.join(stats_graph_folder, 'model')
            utils.create_folder_if_not_exists(model_folder)

            step = 0
            bad_counter = 0
            previous_best_valid_f1_score = 0
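            # Transition scores between labels for CRF decoding: initialized
            # randomly here, then overwritten each step with the values
            # returned by train_step below.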
            transition_params_trained = np.random.rand(len(dataset.unique_labels), len(dataset.unique_labels))
            try:
                while True:
                    epoch_number = math.floor(step / len(dataset.token_indices['train']))
                    print('epoch_number: {0}'.format(epoch_number))

                    epoch_start_time = time.time()

                    #print('step: {0}'.format(step))

                    # Train model: loop over all sequences of training set with shuffling
                    sequence_numbers = list(range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)
                    for sequence_number in sequence_numbers:
                        transition_params_trained = train_step(sess, dataset, sequence_number, train_op, global_step, model, transition_params_trained, parameters)
                        step += 1
                        if sequence_number % 100 == 0:
                            print('.', end='', flush=True)
                            #break

                    # Evaluate model
                    print('step: {0}'.format(step))
                    all_predictions = {}
                    all_y_true = {}
                    output_filepaths = {}
                    for dataset_type in ['train', 'valid', 'test']:
                        print('dataset_type:     {0}'.format(dataset_type))
                        all_predictions[dataset_type], all_y_true[dataset_type], output_filepaths[dataset_type] = evaluate_model(sess, dataset, dataset_type, model, transition_params_trained, step, stats_graph_folder, epoch_number, parameters)
                        model_options = None  # placeholder passed to assess_model.assess_and_save below

                    # Save and plot results
                    # TODO: remove uidx
                    uidx = 0
                    results['epoch'][epoch_number] = []
                    results['execution_details']['num_epochs'] = epoch_number

                    epoch_elapsed_training_time = time.time() - epoch_start_time
                    print('epoch_elapsed_training_time: {0:.2f} seconds'.format(epoch_elapsed_training_time))

                    assess_model.assess_and_save(results, dataset, model_options, all_predictions, all_y_true, stats_graph_folder, epoch_number, uidx, epoch_start_time)
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score')
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score')

                    # CoNLL evaluation script
                    for dataset_type in ['train', 'valid', 'test']:
                        conll_evaluation_script = os.path.join('.', 'conlleval')
                        conll_output_filepath = '{0}_conll_evaluation.txt'.format(output_filepaths[dataset_type])
                        shell_command = 'perl {0} < {1} > {2}'.format(conll_evaluation_script, output_filepaths[dataset_type], conll_output_filepath)
                        print('shell_command: {0}'.format(shell_command))
                        #subprocess.call([shell_command])
                        os.system(shell_command)
                        conll_parsed_output = utils_nlp.get_parsed_conll_output(conll_output_filepath)
                        print('conll_parsed_output: {0}'.format(conll_parsed_output))
                        results['epoch'][epoch_number][0][dataset_type]['conll'] = conll_parsed_output
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll'] = {}
                        results['epoch'][epoch_number][0][dataset_type]['f1_conll']['micro'] = results['epoch'][epoch_number][0][dataset_type]['conll']['all']['f1']
                    assess_model.plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', from_json=False)

                    #end_time = time.time()
                    #results['execution_details']['train_duration'] = end_time - start_time
                    #results['execution_details']['train_end'] = end_time

                    # Early stop
                    valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
                    if valid_f1_score > previous_best_valid_f1_score:
                        bad_counter = 0
                        previous_best_valid_f1_score = valid_f1_score
                    else:
                        bad_counter += 1


                    if bad_counter > parameters['patience']:
                        print('Early Stop!')
                        results['execution_details']['early_stop'] = True
                        break

                    if epoch_number > parameters['maximum_number_of_epochs']:
                        break

            except KeyboardInterrupt:
                results['execution_details']['keyboard_interrupt'] = True
                # assess_model.save_results(results, stats_graph_folder)
                print('Training interrupted')

            print('Finishing the experiment')
            end_time = time.time()
            results['execution_details']['train_duration'] = end_time - start_time
            results['execution_details']['train_end'] = end_time
            assess_model.save_results(results, stats_graph_folder)

    sess.close()  # release the session's resources
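
# Note that the CoNLL evaluation above builds a shell command string and runs
# it through os.system. A minimal sketch of the same call through subprocess,
# assuming the same conlleval script and file paths (run_conlleval is a
# hypothetical helper, not part of this codebase):

import subprocess

def run_conlleval(conll_evaluation_script, prediction_filepath, conll_output_filepath):
    # Feed the prediction file to conlleval on stdin and write its report to
    # the output file, without going through a shell.
    with open(prediction_filepath, 'rb') as input_file, \
         open(conll_output_filepath, 'wb') as output_file:
        subprocess.check_call(['perl', conll_evaluation_script],
                              stdin=input_file, stdout=output_file)

# Unlike os.system, subprocess.check_call raises CalledProcessError when the
# script exits with a non-zero status, so evaluation failures are not
# silently ignored.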