Example #1
def decode_training_files():
    # Decode the '*.encoded' training files (html and expected_json) back into
    # token text, using the shared 'tokens' file in the generated data dir.
    paths = [os.path.join(generated_data_dir(), 'html',
                          'encoded', '*.encoded'),
             os.path.join(generated_data_dir(), 'expected_json',
                          'encoded', '*.encoded')]
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)
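
These examples rely on a handful of project helpers (generated_data_dir, get_filenames, decode_all_files, encode_all_html_tables, and so on) whose definitions are not shown. A minimal sketch of the two that recur in every example, assuming generated_data_dir simply returns a configured output directory and get_filenames expands glob patterns, might look like this:

import glob
import os

def generated_data_dir():
    # Hypothetical: root directory holding all generated training data.
    return os.environ.get('GENERATED_DATA_DIR', '/tmp/generated-data')

def get_filenames(path_patterns):
    # Hypothetical: expand each glob pattern and yield the matching filenames.
    for pattern in path_patterns:
        for filename in glob.glob(pattern):
            yield filename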
Example #2
def encode_training_files():
    # Encode the unescaped html tables and their expected_json counterparts,
    # passing along where the processed filenames and the tokens are saved.
    paths = [
        os.path.join(generated_data_dir(), 'html', '*.unescaped'),
        os.path.join(generated_data_dir(), 'expected_json', '*.expected_json')
    ]
    saved_filenames_path = os.path.join(generated_data_dir(),
                                        'training_filenames')
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    encode_all_html_tables(FILETYPE_TRAINING, paths, saved_filenames_path,
                           tokens_path)
Example #3
def generate_samples():
    # Create the output directories, collect the '*.unescaped' source files,
    # and generate NUMBER_OF_OUTPUT_FILES random samples from them.

    create_dirs([os.path.join(generated_data_dir(), 'html'),
                 os.path.join(generated_data_dir(), 'expected_json'),
                 os.path.join(generated_data_dir(), 'input')])

    data_filenames = []
    for samples_dir in [generated_html_json_dir()]:
        sorted_files = sorted(list(get_filenames([os.path.join(samples_dir, '*')])))
        sorted_files = list(filter(lambda x: x.endswith('unescaped'),
                                   sorted_files))
        data_filenames.extend(sorted_files)

    generate_random_text(data_filenames, NUMBER_OF_OUTPUT_FILES)
Example #4
    def __init__(self):
        self.all_chars = self.set_of_all_chars_in_data()

        self.regex_number_token = re.compile(r'^num_\d+$')
        self.MIN_DATA_SIZE = 5
        self.MAX_DATA_SIZE = 20

        self.NUM_TOKENS = 1000

        self.tokens_fn = os.path.join(generated_data_dir(), 'tokens')
        if os.path.exists(self.tokens_fn):
            self.tokens = get_json_from_file(self.tokens_fn)
        else:
            self.tokens = self.create_tokens()
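
The constructor above uses a load-or-create pattern for the token dictionary: reuse the cached 'tokens' file when it exists, otherwise build the tokens from scratch. A standalone sketch of the same logic, assuming the file holds plain JSON (the helper name below is illustrative, not from the original code):

import json
import os

def load_or_create_tokens(tokens_fn, create_tokens):
    # Mirrors the __init__ logic: read the cached tokens file if present,
    # otherwise fall back to building the tokens.
    if os.path.exists(tokens_fn):
        with open(tokens_fn) as f:
            return json.load(f)
    return create_tokens()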
Example #5
def generate_random_text(input_filenames, num_output_files):
    # For each output id, pick a random input file, derive its matching
    # '.json' file, and write the generated html, the expected json, and a
    # copy of the input into the generated data directories.
    print('Getting set of all chars in data', end='')
    print(' ... done')

    for id in range(num_output_files):
        input_fn = np.random.choice(input_filenames)
        # input_fn = '/Volumes/datadrive/generated-html-json/0001035713_providian_financial_corp__10-k__2004-01-01_2004-12-31_10-k__tables-extracted_split-tables__24.unescaped'

        # To be redone: some numbers that should be empty come out as 9's,
        # even in the html page.
        print('{:6d}: file: {}'.format(id, input_fn))

        fn_parts = input_fn.split(os.sep)
        fn_name = fn_parts[-1].split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]

        json_input_fn = os.sep + os.path.join(*fn_parts[:-1],
                                              fn_prefix + '.json')
        json_generated_output_fn = os.path.join(generated_data_dir(),
                                                'html',
                                                str(id) + '.' + fn_type)
        json_expected_output_fn = os.path.join(generated_data_dir(),
                                               'expected_json',
                                               str(id) + '.expected_json')

        input_generated_fn = os.path.join(generated_data_dir(),
                                          'input',
                                          str(id) + '.input')

        generated_input, json_expected = \
            generate_input(input_fn,
                           fn_type,
                           json_input_fn)

        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)
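
The filename handling above splits the input path on os.sep and the basename on '.', then derives the matching '.json' input and the per-id output names. A worked sketch with a hypothetical path (the directory and id below are made up for illustration):

import os

input_fn = '/data/source/0001035713_example__24.unescaped'  # hypothetical
fn_parts = input_fn.split(os.sep)
fn_prefix, fn_type = fn_parts[-1].split('.')[0], fn_parts[-1].split('.')[1]
json_input_fn = os.sep + os.path.join(*fn_parts[:-1], fn_prefix + '.json')
# json_input_fn == '/data/source/0001035713_example__24.json'
# For id == 7 the outputs become:
#   html:          <generated_data_dir()>/html/7.unescaped
#   expected json: <generated_data_dir()>/expected_json/7.expected_json
#   input copy:    <generated_data_dir()>/input/7.input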
Example #6
def train_set_max_token_len():
    # Scan the training html and expected_json files and record the largest
    # whitespace token count in a 'max_token_len' file.
    print('Getting filenames ...', end=' ')
    base_path = generated_data_dir()
    fns = list(get_filenames([os.path.join(base_path, 'html', '*.unescaped')]))
    fns.extend(list(get_filenames([os.path.join(base_path, 'expected_json',
                                                '*.expected_json')])))
    print('done')

    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()

    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
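
The length measure here is just the whitespace token count of each file, so the loop is equivalent to the one-liner below (reusing the read_file helper assumed elsewhere in these examples):

max_token_len = max((len(read_file(fn).split()) for fn in fns), default=0)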
Example #7
def tokenize_training_set():
    # Tokenize pairs of (html, expected_json) files, collect the vocabulary
    # and the maximum token length, and write the results either to a single
    # 'tokenized' file or to one file per input pair, depending on UPDATE_TYPE.
    # ('generate', NUMBER_OF_OUTPUTS, SINGLE_FILE and UPDATE_TYPE are expected
    # to be module-level settings.)
    def update_max_token_len(html, json, max_len):
        html_len, json_len = len(html.split()), len(json.split())
        return max(html_len, max(json_len, max_len))

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate is True:
        input_fns = list(
            get_filenames(
                [os.path.join(generated_html_json_dir(), '*.unescaped')]))
        html_fns, json_fns = [], []
        for id in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)

            fn_parts = html_fn.split(os.sep)
            fn_name = fn_parts[-1].split('.')
            fn_prefix, fn_type = fn_name[0], fn_name[1]

            json_fn = os.sep + os.path.join(*fn_parts[:-1],
                                            fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)

        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            list(
                get_filenames(
                    [os.path.join(input_path, 'html', '*.unescaped')])),
            list(
                get_filenames([
                    os.path.join(input_path, 'expected_json',
                                 '*.expected_json')
                ])))

    # print(f'combined_fns: {(list(combined_fns))[:2]}')

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=SINGLE_FILE):
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens + \
                '^' + json_fn + '^' + json_tokens)
        else:  # multiple files created - one for each set
            # of (html, json) input files
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))

            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(
                os.path.join(output_path, 'separate_files', 'file_list'),
                '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        # html_fn = '/Volumes/Seagate/generated-data/html/0.unescaped'
        # json_fn = '/Volumes/Seagate/generated-data/expected_json/0.expected_json'

        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')
        html_tokens, json_tokens = tokenize_html_json(html_fn,
                                                      json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")

        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Strip the first two and last two characters (the json string's
        # surrounding quotes).
        json_tokens = json_tokens[2:len(json_tokens) - 2]

        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)

        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())

        file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)
    tokens = sorted(list(tokens))
    # Prepend the special tokens by reversing, appending, and reversing again;
    # the final list is ['<eos>', '<pad>', '<sos>'] followed by the sorted
    # tokens.
    tokens.reverse()
    tokens.extend(['<sos>', '<pad>', '<eos>'])
    tokens.reverse()

    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)

    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
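
In SINGLE_FILE mode each record written to the 'tokenized' file is one line with four '^'-separated fields: html filename, html tokens, json filename, json tokens. A consumer would split accordingly (a sketch, assuming '^' never appears inside the token strings):

def read_tokenized_records(tokenized_fn):
    # Yield (html_fn, html_tokens, json_fn, json_tokens) tuples from the
    # single-file output produced by tokenize_training_set above.
    with open(tokenized_fn) as f:
        for line in f:
            html_fn, html_tokens, json_fn, json_tokens = \
                line.rstrip('\n').split('^')
            yield html_fn, html_tokens, json_fn, json_tokens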
Example #8
def all_encodings(filenames, base_dirname, tokens_path):
    # Encode every html/json file, writing per-company token files and the
    # encoded outputs, then merge all companies' tokens into a single file.

    # Since we're writing tokens to a file for each company,
    # and later merging these tokens, the token number
    # must always keep incrementing. This way, our dictionary with
    # (token_num: token_value) will not miss any tokens.

    out_dirname_json = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'expected_json',
                     'encoded')
    out_dirname_html = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'html',
                     'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    current_company_dir = ''
    token_num = Number.START_WORD_NUM.value
    tokens = set()
    tokens_filename = ''
    # num_dirs_to_process = 3
    for filename in filenames:
        # filename = '/Volumes/datadrive/tags-cleaned/0000707605_AMERISERV_FINANCIAL_INC__PA_/10-k/2018-01-01_2018-12-31_10-K/tables-extracted/162.table-extracted'
        print(f'filename: {filename}')
        text = read_file(filename)

        company_dir_idx = len(base_dirname)
        if base_dirname == generated_data_dir():
            company_dir = ''
        else:
            company_dir = filename[company_dir_idx + 1:].split(os.sep)[0]

        if current_company_dir != company_dir:
            if len(tokens) > 0:
                write_tokens_file(tokens, tokens_filename, token_num)
                token_num += len(tokens)
                del tokens

            tokens = set()
            current_company_dir = company_dir
            # num_dirs_to_process -= 1
            # if num_dirs_to_process <= 0:
            #     break
        else:
            # Keep assigning tokens_filename here so the for-loop's else
            # clause below still has access to the last filename processed.
            tokens_filename = get_tokens_filename(filename, company_dir_idx,
                                                  company_dir, "tokens")

        if filename.endswith('unescaped') or filename.endswith('html') \
           or filename.endswith('table-extracted'):
            find_html_table_encodings(out_dirname_html, filename, text, tokens)
        elif filename.endswith('json'):
            find_json_encodings(out_dirname_json, filename, text, tokens)
    else:
        write_tokens_file(tokens, tokens_filename, token_num)

    all_tokens_filename = os.path.join(base_dirname, 'tokens')

    all_tokens = set()
    for filename in get_filenames([tokens_path]):

        tokens = read_tokens_file(filename)
        all_tokens.update(get_token_values(tokens))

    print(f'len(all_tokens): {len(all_tokens)}')

    # We need to give the offset as the last value in this function call.
    # This allows us to interpret the value of 1 as the start of a
    # number sequence, and not confuse it with an entry in the tokens
    # file that has key = 1.
    write_tokens_file(all_tokens, all_tokens_filename,
                      Number.START_WORD_NUM.value)
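
write_tokens_file itself is not shown; the numbering comments above only hold if it assigns consecutive integers to its tokens starting at the given offset. A minimal sketch under that assumption (the body below is assumed, only the name and arguments come from these examples):

import json

def write_tokens_file(tokens, tokens_filename, start_num):
    # Assumed behaviour: number the tokens consecutively from start_num and
    # store the resulting (token_num: token_value) mapping as json.
    numbered = {start_num + i: token for i, token in enumerate(sorted(tokens))}
    with open(tokens_filename, 'w') as f:
        json.dump(numbered, f)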