Example no. 1
def unescape_all_tables(search_path):
    for filename in get_filenames(search_path):
        print(f'Un-escaping file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.unescaped'
        converted = html.unescape(read_file(filename))
        write_file(out_filename, converted)
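
Almost every example leans on a get_filenames helper that is not shown here. A minimal sketch, assuming it is a thin wrapper around glob (the signature is an assumption; the examples call it with a list of glob patterns, a single pattern string, or separate path components):

import glob
import os


def get_filenames(paths, *parts):
    # Hypothetical sketch. Normalize the three calling styles seen in these
    # examples, then glob each pattern:
    #   get_filenames([pattern, ...])       - a list of glob patterns
    #   get_filenames(pattern)              - a single glob pattern
    #   get_filenames(dirname, 'sub', '*')  - path components to join
    if parts:
        paths = [os.path.join(paths, *parts)]
    elif isinstance(paths, str):
        paths = [paths]
    for pattern in paths:
        yield from sorted(glob.glob(pattern))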
Example no. 2
def decode_training_files():
    paths = [os.path.join(generated_data_dir(), 'html',
                          'encoded', '*.encoded'),
             os.path.join(generated_data_dir(), 'expected_json',
                          'encoded', '*.encoded')]
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)
Example no. 3
def find_all_encodings(file_type, paths, saved_filenames_path, tokens_path):
    # filenames = matching_filenames(saved_filenames_path,
    #                                paths,
    #                                file_type)
    filenames = get_filenames(paths)
    print('Starting all encodings')
    base_dirname = os.sep.join(saved_filenames_path.split(os.sep)[:-1])
    all_encodings(filenames, base_dirname, tokens_path)
Example no. 4
def encode_all_html_tables(file_type, paths, saved_filenames_path,
                           tokens_path):

    # Read the tokens file and flip it into a value:token dictionary,
    # so we can look up the token for each value we encounter
    # while encoding every file.
    tokens = read_tokens_file(tokens_path)
    tokens = flip_tokens_keys_values(tokens)

    # num_dirs_to_process = 3
    # current_company_dir = ''
    out_dirname_json = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'expected_json',
                     'encoded')
    out_dirname_html = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'html',
                     'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    max_encoded_file_token_len = 0
    # filenames = matching_filenames(saved_filenames_path,
    #                                paths,
    #                                file_type)
    filenames = get_filenames(paths)

    for filename in filenames:

        # company_dir_idx = len(cleaned_tags_dir())
        # company_dir = filename[company_dir_idx+1:].split(os.sep)[0]

        # if current_company_dir != company_dir:
        #     current_company_dir = company_dir
        #     num_dirs_to_process -= 1
        #     if num_dirs_to_process <= 0:
        #         break

        # filename = '/Volumes/datadrive/tags-cleaned/0000707605_AMERISERV_FINANCIAL_INC__PA_/10-k/2018-01-01_2018-12-31_10-K/tables-extracted/162.table-extracted'
        print(f'filename: {filename}')
        file_data = read_file(filename)

        if filename.endswith('json'):
            token_len = encode_json(out_dirname_json, filename, file_data,
                                    tokens)
        else:
            token_len = encode_html_table(out_dirname_html, filename,
                                          file_data, tokens)

        max_encoded_file_token_len = max(max_encoded_file_token_len, token_len)

    with open(os.path.join(out_dirname_json, 'max_encoded_file_token_len'),
              'w') as f:
        f.write(f'max_encoded_file_token_len={max_encoded_file_token_len}')
    with open(os.path.join(out_dirname_html, 'max_encoded_file_token_len'),
              'w') as f:
        f.write(f'max_encoded_file_token_len={max_encoded_file_token_len}')
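
read_tokens_file and flip_tokens_keys_values are assumed helpers. Going by the comment above, the tokens file maps token numbers to token values, and the flip simply inverts that dictionary (a minimal sketch; the JSON layout is an assumption):

import json


def read_tokens_file(tokens_path):
    # Hypothetical sketch: the tokens file is assumed to be JSON mapping
    # token numbers to token values.
    with open(tokens_path) as f:
        return json.load(f)


def flip_tokens_keys_values(tokens):
    # Invert {token_number: value} into {value: token_number} so the encoder
    # can look up the token number for each value it encounters.
    return {value: token_num for token_num, value in tokens.items()}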
Example no. 5
def remove_single_parens(search_path):

    for filename in get_filenames(search_path):
        # filename = '/Volumes/datadrive/generated-html-json/0001035713_providian_financial_corp__10-k__2004-01-01_2004-12-31_10-k__tables-extracted_split-tables__24.table-extracted'

        print(f'Removing single parens from file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.remove-single-parens'

        top_tag = handle_single_parens(read_file(filename))
        write_file(out_filename, str(top_tag))
Example no. 6
def train_set_max_token_len():
    print('Getting filenames ...', end=' ')
    base_path = generated_data_dir()
    fns = list(get_filenames([os.path.join(base_path, 'html', '*.unescaped')]))
    fns.extend(list(get_filenames([os.path.join(base_path, 'expected_json',
                                                '*.expected_json')])))
    print('done')

    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()

    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
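
ChargingBar is presumably the progress-bar class from the third-party progress package; the import these snippets assume would be:

from progress.bar import ChargingBar  # pip install progress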
Example no. 7
def check_hand_created_samples():
    result = True
    for samples_dir, input_name in \
        zip([text_samples_dir(), html_samples_dir()],
            ['text_input', 'html_input']):
        data_filenames = get_filenames(samples_dir, input_name, '*')
        json_filenames = get_filenames(samples_dir, 'json_input', '*')
        data_filenames = sorted(data_filenames)
        json_filenames = sorted(json_filenames)
        for d_fn, j_fn in zip(data_filenames, json_filenames):
            print(f'Checking:\n  {d_fn}\n  {j_fn}\n')
            input_data = read_file(d_fn)
            json_input_data = get_json_from_file(j_fn)

            if data_contains_all_elements(input_data,
                                          json_input_data) is False:
                print(f'Errors found in:\n  input: {d_fn}\n'
                      f'  json_input: {j_fn}')
                result = False
    return result
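
data_contains_all_elements is not shown. A hedged sketch of what such a check might do is to walk the expected JSON and confirm that every leaf value occurs somewhere in the raw input (the behaviour here is an assumption):

def data_contains_all_elements(input_data, json_input_data):
    # Hypothetical sketch: yield every leaf value in the expected JSON and
    # verify that each one appears in the raw input text.
    def leaf_values(node):
        if isinstance(node, dict):
            for value in node.values():
                yield from leaf_values(value)
        elif isinstance(node, list):
            for value in node:
                yield from leaf_values(value)
        else:
            yield str(node)

    return all(value in input_data for value in leaf_values(json_input_data))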
Example no. 8
def matching_filenames(
        saved_filenames_path,
        all_filename_paths,
        filename_type=0,
        selector_weights=[VALIDATION_FILE_PERCENT, TEST_FILE_PERCENT]):
    '''
    selector_weights: For training, selector weights will be [100, 0].
    This is so we can use all the files for training. Our training
    files are not the original ones - each will be generated.
    For validation/testing, we want selector weights to be [80, 20].
    This means we will validate on 80% of our actual files,
    and test on 20%.
    '''

    init_rng()  # Initialize the random number generator.

    try:
        names = get_json_from_file(saved_filenames_path)

        # This will allow us to regenerate the filenames list
        # for the new filename type that is passed in.
        if not selectors_contain_filename_type(names['selectors'],
                                               filename_type):
            raise FileNotFoundError

        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
    except FileNotFoundError:
        all_filenames = []
        for paths in all_filename_paths:
            all_filenames.extend(get_filenames(paths))

        # Some of our directories will have files which have been processed.
        # Ignore those files by filtering them out.
        all_filenames = [
            fn for fn in all_filenames
            if fn.endswith(('html', 'json', 'expected_json', 'table-extracted',
                            'unescaped'))
        ]
        all_filenames.sort()

        if filename_type == FILETYPE_TRAINING:
            selectors = training_selectors(len(all_filenames))
        else:
            selectors = validation_test_selectors(len(all_filenames),
                                                  selector_weights)
        names = {
            'filename_type': filename_type,
            'filenames': all_filenames,
            'selectors': selectors
        }
        write_json_to_file(saved_filenames_path, names)
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)
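
The selector helpers used above are not shown. One way validation_test_selectors and select_filenames might work, consistent with the docstring's [80, 20] split (a hedged sketch; the FILETYPE_VALIDATION and FILETYPE_TEST values are assumptions):

import random

# Assumed constants; only FILETYPE_TRAINING appears in the code above.
FILETYPE_VALIDATION, FILETYPE_TEST = 1, 2


def validation_test_selectors(num_filenames, selector_weights):
    # Hypothetical sketch: draw one file-type label per filename using the
    # percentage weights, e.g. [80, 20] -> ~80% validation, ~20% test.
    return random.choices([FILETYPE_VALIDATION, FILETYPE_TEST],
                          weights=selector_weights, k=num_filenames)


def select_filenames(filenames, selectors, filename_type):
    # Hypothetical sketch: keep only the filenames whose selector matches
    # the requested file type.
    return [fn for fn, sel in zip(filenames, selectors)
            if sel == filename_type]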
Example no. 9
def generate_samples():

    create_dirs([os.path.join(generated_data_dir(), 'html'),
                 os.path.join(generated_data_dir(), 'expected_json'),
                 os.path.join(generated_data_dir(), 'input')])

    data_filenames = []
    for samples_dir in [generated_html_json_dir()]:
        sorted_files = sorted(
            get_filenames([os.path.join(samples_dir, '*')]))
        sorted_files = list(filter(lambda x: x.endswith('unescaped'),
                                   sorted_files))
        data_filenames.extend(sorted_files)

    generate_random_text(data_filenames, NUMBER_OF_OUTPUT_FILES)
Example no. 10
def clean_all_tables(input_paths):
    for filename in get_filenames(input_paths):
        prefix = filename.split(os.sep)[-1].split('.')[0]
        out_filename = os.path.join(generated_html_json_dir(),
                                    prefix + '.cleaned')

        print(f'filename: {filename}')
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')

        remove_tags(table_tag)

        out_dirname_parts = out_filename.split(os.sep)[:-1]
        ensure_dir_exists(os.path.join(os.sep, *out_dirname_parts))

        write_file(out_filename, table_tag.prettify())
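
ensure_dir_exists and create_dirs are assumed helpers; the obvious sketch is a thin wrapper over os.makedirs (create_dirs is called with both a single directory name and a list elsewhere in these examples, so the sketch accepts either):

import os


def ensure_dir_exists(dirname):
    # Hypothetical sketch: create the directory (and any parents) if missing.
    os.makedirs(dirname, exist_ok=True)


def create_dirs(dirnames):
    # Hypothetical sketch: accept one directory name or a list of them.
    if isinstance(dirnames, str):
        dirnames = [dirnames]
    for dirname in dirnames:
        ensure_dir_exists(dirname)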
Example no. 11
def test_set_max_token_len():
    print('Getting filenames ...', end=' ')
    base_path = tables_extracted_split_tables_dir()
    fns = list(
        get_filenames(
            [os.path.join(base_path, '*', '10-k', '*', '*.table-extracted')]))
    print('done.')
    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()

    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
Example no. 12
def find_unprocessed_tag_names():
    unprocessed_tags = set()
    unprocessed_tags_exist = False
    for filename in get_filenames(extracted_tables_dir(),
                                  '*', '10-k', '*', '*', '*'):
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')

        descendant_tag_names = find_descendant_tag_names(table_tag.descendants)

        diff = descendant_tag_names - set(tag_actions.keys())

        unprocessed_tags.update(diff)
        if len(diff) > 0:
            unprocessed_tags_exist = True
            print(f'filename: {filename}')
            print(f'unprocessed_tags: {unprocessed_tags}')

    if unprocessed_tags_exist:
        print(f'unprocessed_tags: {unprocessed_tags}')
    else:
        print('No unprocessed tags found')
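
find_descendant_tag_names is an assumed helper; with BeautifulSoup it could be as small as collecting the name of every descendant that is an actual Tag (a sketch):

from bs4 import Tag


def find_descendant_tag_names(descendants):
    # Collect the tag name of every descendant that is a Tag;
    # NavigableString nodes are skipped.
    return {d.name for d in descendants if isinstance(d, Tag)}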
Example no. 13
def html_to_json():
    output_dirname = generated_html_json_dir()
    os.makedirs(output_dirname, exist_ok=True)

    result_string = ''
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input', '*'):
        # full_filepath = './data/extract/samples/html/html_input/1.html'
        filename = full_filepath.split(os.sep)[-1].lower()

        if not filename.endswith('table-extracted'):
            continue
        print(f'{num_all_files}: full_filepath: {full_filepath}')
        result_string += full_filepath + '\n'

        num_all_files += 1
        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            result_string += traceback.format_exc() + '\n\n'
        else:
            num_files_processed += 1
            output_filename = \
                os.path.join(output_dirname,
                             filename.split('.')[0] + '.json')
            print(f'output_filename: {output_filename}')
            write_json_to_file(output_filename, json_data)

            output_html_filename = os.path.join(output_dirname, filename)
            copy_file(full_filepath, output_html_filename)

    result_stats = f'num_files_processed: {num_files_processed}\n' \
        f'num_all_files: {num_all_files}\n' \
        f'success ratio: {num_files_processed / num_all_files}\n'
    print(result_stats)
    result_string += result_stats
    write_file(os.path.join(output_dirname, 'html_to_json_processing_results'),
               result_string)
Example no. 14
def tokenize_training_set(generate=False):
    def update_max_token_len(html, json, max_len):
        html_len, json_len = len(html.split()), len(json.split())
        return max(html_len, max(json_len, max_len))

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate:
        input_fns = list(
            get_filenames(
                [os.path.join(generated_html_json_dir(), '*.unescaped')]))
        html_fns, json_fns = [], []
        for _ in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)

            fn_parts = html_fn.split(os.sep)
            fn_name = fn_parts[-1].split('.')
            fn_prefix, fn_type = fn_name[0], fn_name[1]

            json_fn = os.sep + os.path.join(*fn_parts[:-1],
                                            fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)

        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            list(
                get_filenames(
                    [os.path.join(input_path, 'html', '*.unescaped')])),
            list(
                get_filenames([
                    os.path.join(input_path, 'expected_json',
                                 '*.expected_json')
                ])))

    # print(f'combined_fns: {(list(combined_fns))[:2]}')

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=SINGLE_FILE):
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens + \
                '^' + json_fn + '^' + json_tokens)
        else:  # multiple files created - one for each set
            # of (html, json) input files
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))

            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(
                os.path.join(output_path, 'separate_files', 'file_list'),
                '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        # html_fn = '/Volumes/Seagate/generated-data/html/0.unescaped'
        # json_fn = '/Volumes/Seagate/generated-data/expected_json/0.expected_json'

        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')
        html_tokens, json_tokens = tokenize_html_json(html_fn,
                                                      json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")

        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Remove json string's quotes at the beginning and end
        json_tokens = json_tokens[2:len(json_tokens) - 2]

        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)

        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())

        file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)
    tokens = sorted(list(tokens))
    tokens.reverse()
    tokens.extend(['<sos>', '<pad>', '<eos>'])
    tokens.reverse()

    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)

    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
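
The reverse/extend/reverse sequence at the end of tokenize_training_set is easy to misread: it leaves the special tokens at the front of the vocabulary, in the order <eos>, <pad>, <sos>, followed by the remaining tokens in ascending order. A small self-contained check of that equivalence:

vocab = {'tr', 'td', 'table'}          # example token set
tokens = sorted(vocab)                 # ascending
tokens.reverse()                       # descending
tokens.extend(['<sos>', '<pad>', '<eos>'])
tokens.reverse()
assert tokens == ['<eos>', '<pad>', '<sos>'] + sorted(vocab)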
Example no. 15
def all_encodings(filenames, base_dirname, tokens_path):

    # We write a tokens file per company and merge them later, so the
    # token number must keep incrementing across companies. That way no
    # two entries in the merged (token_num: token_value) dictionary
    # collide.

    out_dirname_json = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'expected_json',
                     'encoded')
    out_dirname_html = \
        os.path.join(os.sep.join(tokens_path.split(os.sep)[:-1]),
                     'html',
                     'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    current_company_dir = ''
    token_num = Number.START_WORD_NUM.value
    tokens = set()
    tokens_filename = ''
    # num_dirs_to_process = 3
    for filename in filenames:
        # filename = '/Volumes/datadrive/tags-cleaned/0000707605_AMERISERV_FINANCIAL_INC__PA_/10-k/2018-01-01_2018-12-31_10-K/tables-extracted/162.table-extracted'
        print(f'filename: {filename}')
        text = read_file(filename)

        company_dir_idx = len(base_dirname)
        if base_dirname == generated_data_dir():
            company_dir = ''
        else:
            company_dir = filename[company_dir_idx + 1:].split(os.sep)[0]

        if current_company_dir != company_dir:
            if len(tokens) > 0:
                write_tokens_file(tokens, tokens_filename, token_num)
                token_num += len(tokens)
                del tokens

            tokens = set()
            current_company_dir = company_dir
            # num_dirs_to_process -= 1
            # if num_dirs_to_process <= 0:
            #     break
        else:
            # Keep assigning tokens_filename here so that the for-loop's
            # else clause below (which runs once the loop finishes) still
            # has a tokens filename for the last company processed.
            tokens_filename = get_tokens_filename(filename, company_dir_idx,
                                                  company_dir, "tokens")

        if filename.endswith('unescaped') or filename.endswith('html') \
           or filename.endswith('table-extracted'):
            find_html_table_encodings(out_dirname_html, filename, text, tokens)
        elif filename.endswith('json'):
            find_json_encodings(out_dirname_json, filename, text, tokens)
    else:
        write_tokens_file(tokens, tokens_filename, token_num)

    all_tokens_filename = os.path.join(base_dirname, 'tokens')

    all_tokens = set()
    for filename in get_filenames([tokens_path]):

        tokens = read_tokens_file(filename)
        all_tokens.update(get_token_values(tokens))

    print(f'len(all_tokens): {len(all_tokens)}')

    # Pass the starting offset as the last argument so that a value of 1
    # can be interpreted as the start of a number sequence rather than
    # being confused with a tokens-file entry whose key is 1.
    write_tokens_file(all_tokens, all_tokens_filename,
                      Number.START_WORD_NUM.value)
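
write_tokens_file and get_token_values are assumed helpers. Given the comment about the ever-incrementing token number, one plausible sketch (the JSON layout is an assumption):

import json


def write_tokens_file(tokens, tokens_filename, token_num):
    # Hypothetical sketch: number the tokens starting at token_num so that
    # numbers never collide across per-company token files, then persist
    # the {token_number: token_value} mapping as JSON.
    numbered = {str(token_num + i): value
                for i, value in enumerate(sorted(tokens))}
    with open(tokens_filename, 'w') as f:
        json.dump(numbered, f)


def get_token_values(tokens):
    # Hypothetical sketch: pull just the token values out of the mapping
    # returned by read_tokens_file.
    return set(tokens.values())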
Example no. 16
def decode_validation_test_files():
    paths = os.path.join(cleaned_tags_dir(),
                         '*', '10-k', '*', '*', '*.encoded')
    tokens_path = os.path.join(cleaned_tags_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)