def matching_filenames(
        saved_filenames_path,
        all_filename_paths,
        filename_type=0,
        selector_weights=None):
    '''
    Return the filenames selected for `filename_type`.

    A cached JSON file at `saved_filenames_path` (keys 'filenames' and
    'selectors') is reused when it exists and its selectors already cover
    `filename_type`. Otherwise the filename list is rebuilt from
    `all_filename_paths`, new selectors are generated, and the cache file
    is (re)written before selecting.

    saved_filenames_path: path of the JSON cache file.
    all_filename_paths: iterable of path specs, each passed to
        get_filenames().
    filename_type: which subset to return (compared against
        FILETYPE_TRAINING when selectors are regenerated).
    selector_weights: weights passed to validation_test_selectors().
        Defaults to [VALIDATION_FILE_PERCENT, TEST_FILE_PERCENT].
        For validation/testing the intent is weights such as [80, 20]:
        validate on 80% of our actual files and test on 20%.
        For FILETYPE_TRAINING the weights are not consulted at all;
        training_selectors() is used instead, so that all files can be
        used for training (training files are not the original ones -
        each will be generated).
    '''
    # Resolve the default at call time instead of sharing one mutable
    # list object between every call.
    if selector_weights is None:
        selector_weights = [VALIDATION_FILE_PERCENT, TEST_FILE_PERCENT]

    init_rng()  # Initialize the random number generator.

    # Only a missing cache file is an expected condition; keep the try
    # body limited to the read so other errors are not swallowed.
    try:
        names = get_json_from_file(saved_filenames_path)
    except FileNotFoundError:
        names = None

    # Reuse the cache only if it already knows about this filename type.
    # Otherwise fall through and regenerate the filenames list for the
    # new filename type that is passed in.
    if names is not None and selectors_contain_filename_type(
            names['selectors'], filename_type):
        return select_filenames(names['filenames'], names['selectors'],
                                filename_type)

    all_filenames = []
    for paths in all_filename_paths:
        all_filenames.extend(get_filenames(paths))

    # Some of our directories will have files which have been processed.
    # Ignore those files by filtering them out.
    wanted_suffixes = ('html', 'json', 'expected_json', 'table-extracted',
                       'unescaped')
    all_filenames = sorted(
        fn for fn in all_filenames if fn.endswith(wanted_suffixes))

    if filename_type == FILETYPE_TRAINING:
        selectors = training_selectors(len(all_filenames))
    else:
        selectors = validation_test_selectors(len(all_filenames),
                                              selector_weights)
    names = {
        'filename_type': filename_type,
        'filenames': all_filenames,
        'selectors': selectors
    }
    write_json_to_file(saved_filenames_path, names)
    return select_filenames(names['filenames'], names['selectors'],
                            filename_type)
Beispiel #2
0
    def create_tokens(self):
        '''
        Build the token list, write it as JSON to self.tokens_fn and
        return it.

        The list starts with the '<sos>', '<pad>', '<eos>' markers,
        followed by the special, html-structure and json-structure
        tokens, followed by self.NUM_TOKENS random strings drawn from
        self.all_chars with lengths in
        [self.MIN_DATA_SIZE, self.MAX_DATA_SIZE]. Every token is
        stripped of surrounding whitespace before being written.
        '''
        # Lengths are drawn before anything else so the order in which
        # the numpy RNG is consumed stays the same.
        token_lengths = np.random.randint(self.MIN_DATA_SIZE,
                                          self.MAX_DATA_SIZE + 1,
                                          self.NUM_TOKENS)

        vocabulary = ['<sos>', '<pad>', '<eos>']
        vocabulary += self.special_tokens()
        vocabulary += self.html_structure_tokens()
        vocabulary += self.json_structure_tokens()

        # One random string per drawn length.
        for token_length in token_lengths:
            random_chars = np.random.choice(self.all_chars, token_length)
            vocabulary.append(''.join(random_chars))

        vocabulary = [token.strip() for token in vocabulary]
        write_json_to_file(self.tokens_fn, vocabulary)

        return vocabulary
Beispiel #3
0
def html_to_json():
    '''
    Convert every '*table-extracted' html sample to JSON.

    Each matching file returned by get_filenames() is rendered with
    html_to_image() and the resulting 'out.png' is handed to
    image_to_json(). On success the JSON is written to
    generated_html_json_dir() (named after the part of the filename
    before the first '.') and the source file is copied next to it.
    On failure the error string returned by image_to_json() is recorded.

    A report (processed paths, errors and summary statistics) is printed
    and written to 'html_to_json_processing_results' in the output
    directory.
    '''
    output_dirname = generated_html_json_dir()
    os.makedirs(output_dirname, exist_ok=True)

    # Collect report fragments and join once at the end instead of
    # repeatedly concatenating onto a growing string.
    report_parts = []
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input', '*'):
        # full_filepath = './data/extract/samples/html/html_input/1.html'
        filename = os.path.basename(full_filepath).lower()

        if not filename.endswith('table-extracted'):
            continue
        print(f'{num_all_files}: full_filepath: {full_filepath}')
        report_parts.append(full_filepath + '\n')

        num_all_files += 1
        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            # No exception is being handled here, so
            # traceback.format_exc() would only ever produce
            # 'NoneType: None'. Record the error that image_to_json()
            # actually reported instead.
            report_parts.append(f'{error_str}\n\n')
            continue

        num_files_processed += 1
        output_filename = os.path.join(output_dirname,
                                       filename.split('.')[0] + '.json')
        print(f'output_filename: {output_filename}')
        write_json_to_file(output_filename, json_data)

        output_html_filename = os.path.join(output_dirname, filename)
        copy_file(full_filepath, output_html_filename)

    # Avoid ZeroDivisionError when no sample file matched.
    if num_all_files:
        success_ratio = num_files_processed / num_all_files
    else:
        success_ratio = 'n/a'

    result_stats = f'num_files_processed: {num_files_processed}\n' \
        f'num_all_files: {num_all_files}\n' \
        f'success ratio: {success_ratio}\n'
    print(result_stats)
    report_parts.append(result_stats)
    write_file(os.path.join(output_dirname, 'html_to_json_processing_results'),
               ''.join(report_parts))
Beispiel #4
0
def generate_random_text(input_filenames, num_output_files):
    '''
    Generate `num_output_files` randomized samples from the input files.

    For each output index a random entry of `input_filenames` is picked.
    generate_input() is called with that file, its type (the second
    '.'-separated component of the file name) and the sibling
    '<prefix>.json' file from the same directory. Three files are then
    written under generated_data_dir():
      html/<index>.<type>                  - the generated input
      expected_json/<index>.expected_json  - the expected JSON
      input/<index>.input                  - a copy of the chosen file

    input_filenames: sequence of candidate input file paths; each file
        name must contain at least one '.'.
    num_output_files: number of samples to generate.
    '''
    print('Getting set of all chars in data', end='')
    print(' ... done')

    for file_index in range(num_output_files):
        input_fn = np.random.choice(input_filenames)
        # input_fn = '/Volumes/datadrive/generated-html-json/0001035713_providian_financial_corp__10-k__2004-01-01_2004-12-31_10-k__tables-extracted_split-tables__24.unescaped'

        # To be done again as some of the numbers that should be empty are 9's,
        # even in the html page.
        print('{:6d}: file: {}'.format(file_index, input_fn))

        fn_name = os.path.basename(input_fn).split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]

        # The matching .json lives next to the input file. Derive it from
        # the input's own directory so relative paths stay relative
        # (prefixing os.sep used to turn them into root-anchored paths).
        json_input_fn = os.path.join(os.path.dirname(input_fn),
                                     fn_prefix + '.json')

        output_root = generated_data_dir()
        json_generated_output_fn = os.path.join(
            output_root, 'html', str(file_index) + '.' + fn_type)
        json_expected_output_fn = os.path.join(
            output_root, 'expected_json',
            str(file_index) + '.expected_json')
        input_generated_fn = os.path.join(
            output_root, 'input', str(file_index) + '.input')

        generated_input, json_expected = \
            generate_input(input_fn,
                           fn_type,
                           json_input_fn)

        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)
Beispiel #5
0
def get_json_sequences(out_dirname,
                       filename,
                       json_text,
                       write_number_dict=True):
    '''
    Turn `json_text` into a token sequence plus a number dictionary.

    The text is split with regex_words. Blank matches are dropped,
    matches recognised by is_number() are replaced by the result of
    number_to_sequence(), and everything else is kept as-is. The words
    are then fed to update_seq_and_number_dict(), which fills the
    returned token sequence and number dictionary.

    out_dirname, filename: when `write_number_dict` is True, the number
        dictionary (passed through convert_dict_values()) is written to
        '<basename of filename>.nums' inside `out_dirname`.
    Returns: (token_seq, number_dict).
    Raises: ValueError if is_number() accepts a match that get_number()
        cannot parse.
    '''
    start_word_num = Number.START_WORD_NUM.value
    token_seq = []
    number_dict = {}
    reverse_number_dict = {}

    words = []
    for candidate in regex_words.findall(json_text):
        # Skip whitespace-only matches.
        if not candidate.strip():
            continue

        if not is_number(candidate):
            words.append(candidate)
            continue

        is_negative, num_seq, is_percent = get_number(candidate)
        if num_seq is False:
            raise ValueError(f'match: {candidate} is not a number')
        words.append(number_to_sequence(is_negative, num_seq, is_percent))

    update_seq_and_number_dict(words, token_seq, start_word_num,
                               number_dict, reverse_number_dict)

    if write_number_dict is True:
        nums_path = os.path.join(out_dirname,
                                 filename.split(os.sep)[-1] + '.nums')
        write_json_to_file(nums_path, convert_dict_values(number_dict))

    return token_seq, number_dict
Beispiel #6
0
def get_html_sequences(out_dirname, filename, top_tag, write_number_dict=True):
    '''
    Walk the tree rooted at `top_tag` depth-first and produce a token
    sequence plus a number dictionary.

    For every non-text node the lower-cased tag name is emitted, then
    its attribute names/values (split on whitespace and punctuation),
    then its children, then 'end_<tag name>'. Text nodes
    (NavigableString) are split on whitespace; words that get_number()
    parses become number sequences, all others are split on punctuation.

    out_dirname, filename: when `write_number_dict` is True, the number
        dictionary (passed through convert_dict_values()) is written to
        '<basename of filename>.nums' inside `out_dirname`.
    Returns: (token_seq, number_dict).
    '''
    token_seq = []
    word_num = Number.START_WORD_NUM.value
    number_dict = {}
    reverse_number_dict = {}

    def text_words(text_node):
        # Split on spaces first: a cell may contain e.g. $1,009. Splitting
        # on punctuation straight away would yield 1 and 009, and since
        # 009 is converted to a number, joining them back would give 19.
        # So each space-separated word is first checked as a number
        # (including the characters $,.()%); only non-numbers are split
        # on punctuation.
        collected = []
        for piece in text_node.split():
            # Numbers keep their negative sign and % sign via known
            # marker values, since only unsigned integers are output.
            is_negative, num_seq, is_percent = get_number(piece)
            if num_seq is False:
                collected.extend(split_using_punctuation(piece))
            else:
                # Append (not extend) so the sequence stays one item.
                collected.append(
                    number_to_sequence(is_negative, num_seq, is_percent))
        return collected

    def attribute_words(element):
        # Flatten attribute names/values into punctuation-split words.
        return [
            part
            for name_or_value in get_attr_names_values(element)
            for chunk in name_or_value.split()
            for part in split_using_punctuation(chunk)
        ]

    def record(words):
        # Feed words into the shared sequence and advance the counter.
        nonlocal word_num
        word_num = update_seq_and_number_dict(words, token_seq, word_num,
                                              number_dict,
                                              reverse_number_dict)

    def walk(node):
        if isinstance(node, NavigableString):
            record(text_words(node))
            return word_num

        tag_name = node.name.strip().lower()
        token_seq.append(tag_name)
        record(attribute_words(node))
        for child in node.children:
            walk(child)
        token_seq.append('end_' + tag_name)
        return word_num

    walk(top_tag)

    if write_number_dict is True:
        nums_path = os.path.join(out_dirname,
                                 filename.split(os.sep)[-1] + '.nums')
        write_json_to_file(nums_path, convert_dict_values(number_dict))

    return token_seq, number_dict
Beispiel #7
0
def tokenize_training_set():
    '''
    Tokenize (html, json) file pairs and write the results under
    tokenized_dir().

    File pairs come from one of two places, depending on the `generate`
    flag (a name defined outside this function):
      - generate is True: NUMBER_OF_OUTPUTS random '*.unescaped' files
        from generated_html_json_dir(), each paired with the '.json'
        file of the same prefix in the same directory;
      - otherwise: the '*.unescaped' files in <generated_data_dir>/html
        zipped with the '*.expected_json' files in
        <generated_data_dir>/expected_json.

    Output, controlled by UPDATE_TYPE:
      - SINGLE_FILE: one 'tokenized' file with one
        'html_fn^html_tokens^json_fn^json_tokens' line per pair;
      - otherwise: one '.tokenized' file per input under
        'separate_files', plus a 'file_list' index.
    In both modes a 'tokens' JSON vocabulary and a 'max_token_len' file
    are written as well.
    '''
    def update_max_token_len(html, json, max_len):
        # Lengths are counted in whitespace-separated tokens.
        return max(len(html.split()), len(json.split()), max_len)

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate is True:
        input_fns = list(
            get_filenames(
                [os.path.join(generated_html_json_dir(), '*.unescaped')]))
        html_fns, json_fns = [], []
        for _ in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)

            # The matching .json sits next to the html file. Build it from
            # the html file's own directory so relative paths stay
            # relative (prefixing os.sep used to root-anchor them).
            fn_prefix = os.path.basename(html_fn).split('.')[0]
            json_fn = os.path.join(os.path.dirname(html_fn),
                                   fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)

        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            list(
                get_filenames(
                    [os.path.join(input_path, 'html', '*.unescaped')])),
            list(
                get_filenames([
                    os.path.join(input_path, 'expected_json',
                                 '*.expected_json')
                ])))

    # print(f'combined_fns: {(list(combined_fns))[:2]}')

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=SINGLE_FILE):
        # Record one tokenized pair; in multi-file mode the per-pair
        # files are written immediately.
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens + \
                '^' + json_fn + '^' + json_tokens)
        else:  # multiple files created - one for each set
            # of (html, json) input files
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))

            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        # Write the accumulated index: all pairs (single-file mode) or
        # the list of per-pair output files (multi-file mode).
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(
                os.path.join(output_path, 'separate_files', 'file_list'),
                '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        # html_fn = '/Volumes/Seagate/generated-data/html/0.unescaped'
        # json_fn = '/Volumes/Seagate/generated-data/expected_json/0.expected_json'

        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')
        html_tokens, json_tokens = tokenize_html_json(html_fn,
                                                      json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")

        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Remove json string's quotes at the beginning and end
        # (drops two characters on each side - presumably the quote
        # token and its separating space; TODO confirm).
        json_tokens = json_tokens[2:len(json_tokens) - 2]

        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)

        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())

        file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)

    # Vocabulary: markers first, then the sorted tokens. The marker order
    # '<eos>', '<pad>', '<sos>' is exactly what the previous
    # reverse/extend/reverse sequence produced, so token indices are
    # unchanged.
    tokens = ['<eos>', '<pad>', '<sos>'] + sorted(tokens)

    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)

    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')
Beispiel #8
0
###################################################

# Name of the Firestore collection holding the game data documents.
THEGAMEDATA_PATH = "thegamedata"

###################################################

# Authenticate with the service-account key and open a Firestore client.
cred = credentials.Certificate('firebase/sacckey.json')
default_app = firebase_admin.initialize_app(cred)
db = firestore.client()
print("firebase initialized", db)

###################################################

create_dir("backup", verbose=True)

# Fetch the "pgn" document as a plain dict.
print("retrieving doc")
gamedatacoll = db.collection(THEGAMEDATA_PATH)
thegamepgn_docref = gamedatacoll.document("pgn")
thegamepgn_dict = thegamepgn_docref.get().to_dict()

# to_dict() returns None when the document does not exist. Stop before
# writing anything so an existing local backup is not replaced by `null`
# and no empty payload is sent to the backup document.
if thegamepgn_dict is None:
    raise SystemExit(
        "document 'pgn' not found in collection '" + THEGAMEDATA_PATH +
        "', backup aborted")

# Local copy first, then a second copy inside the same collection.
print("writing json")
write_json_to_file("backup/pgn.json", thegamepgn_dict)

print("setting backup in db")
thegamepgn_backup_docref = gamedatacoll.document("backuppgn")
thegamepgn_backup_docref.set(thegamepgn_dict)

print("backup done")

###################################################