def decode_training_files():
    paths = [os.path.join(generated_data_dir(), 'html', 'encoded', '*.encoded'),
             os.path.join(generated_data_dir(), 'expected_json', 'encoded', '*.encoded')]
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    decode_all_files(get_filenames(paths), tokens_path)

def encode_training_files():
    paths = [os.path.join(generated_data_dir(), 'html', '*.unescaped'),
             os.path.join(generated_data_dir(), 'expected_json', '*.expected_json')]
    saved_filenames_path = os.path.join(generated_data_dir(), 'training_filenames')
    tokens_path = os.path.join(generated_data_dir(), 'tokens')
    encode_all_html_tables(FILETYPE_TRAINING, paths, saved_filenames_path, tokens_path)

def generate_samples():
    create_dirs([os.path.join(generated_data_dir(), 'html'),
                 os.path.join(generated_data_dir(), 'expected_json'),
                 os.path.join(generated_data_dir(), 'input')])

    data_filenames = []
    for samples_dir in [generated_html_json_dir()]:
        sorted_files = sorted(get_filenames([os.path.join(samples_dir, '*')]))
        sorted_files = [fn for fn in sorted_files if fn.endswith('unescaped')]
        data_filenames.extend(sorted_files)

    generate_random_text(data_filenames, NUMBER_OF_OUTPUT_FILES)

def __init__(self):
    self.all_chars = self.set_of_all_chars_in_data()
    self.regex_number_token = re.compile(r'^num_\d+$')
    self.MIN_DATA_SIZE = 5
    self.MAX_DATA_SIZE = 20
    self.NUM_TOKENS = 1000
    # Reuse a previously generated token vocabulary if one exists on disk;
    # otherwise build it from scratch.
    self.tokens_fn = os.path.join(generated_data_dir(), 'tokens')
    if os.path.exists(self.tokens_fn):
        self.tokens = get_json_from_file(self.tokens_fn)
    else:
        self.tokens = self.create_tokens()

def generate_random_text(input_filenames, num_output_files):
    print('Getting set of all chars in data', end='')
    print(' ... done')

    for id in range(num_output_files):
        input_fn = np.random.choice(input_filenames)
        # input_fn = '/Volumes/datadrive/generated-html-json/0001035713_providian_financial_corp__10-k__2004-01-01_2004-12-31_10-k__tables-extracted_split-tables__24.unescaped'
        # To be done again, as some of the numbers that should be empty are 9's,
        # even in the html page.
        print('{:6d}: file: {}'.format(id, input_fn))

        fn_parts = input_fn.split(os.sep)
        fn_name = fn_parts[-1].split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]
        json_input_fn = os.sep + os.path.join(*fn_parts[:-1], fn_prefix + '.json')

        json_generated_output_fn = os.path.join(generated_data_dir(), 'html',
                                                str(id) + '.' + fn_type)
        json_expected_output_fn = os.path.join(generated_data_dir(), 'expected_json',
                                               str(id) + '.expected_json')
        input_generated_fn = os.path.join(generated_data_dir(), 'input',
                                          str(id) + '.input')

        generated_input, json_expected = \
            generate_input(input_fn, fn_type, json_input_fn)
        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)

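# Illustration of the filename handling above, using the path style the
# commented-out example suggests (the concrete names below are only examples,
# not files that are assumed to exist):
#   input_fn      = '/data/generated-html-json/acme_10-k__24.unescaped'
#   fn_prefix     = 'acme_10-k__24', fn_type = 'unescaped'
#   json_input_fn = '/data/generated-html-json/acme_10-k__24.json'
# and, for output id 7 under generated_data_dir():
#   html/7.unescaped, expected_json/7.expected_json, input/7.input
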
def train_set_max_token_len():
    print('Getting filenames ...', end=' ')
    base_path = generated_data_dir()
    fns = list(get_filenames([os.path.join(base_path, 'html', '*.unescaped')]))
    fns.extend(get_filenames([os.path.join(base_path, 'expected_json',
                                           '*.expected_json')]))
    print('done')

    bar = ChargingBar('Processing files', max=len(fns))
    max_token_len = 0
    for fn in fns:
        token_len = len(read_file(fn).split())
        if token_len > max_token_len:
            max_token_len = token_len
        bar.next()
    bar.finish()

    with open(os.path.join(base_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')

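# Minimal sketch of a consumer for the file written above. Assumption: the
# helper name read_max_token_len is hypothetical and not part of this module;
# the file holds a single line of the form 'max_token_len: <N>'.
def read_max_token_len(base_path):
    with open(os.path.join(base_path, 'max_token_len')) as f:
        return int(f.read().split(':', 1)[1].strip())
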
def tokenize_training_set(generate=False):
    # When generate is True, (html, json) pairs are sampled directly from the
    # raw generated-html-json corpus; otherwise the pre-generated training
    # files under generated_data_dir() are used.
    def update_max_token_len(html, json, max_len):
        html_len, json_len = len(html.split()), len(json.split())
        return max(html_len, json_len, max_len)

    input_path = generated_data_dir()
    output_path = tokenized_dir()
    create_dirs(output_path)

    if generate is True:
        input_fns = list(
            get_filenames(
                [os.path.join(generated_html_json_dir(), '*.unescaped')]))
        html_fns, json_fns = [], []
        for id in range(NUMBER_OF_OUTPUTS):
            html_fn = np.random.choice(input_fns)
            fn_parts = html_fn.split(os.sep)
            fn_name = fn_parts[-1].split('.')
            fn_prefix, fn_type = fn_name[0], fn_name[1]
            json_fn = os.sep + os.path.join(*fn_parts[:-1], fn_prefix + '.json')
            html_fns.append(html_fn)
            json_fns.append(json_fn)
        combined_fns = zip(html_fns, json_fns)
    else:
        combined_fns = zip(
            list(get_filenames(
                [os.path.join(input_path, 'html', '*.unescaped')])),
            list(get_filenames(
                [os.path.join(input_path, 'expected_json', '*.expected_json')])))

    update_tokens = []
    separate_files = []
    tokens = set()
    max_token_len = 0

    def file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=SINGLE_FILE):
        if update_type == SINGLE_FILE:
            update_tokens.append(html_fn + '^' + html_tokens + '^' +
                                 json_fn + '^' + json_tokens)
        else:
            # Multiple files created - one for each pair
            # of (html, json) input files.
            update_tokens.append((html_fn, json_fn))
            create_dirs(os.path.join(output_path, 'separate_files'))
            output_html_fn = os.path.join(
                output_path, 'separate_files',
                html_fn.split(os.sep)[-1] + '.tokenized')
            output_json_fn = os.path.join(
                output_path, 'separate_files',
                json_fn.split(os.sep)[-1] + '.tokenized')
            separate_files.append(output_html_fn + '^' + output_json_fn)
            write_file(output_html_fn, html_tokens)
            write_file(output_json_fn, json_tokens)

    def file_flush(update_type):
        if update_type == SINGLE_FILE:
            write_file(os.path.join(output_path, 'tokenized'),
                       '\n'.join(update_tokens))
        else:
            write_file(os.path.join(output_path, 'separate_files', 'file_list'),
                       '\n'.join(separate_files))

    for html_fn, json_fn in combined_fns:
        # html_fn = '/Volumes/Seagate/generated-data/html/0.unescaped'
        # json_fn = '/Volumes/Seagate/generated-data/expected_json/0.expected_json'
        print(f'html_fn: {html_fn}')
        print(f'json_fn: {json_fn}')

        html_tokens, json_tokens = tokenize_html_json(html_fn, json_fn,
                                                      generate=generate)
        html_tokens = ' '.join(html_tokens).replace("'", "")
        json_tokens = ' '.join(json_tokens).replace("'", "")
        # Remove the json string's quotes at the beginning and end.
        json_tokens = json_tokens[2:-2]

        max_token_len = update_max_token_len(html_tokens, json_tokens,
                                             max_token_len)
        tokens.update(html_tokens.split())
        tokens.update(json_tokens.split())
        file_update(html_fn, html_tokens, json_fn, json_tokens,
                    update_type=UPDATE_TYPE)

    file_flush(update_type=UPDATE_TYPE)

    # Prepend the special tokens: reverse, append, reverse again leaves the
    # list as ['<eos>', '<pad>', '<sos>', ...vocabulary in ascending order...].
    tokens = sorted(tokens)
    tokens.reverse()
    tokens.extend(['<sos>', '<pad>', '<eos>'])
    tokens.reverse()
    write_json_to_file(os.path.join(output_path, 'tokens'), tokens)

    with open(os.path.join(output_path, 'max_token_len'), 'w') as f:
        f.write(f'max_token_len: {max_token_len}')

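# Output layout written by tokenize_training_set() above (which mode is used
# depends on the UPDATE_TYPE constant defined elsewhere in the project):
#   SINGLE_FILE:  <tokenized_dir>/tokenized, one line per (html, json) pair,
#                 fields joined by '^': html_fn^html_tokens^json_fn^json_tokens
#   otherwise:    <tokenized_dir>/separate_files/<name>.tokenized per input
#                 file, plus separate_files/file_list with 'html_path^json_path'
#                 lines
# In both modes it also writes <tokenized_dir>/tokens (a JSON list starting
# with '<eos>', '<pad>', '<sos>') and <tokenized_dir>/max_token_len.
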
def all_encodings(filenames, base_dirname, tokens_path):
    # Since we're writing tokens to a file for each company,
    # and later merging these tokens, the token number
    # must always keep incrementing. This way, our dictionary with
    # (token_num: token_value) will not miss any tokens.
    tokens_dirname = os.sep.join(tokens_path.split(os.sep)[:-1])
    out_dirname_json = os.path.join(tokens_dirname, 'expected_json', 'encoded')
    out_dirname_html = os.path.join(tokens_dirname, 'html', 'encoded')
    create_dirs([out_dirname_json, out_dirname_html])

    current_company_dir = ''
    token_num = Number.START_WORD_NUM.value
    tokens = set()
    tokens_filename = ''
    # num_dirs_to_process = 3
    for filename in filenames:
        # filename = '/Volumes/datadrive/tags-cleaned/0000707605_AMERISERV_FINANCIAL_INC__PA_/10-k/2018-01-01_2018-12-31_10-K/tables-extracted/162.table-extracted'
        print(f'filename: {filename}')
        text = read_file(filename)

        company_dir_idx = len(base_dirname)
        if base_dirname == generated_data_dir():
            company_dir = ''
        else:
            company_dir = filename[company_dir_idx + 1:].split(os.sep)[0]

        if current_company_dir != company_dir:
            # New company directory: flush the previous company's tokens and
            # advance the token counter past them.
            if len(tokens) > 0:
                write_tokens_file(tokens, tokens_filename, token_num)
                token_num += len(tokens)
                del tokens
                tokens = set()
            current_company_dir = company_dir
            # num_dirs_to_process -= 1
            # if num_dirs_to_process <= 0:
            #     break
        else:
            # We have to create this variable, and assign to it.
            # This way, we have access to the last filename
            # in the else clause of this for statement.
            tokens_filename = get_tokens_filename(filename, company_dir_idx,
                                                  company_dir, "tokens")

        if filename.endswith('unescaped') or filename.endswith('html') \
                or filename.endswith('table-extracted'):
            find_html_table_encodings(out_dirname_html, filename, text, tokens)
        elif filename.endswith('json'):
            find_json_encodings(out_dirname_json, filename, text, tokens)
    else:
        # for-else: runs after the loop completes, flushing the last
        # company's tokens.
        write_tokens_file(tokens, tokens_filename, token_num)

    all_tokens_filename = os.path.join(base_dirname, 'tokens')
    all_tokens = set()
    for filename in get_filenames([tokens_path]):
        tokens = read_tokens_file(filename)
        all_tokens.update(get_token_values(tokens))
    print(f'len(all_tokens): {len(all_tokens)}')

    # We need to give the offset as the last value in this function call.
    # This allows us to interpret the value of 1 as the start of a
    # number sequence, and not confuse it with an entry in the tokens
    # file that has key = 1.
    write_tokens_file(all_tokens, all_tokens_filename, Number.START_WORD_NUM.value)
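

if __name__ == '__main__':
    # Rough sketch of one way to drive the pipeline above (assumption: the
    # real entry point and ordering live elsewhere, e.g. in a CLI wrapper).
    generate_samples()          # write html/, expected_json/ and input/ samples
    encode_training_files()     # encode them and build the tokens file
    decode_training_files()     # decode the *.encoded files back as a sanity check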