# -*- coding: utf-8 -*-
"""Training-side data loading.

Restores the vectorized dataset produced by the encoding script
(``x_y.npz``) into train/validation numpy arrays at module level.
"""
import os

import numpy as np
from keras import layers
from keras.models import Sequential
from six.moves import range

from constants import INVERT
from data_gen import LazyDataLoader
from utils import colors
from utils import get_chars_and_ctable

# NOTE(review): the original source contained a stray shell command
# `ls -alrt` pasted between the imports and the code below — it is a
# SyntaxError in Python and has been removed.

DATA_LOADER = LazyDataLoader()

INPUT_MAX_LEN, OUTPUT_MAX_LEN, TRAINING_SIZE = DATA_LOADER.statistics()

chars, ctable = get_chars_and_ctable()

# The vectorization script must have been run first to produce the
# prefetch file consumed below.
if not os.path.exists('x_y.npz'):
    raise Exception('Please run the vectorization script before.')

print('Loading data from prefetch...')
data = np.load('x_y.npz')
x_train = data['x_train']
x_val = data['x_val']
y_train = data['y_train']
y_val = data['y_val']
# NOTE(review): this chunk was truncated — it began mid-way through a
# `parser.add_argument(` call (the `argparse.ArgumentParser` construction
# and the opening of the call were cut off) and ended before the final
# `targets.append(y_)`. Reconstructed from the complete copy of the same
# script that appears later in this file.
parser = argparse.ArgumentParser('Data Encoding Tool.')
parser.add_argument('--training_filename', type=str,
                    help='Result of run_data_processing.py. '
                         'Something like: /home/premy/BreachCompilationAnalysis/edit-distances/1.csv',
                    required=True)
# parser.add_argument('--encoding_output_folder', type=str, help='Will be used for training')
arg_p = parser.parse_args()

print('Building vocabulary...')
build_vocabulary(arg_p.training_filename)

print('Vectorization...')
DATA_LOADER = LazyDataLoader(arg_p.training_filename)

_, _, training_records_count = DATA_LOADER.statistics()

# TOKEN_INDICES = get_token_indices()

chars, c_table = get_chars_and_ctable()

inputs = []
targets = []
print('Generating data...')
for i in tqdm(range(training_records_count), desc='Generating inputs and targets'):
    x_, y_ = DATA_LOADER.next()
    # Pad the data with spaces such that it is always MAXLEN.
    inputs.append(x_)
    targets.append(y_)
# Command-line front end for the vectorization step: builds the vocabulary
# from the training CSV, then materializes (input, target) pairs.
parser = argparse.ArgumentParser('Data Encoding Tool.')
parser.add_argument(
    '--training_filename',
    type=str,
    required=True,
    help='Result of run_data_processing.py. '
         'Something like: /home/premy/BreachCompilationAnalysis/edit-distances/1.csv')
# parser.add_argument('--encoding_output_folder', type=str, help='Will be used for training')
arg_p = parser.parse_args()

print('Building vocabulary...')
build_vocabulary(arg_p.training_filename)

print('Vectorization...')
DATA_LOADER = LazyDataLoader(arg_p.training_filename)

_, _, training_records_count = DATA_LOADER.statistics()

# TOKEN_INDICES = get_token_indices()

chars, c_table = get_chars_and_ctable()

# Accumulate one (input, target) pair per training record, with a
# progress bar over the known record count.
inputs = []
targets = []
print('Generating data...')
for i in tqdm(range(training_records_count), desc='Generating inputs and targets'):
    sample_input, sample_target = DATA_LOADER.next()
    # Pad the data with spaces such that it is always MAXLEN.
    inputs.append(sample_input)
    targets.append(sample_target)