# NOTE(review): this script was collapsed onto a single physical line in the
# source; reformatted into valid Python. Dead commented-out lines
# (an argparse option and a TOKEN_INDICES lookup) were removed.

# `parser`, `build_vocabulary`, `LazyDataLoader`, `get_chars_and_ctable`,
# `tqdm` and `np` are expected to be defined/imported earlier in this file.
arg_p = parser.parse_args()

print('Building vocabulary...')
build_vocabulary(arg_p.training_filename)

print('Vectorization...')
DATA_LOADER = LazyDataLoader(arg_p.training_filename)
# statistics() presumably returns (input_max_len, output_max_len, record_count)
# — only the record count is needed here.
_, _, training_records_count = DATA_LOADER.statistics()
chars, c_table = get_chars_and_ctable()

inputs = []
targets = []

print('Generating data...')
# Loop index is unused — iterate with `_` for exactly `training_records_count`
# records pulled lazily from the loader.
for _ in tqdm(range(training_records_count), desc='Generating inputs and targets'):
    x_, y_ = DATA_LOADER.next()
    # Pad the data with spaces such that it is always MAXLEN.
    inputs.append(x_)
    targets.append(y_)

# Persist the raw (input, target) pairs compressed for a later training step.
np.savez_compressed('/tmp/x_y.npz', inputs=inputs, targets=targets)
print('Done... File is /tmp/x_y.npz')
# NOTE(review): this chunk duplicates the script above and was likewise
# collapsed onto a single physical line; reformatted into valid Python.
# The dead `# TOKEN_INDICES = get_token_indices()` comment was removed.

# `parser`, `build_vocabulary`, `LazyDataLoader`, `get_chars_and_ctable`,
# `tqdm` and `np` are expected to be defined/imported earlier in this file.
arg_p = parser.parse_args()

print('Building vocabulary...')
build_vocabulary(arg_p.training_filename)

print('Vectorization...')
DATA_LOADER = LazyDataLoader(arg_p.training_filename)
# Only the record count from statistics() is used in this pass.
_, _, training_records_count = DATA_LOADER.statistics()
chars, c_table = get_chars_and_ctable()

inputs = []
targets = []

print('Generating data...')
# Unused index replaced by `_`; one lazy loader read per record.
for _ in tqdm(range(training_records_count), desc='Generating inputs and targets'):
    x_, y_ = DATA_LOADER.next()
    # Pad the data with spaces such that it is always MAXLEN.
    inputs.append(x_)
    targets.append(y_)

# Save compressed input/target arrays for downstream training.
np.savez_compressed('/tmp/x_y.npz', inputs=inputs, targets=targets)
print('Done... File is /tmp/x_y.npz')
# NOTE(review): this chunk was collapsed onto one physical line in the source
# and is reformatted here with tokens unchanged. The while-loop appears
# TRUNCATED at the edge of the visible chunk: `expected` and `ans` are created
# but never used within view — the loop body presumably continues (e.g. with
# an `expected.append(...)`) outside this view. Do not treat this as complete.

print('Vectorization...')
DATA_LOADER = LazyDataLoader()
# Dataset-wide stats: max input length, max output length, record count.
INPUT_MAX_LEN, OUTPUT_MAX_LEN, TRAINING_SIZE = DATA_LOADER.statistics()
TOKEN_INDICES = get_TOKEN_INDICES()
chars, c_table = get_chars_and_ctable()

questions = []
expected = []

print('Generating data...')
# Keep pulling records until we have TRAINING_SIZE questions.
while len(questions) < TRAINING_SIZE:
    x, y = DATA_LOADER.next()
    # Pad the data with spaces such that it is always MAXLEN.
    q = x
    query = q
    ans = y
    if ADD_NOISE_TO_DATA:
        # print('Old query =', query, end=' | ')
        # Corrupt the input with noise drawn from the `chars` vocabulary,
        # per-character probabilities given by NOISE_PROBS; the returned
        # noise-type descriptor is discarded.
        query, _ = add_noise_to_data(input_str=query, probs=NOISE_PROBS, vocabulary=chars)
        # print('Query =', query, ' | Noise type =', noise_type)
    if INVERT:
        # Reverse the input sequence.
        query = query[::-1]
    questions.append(query)