def train(hidden_size, batch_size, max_grad_steps=int(1e9), report_every=100):
    """Train the two-headed edit-prediction model on random batches.

    Builds a ``Batcher`` and a model, then runs single-batch gradient steps
    (up to *max_grad_steps*, effectively unbounded by default), printing
    validation metrics every step and a sample of decoded predictions every
    *report_every* steps.

    Args:
        hidden_size: Hidden-layer size forwarded to ``get_model``.
        batch_size: Number of (input, target) pairs sampled per step.
        max_grad_steps: Upper bound on gradient steps (default keeps the
            original effectively-infinite loop).
        report_every: Interval, in steps, between prediction reports.
    """
    batcher = Batcher()
    print('Data:')
    print(batcher.inputs.shape)
    print(batcher.targets.shape)

    model = get_model(hidden_size, batcher.chars_len())
    # Two output heads: which edit operation to apply ('op') and which
    # character it affects ('char'); names must match the model's outputs.
    model.compile(loss={'op': 'categorical_crossentropy',
                        'char': 'categorical_crossentropy'},
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    for grad_step in range(max_grad_steps):
        ppp = gen_large_chunk_single_thread(
            batcher, batcher.inputs, batcher.targets, chunk_size=batch_size)
        (x_train, y_train_1, y_train_2,
         x_val, y_val_1, y_val_2,
         val_sub_inputs, val_sub_targets) = ppp
        model.train_on_batch(x=x_train, y=[y_train_1, y_train_2])
        print(dict(zip(model.metrics_names,
                       model.test_on_batch(x=x_val, y=[y_val_1, y_val_2]))))
        if grad_step % report_every == 0:
            _report_predictions(model, batcher, x_val,
                                val_sub_inputs, val_sub_targets)


def _decode_ops(op_probs):
    """Map each row of op probabilities to its edit-operation name.

    The index order (0=insert, 1=replace, 2=delete) must match the one-hot
    encoding produced by ``gen_large_chunk_single_thread``.
    """
    names = ('insert', 'replace', 'delete')
    return [names[op] for op in op_probs.argmax(axis=1)]


def _report_predictions(model, batcher, row_x, password_input, password_target,
                        limit=100):
    """Print up to *limit* + 1 sample predictions for the validation rows."""
    ops, char = model.predict(row_x, verbose=0)
    predicted_chars = list(batcher.decode(char))
    decoded_op = _decode_ops(ops)
    for i, (x, y, pc, po) in enumerate(
            zip(password_input, password_target, predicted_chars, decoded_op)):
        print('x :', x)
        print('y :', y)
        print('predict char :', pc)
        print('predict op :', po)
        print('---------------------')
        if i >= limit:
            break
def gen_large_chunk_single_thread(sed: Batcher, inputs_, targets_, chunk_size):
    """Sample a random chunk of (input, target) pairs and encode them.

    For each sampled pair, the first Levenshtein edit operation turning the
    input into the target is extracted; the op is one-hot encoded (insert /
    replace / delete) and the affected character is encoded via the batcher.

    Args:
        sed: Batcher providing ``encode``, ``chars_len`` and
            ``ENCODING_MAX_PASSWORD_LENGTH``.
        inputs_: Indexable collection of input strings.
        targets_: Indexable collection of target strings, aligned with
            *inputs_*.
        chunk_size: Number of pairs to sample (with replacement).

    Returns:
        Tuple ``(x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2,
        val_sub_inputs, val_sub_targets)``: a 90/10 train/validation split of
        the encoded inputs, op labels, and char labels, plus the raw
        validation strings.

    Raises:
        Exception: If an unsupported edit operation is encountered.
    """
    # One-hot rows in a fixed order the consumer relies on
    # (argmax: 0=insert, 1=replace, 2=delete).
    op_encodings = {
        'insert': [1, 0, 0],
        'replace': [0, 1, 0],
        'delete': [0, 0, 1],
    }

    random_indices = np.random.choice(a=range(len(inputs_)), size=chunk_size,
                                      replace=True)
    sub_inputs = inputs_[random_indices]
    sub_targets = targets_[random_indices]
    n = len(sub_inputs)  # equals chunk_size; used consistently for all buffers

    x = np.zeros((n, sed.ENCODING_MAX_PASSWORD_LENGTH, sed.chars_len()),
                 dtype=float)
    y2_char = np.zeros(shape=(n, sed.chars_len()))
    y1_op = np.zeros(shape=(n, 3))

    for i in range(n):
        # NOTE(review): assumes every pair differs by at least one edit —
        # editops() returns [] for identical strings and [0] would raise
        # IndexError. Confirm the data pipeline guarantees this.
        op, src_pos, dst_pos = Levenshtein.editops(sub_inputs[i],
                                                   sub_targets[i])[0]
        assert src_pos == dst_pos
        if op not in op_encodings:
            raise Exception('Unsupported op.')
        y1_op[i] = op_encodings[op]
        # 'delete' refers to a char in the source string; 'insert' and
        # 'replace' refer to a char in the target string.
        changed_in = sub_inputs[i] if op == 'delete' else sub_targets[i]
        y2_char[i] = sed.encode(changed_in[src_pos], 1)[0]
        # Encode the input in the same pass (the original made a second
        # full loop over sub_inputs just for this).
        x[i] = sed.encode(sub_inputs[i])

    split_at = int(len(x) * 0.9)
    x_train, x_val = x[:split_at], x[split_at:]
    y_train_1, y_val_1 = y1_op[:split_at], y1_op[split_at:]
    y_train_2, y_val_2 = y2_char[:split_at], y2_char[split_at:]
    val_sub_inputs = sub_inputs[split_at:]
    val_sub_targets = sub_targets[split_at:]
    return (x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2,
            val_sub_inputs, val_sub_targets)