Example #1
0
def train(hidden_size, batch_size):
    """Train the edit-operation model on an endless stream of random chunks.

    Each gradient step draws a fresh chunk via
    ``gen_large_chunk_single_thread``, trains on the train slice, prints the
    metrics on the validation slice, and every 100 steps dumps a sample of
    per-example predictions for manual inspection.

    Args:
        hidden_size: Hidden layer size forwarded to ``get_model``.
        batch_size: Number of (input, target) pairs sampled per step.
    """
    batcher = Batcher()
    print('Data:')
    print(batcher.inputs.shape)
    print(batcher.targets.shape)

    model = get_model(hidden_size, batcher.chars_len())

    # Two output heads: which edit op to apply, and which character it uses.
    model.compile(loss={
        'op': 'categorical_crossentropy',
        'char': 'categorical_crossentropy'
    },
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()

    # Must match the one-hot order produced by gen_large_chunk_single_thread:
    # insert -> [1,0,0], replace -> [0,1,0], delete -> [0,0,1].
    op_names = ('insert', 'replace', 'delete')

    for grad_step in range(int(1e9)):  # effectively "train forever"
        ppp = gen_large_chunk_single_thread(batcher,
                                            batcher.inputs,
                                            batcher.targets,
                                            chunk_size=batch_size)
        x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2, val_sub_inputs, val_sub_targets = ppp
        model.train_on_batch(x=x_train, y=[y_train_1, y_train_2])
        print(
            dict(
                zip(model.metrics_names,
                    model.test_on_batch(x=x_val, y=[y_val_1, y_val_2]))))
        if grad_step % 100 == 0:
            row_x, password_target, password_input = x_val, val_sub_targets, val_sub_inputs
            ops, char = model.predict(row_x, verbose=0)
            predicted_chars = list(batcher.decode(char))
            # Lookup table replaces the previous if/elif chain (same mapping;
            # argmax over the 3-class op head always yields an index in 0..2).
            decoded_op = [op_names[op] for op in ops.argmax(axis=1)]
            for i, (x, y, pc, po) in enumerate(
                    zip(password_input, password_target, predicted_chars,
                        decoded_op)):
                print('x            :', x)
                print('y            :', y)
                print('predict char :', pc)
                print('predict op   :', po)
                print('---------------------')
                if i >= 100:
                    break
Example #2
0
def gen_large_chunk_single_thread(sed: Batcher, inputs_, targets_, chunk_size):
    """Sample a random training chunk and encode it for the two-headed model.

    Draws ``chunk_size`` (input, target) pairs with replacement, takes the
    FIRST Levenshtein edit operation of each pair, one-hot encodes the inputs,
    the op class, and the changed character, and splits everything 90/10 into
    train/validation slices.

    Args:
        sed: Batcher providing ``encode``, ``chars_len`` and
            ``ENCODING_MAX_PASSWORD_LENGTH``.
        inputs_: Numpy-indexable collection of input strings.
        targets_: Parallel collection of target strings.
        chunk_size: Number of pairs to sample.

    Returns:
        Tuple ``(x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2,
        val_sub_inputs, val_sub_targets)``.
    """
    # One-hot row per op, plus which side of the pair the changed char lives on.
    op_table = {
        'insert': ([1, 0, 0], True),    # char comes from the target
        'replace': ([0, 1, 0], True),   # char comes from the target
        'delete': ([0, 0, 1], False),   # char comes from the input
    }

    # Passing the int directly avoids materializing range(len(inputs_)).
    random_indices = np.random.choice(len(inputs_),
                                      size=chunk_size,
                                      replace=True)
    sub_inputs = inputs_[random_indices]
    sub_targets = targets_[random_indices]

    n = len(sub_inputs)  # == chunk_size: sampling is with replacement
    x = np.zeros((n, sed.ENCODING_MAX_PASSWORD_LENGTH, sed.chars_len()),
                 dtype=float)
    y2_char = np.zeros(shape=(n, sed.chars_len()))
    y1_op = np.zeros(shape=(n, 3))

    for i in range(n):
        # NOTE(review): editops() returns [] when the pair is identical, which
        # would raise IndexError here — assumes pairs always differ; confirm
        # upstream.
        op, src_pos, dst_pos = Levenshtein.editops(sub_inputs[i],
                                                   sub_targets[i])[0]
        # The first edit op sits at the first point of divergence, so both
        # positions agree; explicit raise survives `python -O` (assert did not).
        if src_pos != dst_pos:
            raise AssertionError('Edit positions differ for the first op.')
        if op not in op_table:
            raise Exception('Unsupported op.')
        op_encoding, char_from_target = op_table[op]
        word = sub_targets[i] if char_from_target else sub_inputs[i]
        y1_op[i] = op_encoding
        y2_char[i] = sed.encode(word[src_pos], 1)[0]
        x[i] = sed.encode(sub_inputs[i])  # merged from the former second loop

    split_at = int(n * 0.9)
    x_train, x_val = x[:split_at], x[split_at:]
    y_train_1, y_val_1 = y1_op[:split_at], y1_op[split_at:]
    y_train_2, y_val_2 = y2_char[:split_at], y2_char[split_at:]
    val_sub_inputs = sub_inputs[split_at:]
    val_sub_targets = sub_targets[split_at:]

    return x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2, val_sub_inputs, val_sub_targets