Example 1
def test_rnn_deriv_ref(sequence_length, input_size, hidden_size, batch_size,
                       return_sequence, weight_initializer, bias_initializer,
                       transformer_factory):

    assert batch_size == 1, "the recurrent reference implementation only supports batch size 1"
    assert return_sequence is True, "the reference rnn only supports sequences for deriv"

    # Get input placeholder and numpy array
    input_placeholder, input_value = make_placeholder(input_size, sequence_length, batch_size)

    # Construct network weights and initial state, if desired
    W_in, W_rec, b, init_state, init_state_value = make_weights(input_placeholder, hidden_size,
                                                                weight_initializer,
                                                                bias_initializer)

    # Compute reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size, return_sequence=return_sequence)
    rnn_ref.set_weights(W_in, W_rec, b.reshape(rnn_ref.bh.shape))

    # Prepare deltas for gradient check
    output_shape = (hidden_size, sequence_length, batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    dW_in, dW_rec, db = rnn_ref.lossFun(input_value.transpose([1, 0, 2]),
                                        deltas.copy().transpose([1, 0, 2]),
                                        init_states=init_state_value)[:3]

    # Generate ngraph RNN
    rnn_ng = Recurrent(hidden_size, init=W_in, init_inner=W_rec, activation=Tanh(),
                       reset_cells=True, return_sequence=return_sequence)

    # fprop ngraph RNN
    out_ng = rnn_ng.train_outputs(input_placeholder)

    deltas_constant = ng.constant(deltas, axes=out_ng.axes)
    params = [(rnn_ng.W_input, W_in),
              (rnn_ng.W_recur, W_rec),
              (rnn_ng.b, b)]

    with ExecutorFactory() as ex:
        # Create derivative computations and execute
        param_updates = list()
        for px, _ in params:
            update = ng.deriv(out_ng, px, error=deltas_constant)
            param_updates.append(ex.executor(update, input_placeholder))

        for update_fun, ref_val in zip(param_updates, [dW_in, dW_rec, db]):
            ng.testing.assert_allclose(update_fun(input_value),
                                       ref_val.squeeze(),
                                       rtol=bprop_rtol, atol=bprop_atol)
Example 2
def test_rnn_fprop(sequence_length, input_size, hidden_size, batch_size,
                   return_sequence, weight_initializer, bias_initializer,
                   init_state, extra_axes, backward, transformer_factory):

    assert batch_size == 1, "the recurrent reference implementation only supports batch size 1"

    # Get input placeholder and numpy array
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size,
                                                      extra_axes=extra_axes)

    # Construct network weights and initial state, if desired
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer,
        init_state)

    # Compute reference numpy RNN
    rnn_ref = RefRecurrent(input_size,
                           hidden_size,
                           return_sequence=return_sequence)
    rnn_ref.set_weights(W_in.reshape(rnn_ref.Wxh.shape), W_rec,
                        b.reshape(rnn_ref.bh.shape))

    # Run fprop through the reference numpy RNN
    input_shape = (input_size, sequence_length, batch_size)
    h_ref_list = rnn_ref.fprop_only(
        input_value.reshape(input_shape).transpose([1, 0, 2]),
        init_states=init_state_value,
        backward=backward)

    # Generate ngraph RNN
    rnn_ng = Recurrent(hidden_size,
                       init=W_in,
                       init_inner=W_rec,
                       activation=Tanh(),
                       reset_cells=True,
                       return_sequence=return_sequence,
                       backward=backward)

    # fprop ngraph RNN
    out_ng = rnn_ng(input_placeholder, init_state=init_state)

    with ExecutorFactory() as ex:
        # Create computation and execute
        if init_state is not None:
            fprop_neon_fun = ex.executor(out_ng, input_placeholder, init_state)
            fprop_neon = fprop_neon_fun(input_value, init_state_value)

        else:
            fprop_neon_fun = ex.executor(out_ng, input_placeholder)
            fprop_neon = fprop_neon_fun(input_value)

        # Compare output with reference implementation
        if return_sequence is True:
            fprop_neon = fprop_neon[:, :, 0]
        ng.testing.assert_allclose(fprop_neon,
                                   h_ref_list,
                                   rtol=fprop_rtol,
                                   atol=fprop_atol)
Example 3
def define_recurrent_layers(out_axes=None,
                            celltype='RNN',
                            recurrent_units=[32],
                            init=GlorotInit(),
                            return_sequence=True):
    layers = []
    for e, i in enumerate(recurrent_units):
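        # Intermediate layers must return full sequences so they can feed the
        # next recurrent layer; only the last layer honors return_sequence.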
        layer_return_sequence = e < len(recurrent_units) - 1 or return_sequence
        if celltype == 'RNN':
            layers.append(
                Recurrent(nout=i,
                          init=init,
                          backward=False,
                          activation=Tanh(),
                          return_sequence=layer_return_sequence))
        elif celltype == 'LSTM':
            layers.append(
                LSTM(nout=i,
                     init=init,
                     backward=False,
                     activation=Tanh(),
                     gate_activation=Logistic(),
                     return_sequence=layer_return_sequence))
    if out_axes is not None:
        affine_layer = Affine(weight_init=init,
                              bias_init=init,
                              activation=Identity(),
                              axes=out_axes)
        layers.append(affine_layer)
    return layers
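
A hypothetical usage sketch for the helper above; the output axis, the layer sizes, and the Sequential container (shown in the other examples here) are illustrative assumptions, not part of the original:

# Two stacked LSTM layers (64 then 32 units) followed by an Affine readout;
# only the last recurrent layer collapses the sequence (return_sequence=False).
out_axis = ng.make_axis(length=10, name='Y')  # hypothetical output axis
layers = define_recurrent_layers(out_axes=[out_axis],
                                 celltype='LSTM',
                                 recurrent_units=[64, 32],
                                 return_sequence=False)
model = Sequential(layers)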
Example 4
def test_rnn_deriv_numerical(sequence_length, input_size, hidden_size,
                             batch_size, return_sequence, weight_initializer,
                             bias_initializer, backward, init_state,
                             transformer_factory):

    # Get input placeholder and numpy array
    input_placeholder, input_value = make_placeholder(input_size,
                                                      sequence_length,
                                                      batch_size)

    # Construct network weights and initial state, if desired
    W_in, W_rec, b, init_state, init_state_value = make_weights(
        input_placeholder, hidden_size, weight_initializer, bias_initializer,
        init_state)

    # Generate ngraph RNN
    rnn_ng = Recurrent(hidden_size,
                       init=W_in,
                       init_inner=W_rec,
                       activation=Tanh(),
                       reset_cells=True,
                       return_sequence=return_sequence,
                       backward=backward)

    # fprop ngraph RNN
    out_ng = rnn_ng(input_placeholder, init_state=init_state)

    params = [(rnn_ng.W_input, W_in), (rnn_ng.W_recur, W_rec), (rnn_ng.b, b)]

    with ExecutorFactory() as ex:
        # Create derivative computations and execute
        param_updates = list()
        for px, _ in params:
            if init_state is not None:
                update = (ex.derivative(out_ng, px, input_placeholder,
                                        init_state),
                          ex.numeric_derivative(out_ng, px, delta,
                                                input_placeholder, init_state))
            else:
                update = (ex.derivative(out_ng, px, input_placeholder),
                          ex.numeric_derivative(out_ng, px, delta,
                                                input_placeholder))
            param_updates.append(update)

        for (deriv_s, deriv_n), (_, val) in zip(param_updates, params):
            if init_state is not None:
                ng.testing.assert_allclose(deriv_s(val, input_value,
                                                   init_state_value),
                                           deriv_n(val, input_value,
                                                   init_state_value),
                                           rtol=num_rtol,
                                           atol=num_atol)
            else:
                ng.testing.assert_allclose(deriv_s(val, input_value),
                                           deriv_n(val, input_value),
                                           rtol=num_rtol,
                                           atol=num_atol)
Example 5
def test_inference_reuse_recurrent(recurrent_input):

    layer = Recurrent(10, dummy_init, activation=lambda x: x)
    layer(recurrent_input)
    train_params = (layer.W_input, layer.W_recur)
    with Layer.inference_mode_on():
        layer(recurrent_input)
        inference_params = (layer.W_input, layer.W_recur)

    for train_param, inference_param in zip(train_params, inference_params):
        assert train_param is inference_param
Example 6

def expand_onehot(x):
    return ng.one_hot(x, axis=ax.Y)


# weight initialization
init = UniformInit(low=-0.08, high=0.08)

if args.use_lut:
    layer_0 = LookupTable(50, 100, init, update=True, pad_idx=0)
else:
    layer_0 = Preprocess(functor=expand_onehot)

if args.layer_type == "rnn":
    rlayer = Recurrent(hidden_size, init, activation=Tanh())
elif args.layer_type == "birnn":
    rlayer = BiRNN(hidden_size,
                   init,
                   activation=Tanh(),
                   return_sequence=True,
                   sum_out=True)

# model initialization
seq1 = Sequential([
    layer_0, rlayer,
    Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, ))
])

optimizer = RMSProp()
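
A minimal sketch of how a model like seq1 is typically wired to a cost and the optimizer, assuming input placeholders named as in Example 9 (inp_txt, tgt_txt); cross_entropy_multi, sequential, and mean are ngraph ops, but this wiring is illustrative rather than the original training script:

# Forward pass, cross-entropy against one-hot targets, and the optimizer's
# updates folded into a single batch-cost op (illustrative wiring).
train_prob = seq1(inputs['inp_txt'])
train_loss = ng.cross_entropy_multi(train_prob,
                                    ng.one_hot(inputs['tgt_txt'], axis=ax.Y))
batch_cost = ng.sequential([optimizer(train_loss),
                            ng.mean(train_loss, out_axes=())])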
Example 7
train_set = ArrayIterator(imdb_data['train'],
                          batch_size=args.batch_size,
                          total_iterations=args.num_iterations)
valid_set = ArrayIterator(imdb_data['valid'], batch_size=args.batch_size)

inputs = train_set.make_placeholders()
ax.Y.length = imdb_dataset.nclass

# weight initialization
init = UniformInit(low=-0.08, high=0.08)

if args.layer_type == "rnn":
    rlayer = Recurrent(hidden_size,
                       init,
                       activation=Tanh(),
                       reset_cells=True,
                       return_sequence=False)
else:
    rlayer = BiRNN(hidden_size,
                   init,
                   activation=Tanh(),
                   reset_cells=True,
                   return_sequence=False,
                   sum_out=True)

# model initialization
seq1 = Sequential([
    LookupTable(vocab_size, embed_size, init, update=True, pad_idx=pad_idx),
    rlayer,
    Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, ))
])
Example 8
def test_seq2seq_deriv_ref(batch_size, sequence_length_enc,
                           sequence_length_dec, input_size, hidden_size,
                           weight_initializer, bias_initializer,
                           transformer_factory):

    # TODO: are these assumptions true?
    assert batch_size == 1, "the seq2seq reference implementation only supports batch size 1"

    # Get input placeholders and numpy arrays
    input_placeholder_enc, input_value_enc = \
        make_placeholder(input_size, sequence_length_enc, batch_size)
    input_placeholder_dec, input_value_dec = \
        make_placeholder(input_size, sequence_length_dec, batch_size)

    # Construct encoder weights
    W_in_enc, W_rec_enc, b_enc, _, _ = make_weights(input_placeholder_enc,
                                                    hidden_size,
                                                    weight_initializer,
                                                    bias_initializer,
                                                    init_state=False)

    # Construct decoder weights
    W_in_dec, W_rec_dec, b_dec, _, _ = make_weights(input_placeholder_dec,
                                                    hidden_size,
                                                    weight_initializer,
                                                    bias_initializer,
                                                    init_state=False)

    # Reference numpy seq2seq
    seq2seq_ref = RefSeq2Seq(input_size,
                             hidden_size,
                             decoder_return_sequence=True)
    seq2seq_ref.set_weights(W_in_enc, W_rec_enc,
                            b_enc.reshape(seq2seq_ref.bh_enc.shape), W_in_dec,
                            W_rec_dec, b_dec.reshape(seq2seq_ref.bh_dec.shape))

    # Prepare deltas for gradient check
    output_shape = (hidden_size, sequence_length_dec, batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    dW_in_enc, dW_rec_enc, db_enc, dW_in_dec, dW_rec_dec, db_dec, encoding_ref, hs_return_dec = \
        seq2seq_ref.lossFun(input_value_enc.transpose([1, 0, 2]),
                            input_value_dec.transpose([1, 0, 2]),
                            deltas.copy().transpose([1, 0, 2]))

    # Generate ngraph Seq2Seq
    rnn_enc_ng = Recurrent(hidden_size,
                           init=W_in_enc,
                           init_inner=W_rec_enc,
                           activation=Tanh(),
                           reset_cells=True,
                           return_sequence=False)
    rnn_dec_ng = Recurrent(hidden_size,
                           init=W_in_dec,
                           init_inner=W_rec_dec,
                           activation=Tanh(),
                           reset_cells=True,
                           return_sequence=True)

    # ngraph fprop graph
    encoding_ng = rnn_enc_ng(input_placeholder_enc, init_state=None)
    output_ng = rnn_dec_ng(input_placeholder_dec, init_state=encoding_ng)

    deltas_constant = ng.constant(deltas, axes=output_ng.axes)
    params = [(rnn_dec_ng.b, db_dec), (rnn_dec_ng.W_input, dW_in_dec),
              (rnn_dec_ng.W_recur, dW_rec_dec), (rnn_enc_ng.b, db_enc),
              (rnn_enc_ng.W_input, dW_in_enc),
              (rnn_enc_ng.W_recur, dW_rec_enc)]

    with ExecutorFactory() as ex:

        # fprop computations
        fprop_fun = ex.executor([encoding_ng, output_ng],
                                input_placeholder_enc, input_placeholder_dec)

        # gradient computations
        update_funs = []
        for px, _ in params:
            update = ng.deriv(output_ng, px, error=deltas_constant)
            update_funs.append(
                ex.executor(update, input_placeholder_enc,
                            input_placeholder_dec))

        # check forward pass
        encoding, output = fprop_fun(input_value_enc, input_value_dec)
        ng.testing.assert_allclose(encoding, encoding_ref)
        ng.testing.assert_allclose(np.squeeze(output),
                                   np.squeeze(hs_return_dec))

        # check gradient computations
        for update_fun, (_, deriv_ref_val) in zip(update_funs, params):
            grad_neon = update_fun(input_value_enc, input_value_dec)
            ng.testing.assert_allclose(grad_neon,
                                       deriv_ref_val.squeeze(),
                                       rtol=bprop_rtol,
                                       atol=1e-4)
Example 9
train_set = SequentialArrayIterator(ptb_data['train'],
                                    batch_size=args.batch_size,
                                    time_steps=time_steps,
                                    total_iterations=args.num_iterations)

valid_set = SequentialArrayIterator(ptb_data['valid'],
                                    batch_size=args.batch_size,
                                    time_steps=time_steps)

# weight initialization
init = UniformInit(low=-0.08, high=0.08)

# model initialization
seq1 = Sequential([
    Preprocess(functor=lambda x: ng.one_hot(x, axis=ax.Y)),
    Recurrent(hidden_size, init, activation=Tanh(), reset_cells=False),
    Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y, ax.REC))
])

# Bind axes lengths:
ax.Y.length = len(tree_bank_data.vocab)
ax.REC.length = time_steps
ax.N.length = args.batch_size

# placeholders with descriptive names
inputs = dict(inp_txt=ng.placeholder([ax.REC, ax.N]),
              tgt_txt=ng.placeholder([ax.REC, ax.N]))

optimizer = RMSProp(decay_rate=0.95,
                    learning_rate=2e-3,
                    epsilon=1e-6,
                    gradient_clip_value=gradient_clip_value)
Example 10
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              return_seq=True):
    # init_func is the initializer for the model params
    assert batch_size == 1, "the recurrent reference implementation only supports batch size 1"

    # ========== neon model ==========
    Cin = ng.make_axis(input_size)
    REC = ng.make_axis(seq_len, recurrent=True)
    N = ng.make_axis(batch_size, batch=True)
    H = ng.make_axis(hidden_size)
    ax_s = ng.make_axes([H, N])

    ex = ExecutorFactory()
    np.random.seed(0)

    rnn_ng = Recurrent(hidden_size,
                       init_func,
                       activation=Tanh(),
                       reset_cells=True,
                       return_sequence=return_seq)

    inp_ng = ng.placeholder([Cin, REC, N])
    init_state_ng = ng.placeholder(ax_s)

    # fprop graph
    out_ng = rnn_ng.train_outputs(inp_ng, init_state=init_state_ng)
    out_ng.input = True

    rnn_W_input = rnn_ng.W_input
    rnn_W_input.input = True
    rnn_W_recur = rnn_ng.W_recur
    rnn_W_recur.input = True
    rnn_b = rnn_ng.b
    rnn_b.input = True

    fprop_neon_fun = ex.executor(out_ng, inp_ng, init_state_ng)

    dWrecur_s_fun = ex.derivative(out_ng, rnn_W_recur, inp_ng, rnn_W_input,
                                  rnn_b)
    dWrecur_n_fun = ex.numeric_derivative(out_ng, rnn_W_recur, delta, inp_ng,
                                          rnn_W_input, rnn_b)
    dWinput_s_fun = ex.derivative(out_ng, rnn_W_input, inp_ng, rnn_W_recur,
                                  rnn_b)
    dWinput_n_fun = ex.numeric_derivative(out_ng, rnn_W_input, delta, inp_ng,
                                          rnn_W_recur, rnn_b)
    dWb_s_fun = ex.derivative(out_ng, rnn_b, inp_ng, rnn_W_input, rnn_W_recur)
    dWb_n_fun = ex.numeric_derivative(out_ng, rnn_b, delta, inp_ng,
                                      rnn_W_input, rnn_W_recur)

    # fprop on random inputs
    input_value = rng.uniform(-1, 1, inp_ng.axes)
    init_state_value = rng.uniform(-1, 1, init_state_ng.axes)
    fprop_neon = fprop_neon_fun(input_value, init_state_value).copy()

    # After the rnn graph has been executed, we can read the W values. Take
    # copies so that shared values don't confuse the derivative checks.
    Wxh_neon = rnn_ng.W_input.value.get(None).copy()
    Whh_neon = rnn_ng.W_recur.value.get(None).copy()
    bh_neon = rnn_ng.b.value.get(None).copy()

    # bprop derivs
    dWrecur_s = dWrecur_s_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    dWrecur_n = dWrecur_n_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    np.testing.assert_allclose(dWrecur_s, dWrecur_n, rtol=rtol, atol=atol)

    dWb_s = dWb_s_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    dWb_n = dWb_n_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    np.testing.assert_allclose(dWb_s, dWb_n, rtol=rtol, atol=atol)

    dWinput_s = dWinput_s_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    dWinput_n = dWinput_n_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    np.testing.assert_allclose(dWinput_s, dWinput_n, rtol=rtol, atol=atol)

    # ========= reference model ==========
    output_shape = (hidden_size, seq_len * batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    inp_ref = input_value.transpose([1, 0, 2])

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    rnn_ref.Wxh[:] = Wxh_neon
    rnn_ref.Whh[:] = Whh_neon
    rnn_ref.bh[:] = bh_neon.reshape(rnn_ref.bh.shape)

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref,
                                  deltas_ref,
                                  init_states=init_state_value)

    # comparing outputs
    if return_seq is False:
        h_ref_list = h_ref_list[:, -1].reshape(-1, 1)
    else:
        fprop_neon = fprop_neon[:, :, 0]
    np.testing.assert_allclose(fprop_neon, h_ref_list, rtol=0.0, atol=1.0e-5)

    return
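
A hypothetical invocation of the check above; the sizes are illustrative, the batch size must be 1 because of the reference implementation, and GlorotInit is borrowed from the other examples here:

# Gradient-check a small RNN against the numpy reference (batch size must be 1).
check_rnn(seq_len=5, input_size=3, hidden_size=6, batch_size=1,
          init_func=GlorotInit(), return_seq=True)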
Example 11
def expand_onehot(x):
    # Assign the recurrent role and property to the axis named 'time'
    x.axes.find_by_short_name('time')[0].add_role(ar.time)
    x.axes.find_by_short_name('time')[0].is_recurrent = True
    return ng.one_hot(x, axis=ax.Y)


# weight initialization
init = UniformInit(low=-0.08, high=0.08)

# model initialization
one_hot_enc = Preprocess(functor=expand_onehot)
enc = Recurrent(hidden_size,
                init,
                activation=Tanh(),
                reset_cells=True,
                return_sequence=False)
one_hot_dec = Preprocess(functor=expand_onehot)
dec = Recurrent(hidden_size,
                init,
                activation=Tanh(),
                reset_cells=True,
                return_sequence=True)
linear = Affine(init, activation=Softmax(), bias_init=init, axes=(ax.Y,))

optimizer = RMSProp(decay_rate=0.95,
                    learning_rate=2e-3,
                    epsilon=1e-6,
                    gradient_clip_value=gradient_clip_value)