Example #1
def check_stacked_lstm(seq_len,
                       input_size,
                       hidden_size,
                       batch_size,
                       init_func,
                       return_seq=True,
                       backward=False,
                       reset_cells=False,
                       num_iter=2):

    Cin = ng.make_axis(input_size, name='Feature')
    REC = ng.make_axis(seq_len, name='REC')
    N = ng.make_axis(batch_size, name='N')

    with ExecutorFactory() as ex:
        np.random.seed(0)

        inp_ng = ng.placeholder([Cin, REC, N])

        lstm_ng_1 = LSTM(hidden_size,
                         init_func,
                         activation=Tanh(),
                         gate_activation=Logistic(),
                         reset_cells=reset_cells,
                         return_sequence=return_seq,
                         backward=backward)
        lstm_ng_2 = LSTM(hidden_size + 1,
                         init_func,
                         activation=Tanh(),
                         gate_activation=Logistic(),
                         reset_cells=reset_cells,
                         return_sequence=return_seq,
                         backward=backward)

        out_ng_1 = lstm_ng_1(inp_ng)
        out_ng_2 = lstm_ng_2(out_ng_1)

        fprop_neon_fun_2 = ex.executor(out_ng_2, inp_ng)

        gates = ['i', 'f', 'o', 'g']
        Wxh_neon_1_fun = copier_T(
            ex.executor(list(lstm_ng_1.W_input[k] for k in gates)))
        Whh_neon_1_fun = copier_T(
            ex.executor(list(lstm_ng_1.W_recur[k] for k in gates)))
        bh_neon_1_fun = copier(ex.executor(list(lstm_ng_1.b[k]
                                                for k in gates)))
        Wxh_neon_2_fun = copier_T(
            ex.executor(list(lstm_ng_2.W_input[k] for k in gates)))
        Whh_neon_2_fun = copier_T(
            ex.executor(list(lstm_ng_2.W_recur[k] for k in gates)))
        bh_neon_2_fun = copier(ex.executor(list(lstm_ng_2.b[k]
                                                for k in gates)))

        h_init_fun = ex.executor(lstm_ng_2.h_init)

        # fprop on random inputs for multiple iterations
        fprop_neon_2_list = []
        input_value_list = []

        for i in range(num_iter):
            input_value = rng.uniform(-1, 1, inp_ng.axes)
            fprop_neon_2 = fprop_neon_fun_2(input_value).copy()

            # comparing outputs
            if return_seq is True:
                fprop_neon_2 = fprop_neon_2[:, :, 0]

            input_value_list.append(input_value)
            fprop_neon_2_list.append(fprop_neon_2)

            if reset_cells is False:
                # look at the last hidden states
                h_init_neon = fprop_neon_2[:, -1].reshape(-1, 1)
                h_init_ng = h_init_fun()
                ng.testing.assert_allclose(h_init_neon,
                                           h_init_ng,
                                           rtol=rtol,
                                           atol=atol)

        # after the rnn graph has been executed, we can read out the W values.
        # Take copies so that shared buffers don't confuse the derivatives.
        # Concatenate the per-gate weights in i, f, o, g order.
        Wxh_neon_1 = np.concatenate(Wxh_neon_1_fun(), 1)
        Whh_neon_1 = np.concatenate(Whh_neon_1_fun(), 1)
        bh_neon_1 = np.concatenate(bh_neon_1_fun())
        Wxh_neon_2 = np.concatenate(Wxh_neon_2_fun(), 1)
        Whh_neon_2 = np.concatenate(Whh_neon_2_fun(), 1)
        bh_neon_2 = np.concatenate(bh_neon_2_fun())

        # reference numpy LSTM
        lstm_ref_1 = RefLSTM()
        lstm_ref_2 = RefLSTM()
        WLSTM_1 = lstm_ref_1.init(input_size, hidden_size)
        WLSTM_2 = lstm_ref_2.init(hidden_size, hidden_size + 1)

        # make the ref weights and biases match the neon model
        WLSTM_1[0, :] = bh_neon_1
        WLSTM_1[1:input_size + 1, :] = Wxh_neon_1
        WLSTM_1[input_size + 1:] = Whh_neon_1
        WLSTM_2[0, :] = bh_neon_2
        WLSTM_2[1:hidden_size + 1, :] = Wxh_neon_2
        WLSTM_2[hidden_size + 1:] = Whh_neon_2

        # transpose input X and do fprop
        fprop_ref_2_list = []
        c0_1 = h0_1 = None
        c0_2 = h0_2 = None
        for i in range(num_iter):
            input_value = input_value_list[i]
            inp_ref = input_value.copy().transpose([1, 2, 0])
            (Hout_ref_1, cprev_1, hprev_1,
             batch_cache) = lstm_ref_1.forward(inp_ref, WLSTM_1, c0_1, h0_1)
            (Hout_ref_2, cprev_2, hprev_2,
             batch_cache) = lstm_ref_2.forward(Hout_ref_1, WLSTM_2, c0_2, h0_2)

            if reset_cells is False:
                c0_1 = cprev_1
                h0_1 = hprev_1
                c0_2 = cprev_2
                h0_2 = hprev_2

            # the output needs transpose as well
            Hout_ref_2 = Hout_ref_2.reshape(seq_len * batch_size,
                                            hidden_size + 1).T

            fprop_ref_2_list.append(Hout_ref_2)

        for i in range(num_iter):
            ng.testing.assert_allclose(fprop_neon_2_list[i],
                                       fprop_ref_2_list[i],
                                       rtol=rtol,
                                       atol=atol)
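
Note that rng, rtol, and atol above are module-level fixtures of the original test file, and the copier/copier_T wrappers come from the surrounding test utilities. A minimal sketch of what the wrappers are assumed to do: copy each executor result so later graph runs cannot mutate the compared values, with copier_T additionally transposing each result (the neon weights appear to be stored transposed relative to the numpy reference):

import numpy as np

# hypothetical reconstruction; the real helpers live in the test utilities
def copier(f):
    # wrap an executor so every call returns detached numpy copies
    def wrapped(*args):
        out = f(*args)
        if isinstance(out, (list, tuple)):
            return [np.copy(o) for o in out]
        return np.copy(out)
    return wrapped

def copier_T(f):
    # as copier, but transpose each copied result
    def wrapped(*args):
        out = f(*args)
        if isinstance(out, (list, tuple)):
            return [np.copy(o).T for o in out]
        return np.copy(out).T
    return wrapped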
Example #2
def check_lstm(seq_len,
               input_size,
               hidden_size,
               batch_size,
               init_func,
               return_seq=True,
               backward=False,
               reset_cells=False,
               num_iter=2):

    Cin = ng.make_axis(input_size, name='Feature')
    REC = ng.make_axis(seq_len, name='REC')
    N = ng.make_axis(batch_size, name='N')

    with ExecutorFactory() as ex:
        np.random.seed(0)

        inp_ng = ng.placeholder([Cin, REC, N])

        lstm_ng = LSTM(hidden_size,
                       init_func,
                       activation=Tanh(),
                       gate_activation=Logistic(),
                       reset_cells=reset_cells,
                       return_sequence=return_seq,
                       backward=backward)

        out_ng = lstm_ng(inp_ng)

        fprop_neon_fun = copier(ex.executor((out_ng, lstm_ng.h_init), inp_ng))

        gates = ['i', 'f', 'o', 'g']
        Wxh_neon_fun = copier_T(
            ex.executor(list(lstm_ng.W_input[k] for k in gates)))
        Whh_neon_fun = copier_T(
            ex.executor(list(lstm_ng.W_recur[k] for k in gates)))
        bh_neon_fun = copier(ex.executor(list(lstm_ng.b[k] for k in gates)))

        fprop_neon_list = []
        input_value_list = []

        for i in range(num_iter):
            # fprop on random inputs
            input_value = rng.uniform(-1, 1, inp_ng.axes)
            fprop_neon, h_init_neon = fprop_neon_fun(input_value)

            if return_seq is True:
                fprop_neon = fprop_neon[:, :, 0]

            input_value_list.append(input_value)
            fprop_neon_list.append(fprop_neon)

            if reset_cells is False:
                # look at the last hidden states
                ng.testing.assert_allclose(fprop_neon[:, -1].reshape(-1, 1),
                                           h_init_neon,
                                           rtol=rtol,
                                           atol=atol)

        # after the rnn graph has been executed, we can read out the W values.
        # Take copies so that shared buffers don't confuse the derivatives.
        # Concatenate the per-gate weights in i, f, o, g order.
        Wxh_neon = Wxh_neon_fun()
        Whh_neon = Whh_neon_fun()
        bh_neon = bh_neon_fun()

        # reference numpy LSTM
        lstm_ref = RefLSTM()
        WLSTM = lstm_ref.init(input_size, hidden_size)

        # make the ref weights and biases match the neon model
        WLSTM[0, :] = np.concatenate(bh_neon)
        WLSTM[1:input_size + 1, :] = np.concatenate(Wxh_neon, 1)
        WLSTM[input_size + 1:] = np.concatenate(Whh_neon, 1)

        # transpose input X and do fprop
        fprop_ref_list = []
        c0 = h0 = None
        for i in range(num_iter):
            input_value = input_value_list[i]
            inp_ref = input_value.copy().transpose([1, 2, 0])
            (Hout_ref, cprev, hprev,
             batch_cache) = lstm_ref.forward(inp_ref, WLSTM, c0, h0)
            if reset_cells is False:
                c0 = cprev
                h0 = hprev

            # the output needs transpose as well
            Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
            fprop_ref_list.append(Hout_ref)

        for i in range(num_iter):
            ng.testing.assert_allclose(fprop_neon_list[i],
                                       fprop_ref_list[i],
                                       rtol=rtol,
                                       atol=atol)
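
Both checks copy the neon parameters into the reference layout: row 0 of WLSTM is the bias, the next input_size rows are the input weights, and the remaining rows are the recurrent weights, with the four gate blocks concatenated column-wise in i, f, o, g order. A single-step sketch of the forward pass that layout implies (an assumed reconstruction; RefLSTM itself is not shown on this page):

import numpy as np

def lstm_step(x_t, h_prev, c_prev, WLSTM):
    # x_t: (batch, input_size); h_prev, c_prev: (batch, hidden_size)
    hidden = h_prev.shape[1]
    ones = np.ones((x_t.shape[0], 1))  # bias trick: row 0 of WLSTM is b
    z = np.hstack([ones, x_t, h_prev]).dot(WLSTM)
    sig = lambda v: 1.0 / (1.0 + np.exp(-v))
    i = sig(z[:, 0 * hidden:1 * hidden])    # input gate
    f = sig(z[:, 1 * hidden:2 * hidden])    # forget gate
    o = sig(z[:, 2 * hidden:3 * hidden])    # output gate
    g = np.tanh(z[:, 3 * hidden:])          # candidate cell state
    c_t = f * c_prev + i * g
    h_t = o * np.tanh(c_t)
    return h_t, c_t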
Example #3
def check_lstm(seq_len,
               input_size,
               hidden_size,
               batch_size,
               init_func,
               inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make the ref weights and biases match the neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref,
     dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM matches numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return
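
A possible driver for this check is sketched below; it assumes neon's gen_backend and Gaussian initializer, whose exact signatures may differ between neon versions:

from neon.backends import gen_backend
from neon.initializers import Gaussian

# generate a backend first so NervanaObject.be exists, then run the check
be = gen_backend('cpu', batch_size=4, rng_seed=0)
check_lstm(seq_len=5, input_size=3, hidden_size=6, batch_size=4,
           init_func=Gaussian(loc=0.0, scale=0.1))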
Example #4
def gradient_check_ref(seq_len,
                       input_size,
                       hidden_size,
                       batch_size,
                       epsilon=1.0e-5,
                       dtypeu=np.float64,
                       threshold=1e-4):
    # this is a check of the reference code itself:
    # it estimates the gradients by adding perturbations
    # to the input and compares the finite-difference
    # estimates to the deltas calculated in bprop

    # generate sparse random input matrix
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size
    input_shape = (seq_len, input_size, batch_size)
    # hidden_shape = (seq_len, hidden_size, batch_size)
    (inp_bl, nz_inds) = sparse_rand(input_shape,
                                    frac=1.0 / float(input_shape[1]))
    inp_bl = np.random.randn(*input_shape)  # note: replaces the sparse input with dense Gaussian noise

    # convert input matrix from neon to ref code format
    inp_bl = inp_bl.swapaxes(1, 2).astype(dtypeu)

    # generate reference LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size).astype(dtypeu)

    # init parameters as done for neon
    WLSTM = np.random.randn(*WLSTM.shape)

    (Hout, cprev, hprev, cache) = lstm_ref.forward(inp_bl, WLSTM)

    # scale Hout by random matrix...
    rand_scale = np.random.random(Hout.shape) * 2.0 - 1.0
    rand_scale = dtypeu(rand_scale)

    # line below would be the loss function
    # loss_bl = np.sum(rand_scale * Hout)

    # run bprop, input deltas is rand_scale
    (dX_bl, dWLSTM_bl, dc0, dh0) = lstm_ref.backward(rand_scale, cache)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inp_bl.size):
        save_val = inp_pert.flat[pert_ind]

        # add/subtract perturbations to input
        inp_pert.flat[pert_ind] = save_val + epsilon
        # and run fprop on perturbed input
        (Hout_pos, cprev, hprev, cache) = lstm_ref.forward(inp_pert, WLSTM)

        inp_pert.flat[pert_ind] = save_val - epsilon
        (Hout_neg, cprev, hprev, cache) = lstm_ref.forward(inp_pert, WLSTM)

        # calculate the loss on outputs
        loss_pos = np.sum(rand_scale * Hout_pos)
        loss_neg = np.sum(rand_scale * Hout_neg)

        grads_est.flat[pert_ind] = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        # reset input
        inp_pert.flat[pert_ind] = save_val

    # assert that the gradient estimates are within the relative
    # threshold of the bprop-calculated deltas
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
    return
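
The perturbation loop above is the standard central-difference estimate dL/dx_i ≈ (loss(x_i + epsilon) - loss(x_i - epsilon)) / (2 * epsilon), applied one element at a time. The same scheme as a standalone helper (a sketch under the same assumptions):

import numpy as np

def numeric_grad(loss_fn, x, epsilon=1.0e-5):
    # estimate dL/dx elementwise by central differences
    grads = np.zeros_like(x)
    for ind in range(x.size):
        saved = x.flat[ind]
        x.flat[ind] = saved + epsilon
        loss_pos = loss_fn(x)
        x.flat[ind] = saved - epsilon
        loss_neg = loss_fn(x)
        grads.flat[ind] = (loss_pos - loss_neg) / (2.0 * epsilon)
        x.flat[ind] = saved  # restore before perturbing the next element
    return grads

With loss_fn set to lambda inp: np.sum(rand_scale * lstm_ref.forward(inp, WLSTM)[0]), this reproduces grads_est above.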
Example #5
def check_lstm(seq_len, input_size, hidden_size,
               batch_size, init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    # unlike Example #3, which passes raw iobufs to set_deltas, this version
    # allocates deltas through a DeltasTree (a later neon buffer-allocation API)
    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make the ref weights and biases match the neon model
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref,
                                                             WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM matches numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(),
                             db_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return