Beispiel #1
0
def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    """
    Compare the LSTM input gradient from bprop with a numerical estimate.

    The scalar "loss" is np.sum(rand_scale * output), so rand_scale is fed
    to bprop as the error tensor and the estimate is a central difference
    of that loss with respect to every input element.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of LSTM hidden units
        batch_size: minibatch size, written to the shared backend
        epsilon: perturbation applied (+/-) to each input element
        rand_scale: weights of the fake loss; when None they are drawn
                    uniformly from [-1, 1) with the baseline output shape
        inp_bl: baseline input; when None it is drawn from a standard
                normal with shape (input_size, seq_len * batch_size)

    Returns:
        tuple (grads_est, deltas_neon): the central-difference estimate and
        the host copy of what lstm.bprop returned
    """
    backend = NervanaObject.be
    backend.bsz = batch_size
    backend.batch_size = batch_size

    # draw a baseline input when the caller did not supply one
    if inp_bl is None:
        inp_bl = np.random.randn(input_size, seq_len * batch_size)

    # layer under test and a device copy of the baseline input
    lstm = LSTM(hidden_size, Gaussian(), Tanh(), Logistic())
    inpa = lstm.be.array(np.copy(inp_bl))

    # baseline forward pass
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    baseline_out = lstm.fprop(inpa).get()

    # random weights that turn the output into a scalar fake loss
    if rand_scale is None:
        rand_scale = np.random.random(baseline_out.shape) * 2.0 - 1.0

    # analytic gradient: back-propagate the loss weights (copied so the
    # layer cannot alias the host array)
    errors = lstm.be.array(np.copy(rand_scale))
    deltas_neon = lstm.bprop(errors).get()

    # numerical gradient: central difference, one input element at a time
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for idx in range(inpa.size):
        original = inp_pert.flat[idx]

        # forward pass at (x + eps) first, then at (x - eps)
        outputs = []
        for shifted in (original + epsilon, original - epsilon):
            inp_pert.flat[idx] = shifted
            reset_lstm(lstm)
            lstm.allocate()
            outputs.append(lstm.fprop(lstm.be.array(inp_pert)).get())
        out_pos, out_neg = outputs

        # fake loss at both perturbed points
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        grads_est.flat[idx] = 0.5 * (loss_pos - loss_neg) / epsilon

        # undo the perturbation before moving to the next element
        inp_pert.flat[idx] = original

    del lstm
    return (grads_est, deltas_neon)
Beispiel #2
0
def gradient_calc(seq_len,
                  input_size,
                  hidden_size,
                  batch_size,
                  epsilon=None,
                  rand_scale=None,
                  inp_bl=None):
    """
    Compare the LSTM input gradient from bprop with a numerical estimate.

    The scalar "loss" is np.sum(rand_scale * output), so rand_scale is fed
    to bprop as the error tensor and the estimate is a central difference
    of that loss with respect to every input element.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of LSTM hidden units
        batch_size: minibatch size, written to the shared backend
        epsilon: perturbation applied (+/-) to each input element
        rand_scale: weights of the fake loss; when None they are drawn
                    uniformly from [-1, 1) with the baseline output shape
        inp_bl: baseline input; when None it is drawn from a standard
                normal with shape (input_size, seq_len * batch_size)

    Returns:
        tuple (grads_est, deltas_neon): the central-difference estimate and
        the host copy of what lstm.bprop returned
    """
    backend = NervanaObject.be
    backend.bsz = batch_size
    backend.batch_size = batch_size

    # draw a baseline input when the caller did not supply one
    if inp_bl is None:
        inp_bl = np.random.randn(input_size, seq_len * batch_size)

    # layer under test and a device copy of the baseline input
    lstm = LSTM(hidden_size, Gaussian(), Tanh(), Logistic())
    inpa = lstm.be.array(np.copy(inp_bl))

    # baseline forward pass
    baseline_out = lstm.fprop(inpa).get()

    # random weights that turn the output into a scalar fake loss
    if rand_scale is None:
        rand_scale = np.random.random(baseline_out.shape) * 2.0 - 1.0

    # analytic gradient: back-propagate the loss weights (copied so the
    # layer cannot alias the host array)
    errors = lstm.be.array(np.copy(rand_scale))
    deltas_neon = lstm.bprop(errors).get()

    # numerical gradient: central difference, one input element at a time
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for idx in range(inpa.size):
        original = inp_pert.flat[idx]

        # forward pass at (x + eps) first, then at (x - eps)
        outputs = []
        for shifted in (original + epsilon, original - epsilon):
            inp_pert.flat[idx] = shifted
            reset_lstm(lstm)
            outputs.append(lstm.fprop(lstm.be.array(inp_pert)).get())
        out_pos, out_neg = outputs

        # fake loss at both perturbed points
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)
        grads_est.flat[idx] = 0.5 * (loss_pos - loss_neg) / epsilon

        # undo the perturbation before moving to the next element
        inp_pert.flat[idx] = original

    del lstm
    return (grads_est, deltas_neon)
Beispiel #3
0
def test_biLSTM_fprop_rnn(backend_default, fargs):
    """
    Check BiLSTM fprop against a time-reversed run and against a plain LSTM.

    The backward weights of the BiLSTM are overwritten with its forward
    weights, and the same forward weights are copied into a uni-directional
    LSTM. The BiLSTM is then run on a random input and on that input with
    its time steps reversed. Per time step the test asserts that the two
    halves of the BiLSTM output swap roles between the two runs, and that
    the LSTM output equals the first half (rows [:hidden_size]).

    Arguments:
        backend_default: backend fixture (not referenced directly here)
        fargs: tuple (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    bi_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size, gate_activation=Logistic(),
                    activation=Tanh(), init=bi_init, reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # uni-directional reference layer
    uni_init = GlorotUniform()
    rnn = LSTM(hidden_size, gate_activation=Logistic(),
               activation=Tanh(), init=uni_init, reset_cells=True)
    rnn.configure(in_shape)
    rnn.prev_layer = True
    rnn.allocate()

    # tie the weights: backward direction := forward direction ...
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0
    # ... and the reference LSTM := forward direction as well
    rnn.W_input[:] = bilstm.W_input_f
    rnn.W_recur[:] = bilstm.W_recur_f
    rnn.b[:] = bilstm.b_f
    rnn.dW[:] = 0

    # random input plus a copy with the time steps in reverse order
    fwd_np = np.random.random((input_size, seq_len * batch_size))
    reversed_steps = list(reversed(get_steps(fwd_np.copy(), in_shape)))
    rev_np = con(reversed_steps, axis=1)

    inp_lr = bilstm.be.array(fwd_np)
    inp_rl = bilstm.be.array(rev_np)
    inp_rnn = rnn.be.array(fwd_np)

    # forward passes; h_buffer is zeroed between the two BiLSTM runs
    out_lr = bilstm.fprop(inp_lr).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get()
    out_rnn = rnn.fprop(inp_rnn).get().copy()

    # split each output into per-time-step pieces, per direction
    lr_fwd_steps = get_steps(out_lr[:hidden_size], out_shape)
    lr_bwd_steps = get_steps(out_lr[hidden_size:], out_shape)
    rl_fwd_steps = get_steps(out_rl[:hidden_size], out_shape)
    rl_bwd_steps = get_steps(out_rl[hidden_size:], out_shape)
    rnn_steps = get_steps(out_rnn, out_shape)

    # compare step by step; the reversed-input run is walked back to front
    tol = dict(rtol=0.0, atol=1.0e-5)
    per_step = zip(rnn_steps, lr_fwd_steps, lr_bwd_steps,
                   reversed(rl_fwd_steps), reversed(rl_bwd_steps))
    for ref, fwd, bwd, rev_fwd, rev_bwd in per_step:
        assert allclose_with_out(fwd, rev_bwd, **tol)
        assert allclose_with_out(bwd, rev_fwd, **tol)
        assert allclose_with_out(ref, fwd, **tol)
        assert allclose_with_out(ref, rev_bwd, **tol)
Beispiel #4
0
def test_biLSTM_fprop_rnn(backend_default, fargs):
    """
    Check BiLSTM fprop against a time-reversed run and against a plain LSTM.

    The backward weights of the BiLSTM are overwritten with its forward
    weights, and the same forward weights are copied into a uni-directional
    LSTM. The BiLSTM is then run on a random input and on that input with
    its time steps reversed. Per time step the test asserts that the two
    halves of the BiLSTM output swap roles between the two runs, and that
    the LSTM output equals the first half (rows [:hidden_size]).

    Arguments:
        backend_default: backend fixture (not referenced directly here)
        fargs: tuple (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    out_shape = (hidden_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # bi-directional layer under test
    bi_init = GlorotUniform()
    bilstm = BiLSTM(hidden_size,
                    gate_activation=Logistic(),
                    activation=Tanh(),
                    init=bi_init,
                    reset_cells=True)
    bilstm.configure(in_shape)
    bilstm.prev_layer = True
    bilstm.allocate()

    # uni-directional reference layer
    uni_init = GlorotUniform()
    rnn = LSTM(hidden_size,
               gate_activation=Logistic(),
               activation=Tanh(),
               init=uni_init,
               reset_cells=True)
    rnn.configure(in_shape)
    rnn.prev_layer = True
    rnn.allocate()

    # tie the weights: backward direction := forward direction ...
    bilstm.W_input_b[:] = bilstm.W_input_f
    bilstm.W_recur_b[:] = bilstm.W_recur_f
    bilstm.b_b[:] = bilstm.b_f
    bilstm.dW[:] = 0
    # ... and the reference LSTM := forward direction as well
    rnn.W_input[:] = bilstm.W_input_f
    rnn.W_recur[:] = bilstm.W_recur_f
    rnn.b[:] = bilstm.b_f
    rnn.dW[:] = 0

    # random input plus a copy with the time steps in reverse order
    fwd_np = np.random.random((input_size, seq_len * batch_size))
    reversed_steps = list(reversed(get_steps(fwd_np.copy(), in_shape)))
    rev_np = con(reversed_steps, axis=1)

    inp_lr = bilstm.be.array(fwd_np)
    inp_rl = bilstm.be.array(rev_np)
    inp_rnn = rnn.be.array(fwd_np)

    # forward passes; h_buffer is zeroed between the two BiLSTM runs
    out_lr = bilstm.fprop(inp_lr).get().copy()
    bilstm.h_buffer[:] = 0
    out_rl = bilstm.fprop(inp_rl).get()
    out_rnn = rnn.fprop(inp_rnn).get().copy()

    # split each output into per-time-step pieces, per direction
    lr_fwd_steps = get_steps(out_lr[:hidden_size], out_shape)
    lr_bwd_steps = get_steps(out_lr[hidden_size:], out_shape)
    rl_fwd_steps = get_steps(out_rl[:hidden_size], out_shape)
    rl_bwd_steps = get_steps(out_rl[hidden_size:], out_shape)
    rnn_steps = get_steps(out_rnn, out_shape)

    # compare step by step; the reversed-input run is walked back to front
    tol = dict(rtol=0.0, atol=1.0e-5)
    per_step = zip(rnn_steps, lr_fwd_steps, lr_bwd_steps,
                   reversed(rl_fwd_steps), reversed(rl_bwd_steps))
    for ref, fwd, bwd, rev_fwd, rev_bwd in per_step:
        assert allclose_with_out(fwd, rev_bwd, **tol)
        assert allclose_with_out(bwd, rev_fwd, **tol)
        assert allclose_with_out(ref, fwd, **tol)
        assert allclose_with_out(ref, rev_bwd, **tol)
Beispiel #5
0
def test_beamsearch(backend_default):
    """
    Simulated beam search on a minibatch of 2, for 4 time steps. The
    LSTM states are real but the "softmax outputs" z are hardcoded and
    not taken from the network.
    There are 6 tokens the network outputs, and they have probabilities
    like exp(1), exp(5), exp(7)

    The decoder's fprop is replaced by a stub that returns the hardcoded
    z arrays one after another. After running the search with 3 beams, the
    candidates extracted by reformat_samples are asserted to equal the
    hand-derived sequences ex0 (first example) and ex1 (second example).
    """
    be = backend_default

    batch_size = 2
    be.bsz = batch_size
    time_steps = 4
    nout = 6
    num_beams = 3

    # create unused layers
    # (the decoder's fprop is replaced further down, so its weights never
    # produce the outputs that are scored)
    activation = Tanh()
    gate_activation = Logistic()
    init_ary = np.eye(nout)
    init = Array(init_ary)
    encoder = LSTM(nout, init,
                   activation=activation, gate_activation=gate_activation,
                   name="Enc")
    decoder = LSTM(nout, init,
                   activation=activation, gate_activation=gate_activation,
                   name="Dec")

    class DummyFProp():
        """
        Constructs an artificial beam search example with known correct outputs.
        This is called inside a nested loop over steps, num_life. In the first
        time step there is one life beam, after that, 3 life beams per step.
        There are 4 time steps total. Each beamsearch_step builds one list over
        num_life beams.

        NOTE(review): the hand-worked walk-through below is kept from the
        original author. Some of its labels and numbers do not obviously
        match the arrays in __init__ (e.g. it refers to "ex2" although the
        minibatch only has examples 0 and 1); treat the asserted ex0/ex1
        arrays at the end of the test as the source of truth.

        At t=0, the winners for ex0 are 1, 4, 5 (indexed by their position) and
        winners for ex1 are 2,4,5. From there we continue the beam for ex0:
            12, 13, 14              6+2=8 6+3=9  6+2=8
            40, 43, 45  with scores 5+4=9 5+3=8  5+7=12 three new winners 45, 52, 55
            50, 52, 55              5+4=9 5+6=11 5+5=10

        for ex2
            1 4 5  with scores   5 4 7
        we get the three winners 1, 4, 5 and continue (just taking the
        3 in order, no sorting)
            10 12 13 14 (not unique!)  5+2=7  5+2=7  5+3=8
            41 42 43       with scores 4+6=10 4+5=9  4+7=11 winners  43 51 52
            51 52 53                   7+4=11 7+6=13 7+3=10 scores   11 11 13
        continue from the three winners 43 51 52
            431 433 434             11+10=21 11+3=14 11+9=20
            511 512 513 with scores 11+6=17  11+5=16 11+7=18  winners 431 434 520
            520 521 522             13+8=21  13+4=17 13+6=19  scores   21  20  21
        continue from three winners 431 511 513 (going along beams, the matches
        in a beam)
            4310 4312 4313 4314             21+2=23  21+2=23 21+3=24 21+10=31 (not unique!)
            4341 4342 4343      with scores 20+10=30 20+5=25 20+7=27        winners 4314 4341 5204
            5200 5202 5204                  21+8=29  21+6=27 21+10=31       scores    31   30   31
        overall winners are 4314 4341 5204

        """
        def __init__(self):
            # index of the most recently returned entry of z_list; fprop
            # increments it before indexing, so the first call returns z_list[0]
            self.i = -1
            # Each z below is built from a (2, 6) array (one row per example,
            # one column per token), exponentiated and transposed.
            # t=0
            #                                 X        x  x  <-- winners: 1, 4, 5  (for example 0)
            z = be.array(np.exp(np.array([[1, 6, 2, 1, 5, 5],
                                          [1, 5, 2, 2, 4, 7]]))).T

            # t=1
            #                                     x  x  x  <-- given we picked 4: new winners 2,3,4
            z1 = be.array(np.exp(np.array([[1, 1, 2, 3, 2, 1],
                                           [2, 1, 2, 3, 2, 1]]))).T
            #                               x        x     x  <-- given we picked 5:
            #                                                     new winners 0,3,[5]
            #                                                     score 12
            z2 = be.array(np.exp(np.array([[4, 1, 2, 3, 1, 7],
                                           [2, 6, 5, 7, 2, 4]]))).T
            #                               x     X        X  <-- given we picked 1:
            #                                                     new winners 0,[2],[5]
            #                                                     scores 12, 11
            z3 = be.array(np.exp(np.array([[4, 1, 6, 3, 1, 5],
                                           [1, 4, 6, 3, 2, 1]]))).T

            # t=2
            # example 0: given constructed (1, 5), score 11: 3, 4; scores 21, 20
            z4 = be.array(np.exp(np.array([[1, 1, 2, 10, 9, 1],
                                           [2, 10, 2, 3, 9, 1]]))).T
            # example 0: given constructed (5, 5), score 12: none selected from this beam
            z5 = be.array(np.exp(np.array([[4, 1, 2, 3, 1, 7],
                                           [2, 6, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 2), score 12: 1; score 20
            z6 = be.array(np.exp(np.array([[4, 8, 6, 3, 1, 5],
                                           [8, 4, 6, 3, 1, 1]]))).T

            # t=3
            # example 0: given constructed (1, 5, 4), score 20: 1, score 30
            z7 = be.array(np.exp(np.array([[1, 10, 2, 1, 1, 1],
                                           [2, 1, 2, 3, 10, 1]]))).T
            # example 0: given constructed (1, 2, 1), score 20: 5, score 30
            z8 = be.array(np.exp(np.array([[4, 1, 2, 3, 1, 10],
                                           [2, 10, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 5, 3), score 21: 4, score 31
            z9 = be.array(np.exp(np.array([[4, 8, 6, 3, 10, 5],
                                           [8, 4, 6, 3, 10, 1]]))).T

            # returned in exactly this order: 1 array for t=0, then 3 per
            # later time step
            self.z_list = [z, z1, z2, z3, z4, z5, z6, z7, z8, z9]

        def fprop(self, z, inference=True, init_state=None):
            # All arguments are ignored; every call just hands out the next
            # scripted output (an 11th call would raise IndexError).
            self.i += 1
            return self.z_list[self.i]

    def final_state():
        # zero tensor with the shape of the decoder's last hidden state
        return be.zeros_like(decoder.h[-1])

    class InObj(NervanaObject):
        # Minimal stand-in for a dataset object: it only carries the two
        # shape attributes set below.
        def __init__(self):
            self.shape = (nout, time_steps)
            self.decoder_shape = (nout, time_steps)

    # swap the decoder's forward pass for the scripted outputs
    decoder.fprop = DummyFProp().fprop
    layers = Seq2Seq([encoder, decoder], decoder_connections=[0])
    # override final_state on the decoder's first recurrent layer with the
    # zero-returning stub defined above
    layers.decoder._recurrent[0].final_state = final_state

    in_obj = InObj()
    layers.configure(in_obj)  # made zeros because zeros have shape
    layers.allocate()
    layers.allocate_deltas(None)
    beamsearch = BeamSearch(layers)
    inputs = be.iobuf(in_obj.shape)
    beamsearch.beamsearch(inputs, num_beams=num_beams)

    # expected candidates: one row per beam, one column per time step
    ex0 = np.array([[1, 5, 4, 1],
                    [1, 2, 1, 5],
                    [1, 5, 3, 4]])
    ex1 = np.array([[5, 1, 4, 4],
                    [5, 1, 1, 1],
                    [5, 2, 0, 4]])

    # extract all candidates
    examples = reformat_samples(beamsearch, num_beams, batch_size)
    assert allclose_with_out(examples[0], ex0)
    assert allclose_with_out(examples[1], ex1)
Beispiel #6
0
def check_lstm(seq_len,
               input_size,
               hidden_size,
               batch_size,
               init_func,
               inp_moms=(0.0, 1.0)):
    """
    Verify the neon LSTM against the numpy reference LSTM in fprop and bprop.

    The neon layer is run on a random input, its weights are copied into the
    reference implementation, and gate activations, cell states, hidden
    states, weight/bias gradients and input deltas are compared with an
    absolute tolerance of 1e-5.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of hidden units
        batch_size: minibatch size, written to the shared backend
        init_func: initializer for the model params
        inp_moms: (offset, scale) applied to the random input, i.e.
                  input = rand * inp_moms[1] + inp_moms[0]. The original
                  comment called these [mean, std dev], but np.random.rand
                  draws uniform samples from [0, 1).

    Raises:
        AssertionError: if any fprop or bprop comparison fails
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # reference layout: row 0 = bias, next input_size rows = input weights,
    # remaining rows = recurrent weights
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, _cprev, _hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These fprop comparisons are asserted, like the bprop ones below;
    # previously their return value was discarded, so a mismatch could never
    # fail the check.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.h_buffer.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, _dc0_ref,
     _dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    # split the reference gradient using the same row layout as WLSTM
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')

    return
Beispiel #7
0
def check_lstm(seq_len, input_size, hidden_size,
               batch_size, init_func, inp_moms=(0.0, 1.0)):
    """
    Verify the neon LSTM against the numpy reference LSTM in fprop and bprop.

    The neon layer is configured, allocated and run on a random input, its
    weights are copied into the reference implementation, and gate
    activations, cell states, hidden states, weight/bias gradients and input
    deltas are compared with an absolute tolerance of 1e-5.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of hidden units
        batch_size: minibatch size, written to the shared backend
        init_func: initializer for the model params
        inp_moms: (offset, scale) applied to the random input, i.e.
                  input = rand * inp_moms[1] + inp_moms[0]. The original
                  comment called these [mean, std dev], but np.random.rand
                  draws uniform samples from [0, 1).

    Raises:
        AssertionError: if any fprop or bprop comparison fails
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # reference layout: row 0 = bias, next input_size rows = input weights,
    # remaining rows = recurrent weights
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size+1, :] = lstm.W_input.get().T
    WLSTM[input_size+1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, _cprev, _hprev, batch_cache) = lstm_ref.forward(inp_ref,
                                                               WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These fprop comparisons are asserted, like the bprop ones below;
    # previously their return value was discarded, so a mismatch could never
    # fail the check.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, _dc0_ref, _dh0_ref) = lstm_ref.backward(deltas_ref,
                                                                 batch_cache)
    # split the reference gradient using the same row layout as WLSTM
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size+1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(),
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')

    return
def test_beamsearch(backend_default):
    """
    Simulated beam search on a minibatch of 2, for 4 time steps. The
    LSTM states are real but the "softmax outputs" z are hardcoded and
    not taken from the network.
    There are 6 tokens the network outputs, and they have probabilities
    like exp(1), exp(5), exp(7)

    The decoder's fprop is replaced by a stub that returns the hardcoded
    z arrays one after another. After running the search with 3 beams, the
    candidates extracted by reformat_samples are asserted to equal the
    hand-derived sequences ex0 (first example) and ex1 (second example).
    """
    be = backend_default

    batch_size = 2
    be.bsz = batch_size
    time_steps = 4
    nout = 6
    num_beams = 3

    # create unused layers
    # (the decoder's fprop is replaced further down, so its weights never
    # produce the outputs that are scored)
    activation = Tanh()
    gate_activation = Logistic()
    init_ary = np.eye(nout)
    init = Array(init_ary)
    encoder = LSTM(nout,
                   init,
                   activation=activation,
                   gate_activation=gate_activation,
                   name="Enc")
    decoder = LSTM(nout,
                   init,
                   activation=activation,
                   gate_activation=gate_activation,
                   name="Dec")

    class DummyFProp():
        """
        Constructs an artificial beam search example with known correct outputs.
        This is called inside a nested loop over steps, num_life. In the first
        time step there is one life beam, after that, 3 life beams per step.
        There are 4 time steps total. Each beamsearch_step builds one list over
        num_life beams.

        NOTE(review): the hand-worked walk-through below is kept from the
        original author. Some of its labels and numbers do not obviously
        match the arrays in __init__ (e.g. it refers to "ex2" although the
        minibatch only has examples 0 and 1); treat the asserted ex0/ex1
        arrays at the end of the test as the source of truth.

        At t=0, the winners for ex0 are 1, 4, 5 (indexed by their position) and
        winners for ex1 are 2,4,5. From there we continue the beam for ex0:
            12, 13, 14              6+2=8 6+3=9  6+2=8
            40, 43, 45  with scores 5+4=9 5+3=8  5+7=12 three new winners 45, 52, 55
            50, 52, 55              5+4=9 5+6=11 5+5=10

        for ex2
            1 4 5  with scores   5 4 7
        we get the three winners 1, 4, 5 and continue (just taking the
        3 in order, no sorting)
            10 12 13 14 (not unique!)  5+2=7  5+2=7  5+3=8
            41 42 43       with scores 4+6=10 4+5=9  4+7=11 winners  43 51 52
            51 52 53                   7+4=11 7+6=13 7+3=10 scores   11 11 13
        continue from the three winners 43 51 52
            431 433 434             11+10=21 11+3=14 11+9=20
            511 512 513 with scores 11+6=17  11+5=16 11+7=18  winners 431 434 520
            520 521 522             13+8=21  13+4=17 13+6=19  scores   21  20  21
        continue from three winners 431 511 513 (going along beams, the matches
        in a beam)
            4310 4312 4313 4314             21+2=23  21+2=23 21+3=24 21+10=31 (not unique!)
            4341 4342 4343      with scores 20+10=30 20+5=25 20+7=27        winners 4314 4341 5204
            5200 5202 5204                  21+8=29  21+6=27 21+10=31       scores    31   30   31
        overall winners are 4314 4341 5204

        """
        def __init__(self):
            # index of the most recently returned entry of z_list; fprop
            # increments it before indexing, so the first call returns z_list[0]
            self.i = -1
            # Each z below is built from a (2, 6) array (one row per example,
            # one column per token), exponentiated and transposed.
            # t=0
            # winners for example 0 (first row): 1, 4, 5
            z = be.array(
                np.exp(np.array([[1, 6, 2, 1, 5, 5], [1, 5, 2, 2, 4, 7]]))).T

            # t=1
            # given we picked 4: new winners 2,3,4
            z1 = be.array(
                np.exp(np.array([[1, 1, 2, 3, 2, 1], [2, 1, 2, 3, 2, 1]]))).T
            # given we picked 5:
            # new winners 0,3,[5]
            # score 12
            z2 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]]))).T
            # given we picked 1:
            # new winners 0,[2],[5]
            # scores 12, 11
            z3 = be.array(
                np.exp(np.array([[4, 1, 6, 3, 1, 5], [1, 4, 6, 3, 2, 1]]))).T

            # t=2
            # example 0: given constructed (1, 5), score 11: 3, 4; scores 21, 20
            z4 = be.array(
                np.exp(np.array([[1, 1, 2, 10, 9, 1], [2, 10, 2, 3, 9, 1]]))).T
            # example 0: given constructed (5, 5), score 12: none selected from this beam
            z5 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 7], [2, 6, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 2), score 12: 1; score 20
            z6 = be.array(
                np.exp(np.array([[4, 8, 6, 3, 1, 5], [8, 4, 6, 3, 1, 1]]))).T

            # t=3
            # example 0: given constructed (1, 5, 4), score 20: 1, score 30
            z7 = be.array(
                np.exp(np.array([[1, 10, 2, 1, 1, 1], [2, 1, 2, 3, 10, 1]]))).T
            # example 0: given constructed (1, 2, 1), score 20: 5, score 30
            z8 = be.array(
                np.exp(np.array([[4, 1, 2, 3, 1, 10], [2, 10, 5, 7, 2, 4]]))).T
            # example 0: given constructed (1, 5, 3), score 21: 4, score 31
            z9 = be.array(
                np.exp(np.array([[4, 8, 6, 3, 10, 5], [8, 4, 6, 3, 10, 1]]))).T

            # returned in exactly this order: 1 array for t=0, then 3 per
            # later time step
            self.z_list = [z, z1, z2, z3, z4, z5, z6, z7, z8, z9]

        def fprop(self, z, inference=True, init_state=None):
            # All arguments are ignored; every call just hands out the next
            # scripted output (an 11th call would raise IndexError).
            self.i += 1
            return self.z_list[self.i]

    def final_state():
        # zero tensor with the shape of the decoder's last hidden state
        return be.zeros_like(decoder.h[-1])

    class InObj(NervanaObject):
        # Minimal stand-in for a dataset object: it only carries the two
        # shape attributes set below.
        def __init__(self):
            self.shape = (nout, time_steps)
            self.decoder_shape = (nout, time_steps)

    # swap the decoder's forward pass for the scripted outputs
    decoder.fprop = DummyFProp().fprop
    layers = Seq2Seq([encoder, decoder], decoder_connections=[0])
    # override final_state on the decoder's first recurrent layer with the
    # zero-returning stub defined above
    layers.decoder._recurrent[0].final_state = final_state

    in_obj = InObj()
    layers.configure(in_obj)  # made zeros because zeros have shape
    layers.allocate()
    layers.allocate_deltas(None)
    beamsearch = BeamSearch(layers)
    inputs = be.iobuf(in_obj.shape)
    beamsearch.beamsearch(inputs, num_beams=num_beams)

    # expected candidates: one row per beam, one column per time step
    ex0 = np.array([[1, 5, 4, 1], [1, 2, 1, 5], [1, 5, 3, 4]])
    ex1 = np.array([[5, 1, 4, 4], [5, 1, 1, 1], [5, 2, 0, 4]])

    # extract all candidates
    examples = reformat_samples(beamsearch, num_beams, batch_size)
    assert allclose_with_out(examples[0], ex0)
    assert allclose_with_out(examples[1], ex1)