Beispiel #1
0
def tensors_allclose(a_tensors, b_tensors, rtol=0, atol=1e-7):
    """
    Compare two tensors, or two lists of tensors, pair by pair.

    Any entry that is a ``Tensor`` is replaced by the result of its ``get()``
    call, and each ``a`` entry is cast to the dtype of its ``b`` counterpart
    before the pair is handed to ``allclose_with_out``.

    Arguments:
        a_tensors: list of tensors, or a tensor
        b_tensors: (another) list of tensors, or a tensor
        rtol (float, optional): Relative tolerance.
        atol (float, optional): Absolute tolerance.
    Returns:
        bool: True if every pair of tensors is close.  False if any pair is
              not close, or if both arguments are lists of different length.
    """
    a_is_list = isinstance(a_tensors, list)
    b_is_list = isinstance(b_tensors, list)

    if not a_is_list and not b_is_list:
        # deal with individual tensor
        a_tensors = [a_tensors]
        b_tensors = [b_tensors]
    elif a_is_list and b_is_list and len(a_tensors) != len(b_tensors):
        # zip() below would silently ignore the unmatched tail and could
        # report a match for lists that do not even have the same length.
        return False

    results = []
    for a_tensor, b_tensor in zip(a_tensors, b_tensors):
        if isinstance(a_tensor, Tensor):
            a_tensor = a_tensor.get()
        if isinstance(b_tensor, Tensor):
            b_tensor = b_tensor.get()
        # Every pair is evaluated (no short-circuit), so allclose_with_out
        # runs for all of them even after a first mismatch.
        results.append(
            allclose_with_out(a_tensor.astype(b_tensor.dtype),
                              b_tensor,
                              rtol=rtol,
                              atol=atol))

    return all(results)
Beispiel #2
0
def test_padding(backend_default, poolargs):
    """
    Check the fprop output of a Pooling layer against ref_pooling.

    Arguments:
        backend_default: fixture argument; not referenced in the body
            (presumably it sets up the backend -- confirm in conftest).
        poolargs: 6-tuple unpacked as
            (fshape, nifm, padding, stride, in_sz, batch_size).
    """
    fshape, nifm, padding, stride, in_sz, batch_size = poolargs

    NervanaObject.be.bsz = batch_size

    # basic sanity check with random inputs
    inshape = (nifm, in_sz, in_sz)
    insize = np.prod(inshape)
    neon_layer = Pooling(fshape=fshape, strides=stride, padding=padding)

    inp = neon_layer.be.array(np.random.random((insize, batch_size)))
    inp.lshape = inshape
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])

    out = neon_layer.fprop(inp).get()

    # Floor division keeps the middle entry an int on Python 3 as well;
    # true division would pass a float to ref_pooling there.
    ncheck = [0, batch_size // 2, batch_size - 1]

    (out_exp, check_inds) = ref_pooling(
        inp, inp.lshape, (fshape, fshape), padding, (stride, stride), neon_layer.be, ncheck=ncheck
    )

    # out_exp supplies the first three output dims; the batch dim is appended
    # so the layer output can be indexed with check_inds on its last axis.
    out_shape = list(out_exp.shape[0:3])
    out_shape.append(batch_size)
    outa = out.reshape(out_shape)

    # zero tolerances: the match must be exact
    assert allclose_with_out(out_exp, outa[:, :, :, check_inds], atol=0.0, rtol=0.0)
def test_padding(backend_default, poolargs):
    """
    Compare Pooling.fprop on random input with the ref_pooling reference.

    Arguments:
        backend_default: fixture argument that the body never reads
            (presumably selects the backend -- confirm in conftest).
        poolargs: 6-tuple unpacked as
            (fshape, nifm, padding, stride, in_sz, batch_size).
    """
    fshape, nifm, padding, stride, in_sz, batch_size = poolargs

    NervanaObject.be.bsz = batch_size

    # basic sanity check with random inputs
    inshape = (nifm, in_sz, in_sz)
    insize = np.prod(inshape)
    neon_layer = Pooling(fshape=fshape, strides=stride, padding=padding)

    inp = neon_layer.be.array(np.random.random((insize, batch_size)))
    inp.lshape = inshape
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])

    out = neon_layer.fprop(inp).get()

    # `//` instead of `/`: on Python 3 true division would produce a float
    # for the middle entry, while Python 2 integer behaviour is unchanged.
    ncheck = [0, batch_size // 2, batch_size - 1]

    (out_exp, check_inds) = ref_pooling(inp, inp.lshape,
                                        (fshape, fshape),
                                        padding,
                                        (stride, stride),
                                        neon_layer.be,
                                        ncheck=ncheck)

    # Reshape the layer output to out_exp's first three dims plus batch so
    # the last axis can be indexed with check_inds.
    out_shape = list(out_exp.shape[0:3])
    out_shape.append(batch_size)
    outa = out.reshape(out_shape)

    # zero tolerances: the match must be exact
    assert allclose_with_out(out_exp, outa[:, :, :, check_inds], atol=0.0, rtol=0.0)
Beispiel #4
0
def tensors_allclose(a_tensors, b_tensors, rtol=0, atol=1e-7):
    """
    Check whether two tensors, or two lists of tensors, are pairwise close.

    Entries that are ``Tensor`` instances are replaced by the result of
    ``get()``; each ``a`` entry is then cast to the dtype of the matching
    ``b`` entry and the pair is compared with ``allclose_with_out``.

    Arguments:
        a_tensors: list of tensors, or a tensor
        b_tensors: (another) list of tensors, or a tensor
        rtol (float, optional): Relative tolerance.
        atol (float, optional): Absolute tolerance.
    Returns:
        bool: True if every pair of tensors is close.  False if any pair is
              not close, or if both arguments are lists of different length.
    """
    both_scalar = (not isinstance(a_tensors, list) and
                   not isinstance(b_tensors, list))
    both_lists = isinstance(a_tensors, list) and isinstance(b_tensors, list)

    if both_scalar:
        # deal with individual tensor
        a_tensors = [a_tensors]
        b_tensors = [b_tensors]
    elif both_lists and len(a_tensors) != len(b_tensors):
        # Without this guard zip() would drop the surplus entries and the
        # comparison could succeed on lists of different length.
        return False

    results = []
    for a_tensor, b_tensor in zip(a_tensors, b_tensors):
        if isinstance(a_tensor, Tensor):
            a_tensor = a_tensor.get()
        if isinstance(b_tensor, Tensor):
            b_tensor = b_tensor.get()
        # All pairs are compared before reducing, so allclose_with_out is
        # invoked for each of them even when an earlier pair mismatched.
        results.append(allclose_with_out(a_tensor.astype(b_tensor.dtype),
                                         b_tensor,
                                         rtol=rtol, atol=atol))

    return all(results)
def test_biSum(backend_default, fargs):
    """
    Check BiSum fprop and bprop on random input.

    fprop is expected to equal the sum of the top and bottom halves of the
    input rows; each half of the bprop result is expected to equal the
    fprop output (which is what gets fed into bprop here).

    Arguments:
        backend_default: fixture argument; not referenced in the body.
        fargs: 4-tuple unpacked as
            (seq_len, input_size, hidden_size, batch_size).
            hidden_size is unpacked but unused.
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    input_size *= 2
    # input_size was just doubled, so this split is exact.  Floor division
    # keeps the slice bound an int on Python 3 (a float bound raises there).
    half = input_size // 2

    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size

    bisum = BiSum()
    bisum.configure(in_shape)
    bisum.prev_layer = True

    bisum.allocate()
    bisum.set_deltas([bisum.be.iobuf(bisum.in_shape)])

    # inputs
    inp_np = np.random.random((input_size, seq_len * batch_size))
    inp_be = bisum.be.array(inp_np)

    # outputs
    out_be = bisum.fprop(inp_be)
    del_be = bisum.bprop(out_be)

    out_ref = bisum.be.empty_like(out_be)
    out_ref[:] = inp_be[:half] + inp_be[half:]
    assert out_be.shape[0] * 2 == inp_be.shape[0]
    assert allclose_with_out(out_be.get(),
                             out_ref.get(),
                             rtol=0.0,
                             atol=1.0e-5)

    assert allclose_with_out(del_be[:half].get(),
                             out_be.get(),
                             rtol=0.0,
                             atol=1.0e-5)
    assert allclose_with_out(del_be[half:].get(),
                             out_be.get(),
                             rtol=0.0,
                             atol=1.0e-5)
def mergesum_test_config(be, modfunc, use_stride=1):
    """
    Compare a module built by ``modfunc`` against two explicit reference paths.

    The module is run behind a Conv trunk inside one Sequential.  Two
    separate Sequentials, each with its own copy of the trunk, serve as the
    reference: their fprop outputs are summed and clamped at zero, and their
    bprop results are summed.

    Arguments:
        be: backend used to create arrays and evaluate the reference ops.
        modfunc: factory called as ``modfunc(16, use_stride)``; its result is
            concatenated to a list of layers.
        use_stride (int, optional): forwarded to ``modfunc`` and to
            ``module_factory_copy``.

    Note:
        ``batch_size`` is not a parameter; it is read from the enclosing
        module scope.
    """
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)

    neon_seq.allocate()
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))

    # Give both reference trunks the same parameters as the tested trunk.
    for trunk_copy in (l11, l12):
        for lcopy, lref in zip(trunk_copy, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for path in (path1, path2):
        path.configure(inshape)
        path.allocate()
        path.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Fprop matching")
    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)

    # Back-propagate through the last two layers of the tested sequence only.
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # Reference error: mask the incoming error where the reference output
    # is not positive.
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    # Skip the trunk layers when back-propagating through the reference paths.
    pstart = len(l1)
    eb1 = err
    for layer in reversed(path1.layers[pstart:]):
        eb1 = layer.bprop(eb1)

    eb2 = err
    for layer in reversed(path2.layers[pstart:]):
        eb2 = layer.bprop(eb2)

    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
Beispiel #7
0
def check_lstm(seq_len,
               input_size,
               hidden_size,
               batch_size,
               init_func,
               inp_moms=[0.0, 1.0]):
    """
    Compare the neon LSTM layer with the numpy reference LSTM.

    Runs fprop and bprop on both implementations with shared weights and
    asserts that the gate buffer, cell states, hidden states, weight/bias
    gradients and output deltas agree within an absolute tolerance of 1e-5.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [offset, scale] applied to the uniform random input,
            i.e. ``inp = rand * inp_moms[1] + inp_moms[0]``.  Only read,
            never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # (row 0 holds the bias, then the input weights, then the recurrent ones)
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These comparisons are asserted: previously their result was discarded,
    # so 'fprop is verified' was printed even on a mismatch.
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.h_buffer.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref,
     dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    # split the reference gradient with the same row layout as WLSTM above
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')
Beispiel #8
0
def check_rnn(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon Recurrent layer with the numpy reference RNN.

    Both implementations run on the same random input and deltas with shared
    weights; hidden states and the weight/bias gradients are asserted to
    agree within an absolute tolerance of 1e-5.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [offset, scale] applied to the uniform random input,
            i.e. ``inp = rand * inp_moms[1] + inp_moms[0]``.  Only read,
            never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('====Verifying hidden states====')
    hidden_ok = allclose_with_out(rnn.h_buffer.get(),
                                  h_ref_list,
                                  rtol=0.0,
                                  atol=1.0e-5)
    print(hidden_ok)
    # The result used to be printed only, so 'fprop is verified' appeared
    # even on a mismatch; it is now enforced like the bprop checks below.
    assert hidden_ok
    print('fprop is verified')

    print('====Verifying update on W and b ====')
    print('dWxh')
    assert allclose_with_out(dWxh_neon,
                             dWxh_ref,
                             rtol=0.0,
                             atol=1.0e-5)
    print('dWhh')
    assert allclose_with_out(dWhh_neon,
                             dWhh_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    print('db')
    assert allclose_with_out(db_neon,
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')
Beispiel #9
0
def check_lstm(seq_len, input_size, hidden_size,
               batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon LSTM layer with the numpy reference LSTM.

    The layer is configured and allocated, then fprop and bprop are run on
    both implementations with shared weights.  The gate buffer, cell states,
    hidden states, weight/bias gradients and output deltas are asserted to
    agree within an absolute tolerance of 1e-5.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [offset, scale] applied to the uniform random input,
            i.e. ``inp = rand * inp_moms[1] + inp_moms[0]``.  Only read,
            never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)
    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # (row 0 holds the bias, then the input weights, then the recurrent ones)
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size+1, :] = lstm.W_input.get().T
    WLSTM[input_size+1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref,
                                                             WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These comparisons are asserted: previously their result was discarded,
    # so 'fprop is verified' was printed even on a mismatch.
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(),
                             IFOGf_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(),
                             Ct_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(),
                             Hout_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    # split the reference gradient with the same row layout as WLSTM above
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size+1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')

    assert allclose_with_out(dWrecur_neon,
                             dWrecur_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon,
                             dWinput_ref.T,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(),
                             db_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(),
                             dX_ref,
                             rtol=0.0,
                             atol=1.0e-5)

    print('bprop is verified')
Beispiel #10
0
def gradient_check_ref(seq_len, input_size, hidden_size, batch_size,
                       epsilon=1.0e-5, dtypeu=np.float64, threshold=1e-4):
    """
    Numerically validate the input gradient of the reference LSTM itself.

    The loss is ``sum(rand_scale * Hout)``.  Every input element is nudged by
    +epsilon and -epsilon, the central-difference slope of the loss is
    recorded, and the estimates are asserted to match the dX returned by
    ``lstm_ref.backward`` within a relative tolerance of ``threshold``.
    """
    NervanaObject.be.bsz = batch_size
    NervanaObject.be.batch_size = batch_size

    shape = (seq_len, input_size, batch_size)
    # The sparse sample is generated but never used: the dense random draw
    # on the next line is what feeds the check.
    (_sparse_inp, _nz_inds) = sparse_rand(shape, frac=1.0 / shape[1])
    dense_inp = np.random.randn(*shape)

    # convert input matrix from neon to ref code format
    inp_bl = dense_inp.swapaxes(1, 2).astype(dtypeu)

    # reference LSTM; init() is used for the weight shape only, the values
    # are replaced by a fresh normal draw
    lstm_ref = RefLSTM()
    init_weights = lstm_ref.init(input_size, hidden_size).astype(dtypeu)
    WLSTM = np.random.randn(*init_weights.shape)

    (Hout, _cprev, _hprev, cache) = lstm_ref.forward(inp_bl, WLSTM)

    # random weights in [-1, 1) that define the loss sum(rand_scale * Hout)
    rand_scale = dtypeu(np.random.random(Hout.shape) * 2.0 - 1.0)

    # analytic gradient: the deltas fed into bprop are rand_scale itself
    dX_bl = lstm_ref.backward(rand_scale, cache)[0]

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()
    for idx in range(inp_bl.size):
        base = inp_pert.flat[idx]

        # loss with this element pushed up by epsilon ...
        inp_pert.flat[idx] = base + epsilon
        hout_up = lstm_ref.forward(inp_pert, WLSTM)[0]

        # ... and pushed down by epsilon
        inp_pert.flat[idx] = base - epsilon
        hout_down = lstm_ref.forward(inp_pert, WLSTM)[0]

        loss_up = np.sum(rand_scale * hout_up)
        loss_down = np.sum(rand_scale * hout_down)
        grads_est.flat[idx] = 0.5 * (loss_up - loss_down) / epsilon

        # restore the element before moving on
        inp_pert.flat[idx] = base

    # estimates must be within the relative threshold of the bprop deltas
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
    return
def mergesum_test_config(be, modfunc, use_stride=1):
    """
    Check a ``modfunc`` module against two hand-built reference pathways.

    The tested module sits behind a Conv trunk in a single Sequential.  The
    reference consists of two Sequentials, each with a copy of the trunk:
    fprop outputs are summed and clamped at zero, bprop results are summed.

    Arguments:
        be: backend used to create arrays and evaluate the reference ops.
        modfunc: factory called as ``modfunc(16, use_stride)``; its result is
            concatenated to a list of layers.
        use_stride (int, optional): forwarded to ``modfunc`` and to
            ``module_factory_copy``.

    Note:
        ``batch_size`` is not a parameter; it is read from the enclosing
        module scope.
    """
    l1 = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([l1] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)

    neon_seq.allocate()
    neon_seq.allocate_deltas()
    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    l11 = Conv(**conv_params(3, 16))
    l12 = Conv(**conv_params(3, 16))

    # Copy the tested trunk's parameters into both reference trunks.
    for ref_trunk in (l11, l12):
        for lcopy, lref in zip(ref_trunk, l1):
            if lcopy.has_params:
                lcopy.set_params(lref.get_params_serialize())

    path1 = Sequential([l11] + p1)
    path2 = Sequential([l12] + p2)
    for ref_path in (path1, path2):
        ref_path.configure(inshape)
        ref_path.allocate()
        ref_path.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("Fprop matching")
    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)

    # Only the last two layers of the tested sequence are back-propagated.
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # Reference error: mask the incoming error where the reference output
    # is not positive.
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    # The trunk layers are skipped on the reference paths.
    pstart = len(l1)
    eb1 = err
    for layer in reversed(path1.layers[pstart:]):
        eb1 = layer.bprop(eb1)

    eb2 = err
    for layer in reversed(path2.layers[pstart:]):
        eb2 = layer.bprop(eb2)

    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
Beispiel #12
0
def gradient_check_ref(seq_len,
                       input_size,
                       hidden_size,
                       batch_size,
                       epsilon=1.0e-5,
                       dtypeu=np.float64,
                       threshold=1e-4):
    """
    Finite-difference self-check of the reference LSTM's input gradient.

    With the loss defined as ``sum(rand_scale * Hout)``, the dX returned by
    ``lstm_ref.backward`` is compared against central-difference estimates
    obtained by perturbing each input element by +/- ``epsilon``.  The
    comparison uses ``threshold`` as relative tolerance and no absolute
    tolerance.
    """
    NervanaObject.be.bsz = batch_size
    NervanaObject.be.batch_size = batch_size

    input_shape = (seq_len, input_size, batch_size)
    # hidden_shape = (seq_len, hidden_size, batch_size)

    # The sparse_rand output is discarded: the input used below is the dense
    # normal sample drawn right after it.
    (_discarded_inp, _discarded_inds) = sparse_rand(
        input_shape, frac=1.0 / input_shape[1])
    raw_input = np.random.randn(*input_shape)

    # neon layout -> reference-code layout
    inp_bl = raw_input.swapaxes(1, 2).astype(dtypeu)

    lstm_ref = RefLSTM()
    # only the shape of the initialised weights is kept; values are redrawn
    weight_shape = lstm_ref.init(input_size, hidden_size).astype(dtypeu).shape
    WLSTM = np.random.randn(*weight_shape)

    forward_out = lstm_ref.forward(inp_bl, WLSTM)
    (Hout, _c_last, _h_last, cache) = forward_out

    # scale Hout by random matrix; the loss would be sum(rand_scale * Hout)
    rand_scale = np.random.random(Hout.shape) * 2.0 - 1.0
    rand_scale = dtypeu(rand_scale)

    # run bprop with rand_scale as the incoming deltas
    (dX_bl, _dW, _dc0, _dh0) = lstm_ref.backward(rand_scale, cache)

    def _loss_at(value, position):
        # Set one input element, run the reference fprop and return the loss.
        inp_pert.flat[position] = value
        hout = lstm_ref.forward(inp_pert, WLSTM)[0]
        return np.sum(rand_scale * hout)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inp_bl.size):
        original = inp_pert.flat[pert_ind]

        loss_pos = _loss_at(original + epsilon, pert_ind)
        loss_neg = _loss_at(original - epsilon, pert_ind)

        grads_est.flat[pert_ind] = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[pert_ind] = original

    # gradient estimates must lie within the relative threshold of the
    # deltas computed by bprop
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
    return
def test_pool_layer(poolargs, device_id):
    """
    Compare pooling fprop/bprop between NervanaGPU, NervanaCPU and numpy.

    The GPU result is checked against the CPU backend (atol 1e-4) and the
    CPU backend against the numpy reference (atol 1e-5).

    Arguments:
        poolargs: sequence whose first element is the pooling op
            (the code branches on ``op == "max"``).
        device_id: passed to the NervanaGPU constructor.
    """
    op = poolargs[0]

    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding_j, padding_d, padding_h, padding_w = 0, 0, 0, 0
    strides_j, strides_d, strides_h, strides_w = 2, 1, 2, 2

    pool_ng = ng.pool_layer(
        dtype,
        op,
        N,
        C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    pool_nc = nc.pool_layer(
        dtype,
        op,
        N,
        C, D, H, W,
        J, T, R, S,
        padding_j, padding_d, padding_h, padding_w,
        strides_j, strides_d, strides_h, strides_w)

    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors
    # (the input is rounded through float16 before being cast to dtype)
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1)).astype(
        np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # zero pad the last row of cpu input for the sake of numpy
    # (for max pooling the pad value is the smallest finite dtype value)
    if op == "max":
        cpuI[-1, :] = np.finfo(dtype).min
    else:
        cpuI[-1, :] = 0

    # =========GPU and CPU and numpy ==========
    # the backends get the input without the padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    for opA, ngA, ncA, cpuA in (
            ("fprop", ngO, ncO, cpuO),
            ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI))):

        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print(opA)
        assert allclose_with_out(ngA.get(), ncA.get(), rtol=0, atol=1e-4)
        assert allclose_with_out(ncA.get(), cpuA, rtol=0, atol=1e-5)

    del ng, nc
Beispiel #14
0
def check_gru(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon GRU layer with the numpy reference GRU.

    Both implementations run on the same random input and deltas with shared
    weights.  Hidden states, the r/z/hcan deltas and all weight and bias
    gradients are asserted to agree within an absolute tolerance of 1e-5.

    Arguments:
        seq_len, input_size, hidden_size, batch_size: problem dimensions.
        init_func: initializer for the model params.
        inp_moms: [offset, scale] applied to the uniform random input,
            i.e. ``inp = rand * inp_moms[1] + inp_moms[0]``.  Only read,
            never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0]
    inpa = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.allocate()
    gru.fprop(inpa)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    # Row indices of the r, z and c sections in the stacked neon buffers.
    # Materialised as lists so they are explicit fancy indices on Python 3
    # too (on Python 2 range() already returns a list).
    r_range = list(range(hidden_size))
    z_range = list(range(hidden_size, hidden_size * 2))
    c_range = list(range(hidden_size * 2, hidden_size * 3))

    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    (dWGRU_ref, h_ref_list, dh_ref_list,
        dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                 deltas_ref)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('====Verifying hidden states====')
    hidden_ok = allclose_with_out(gru.h_buffer.get(),
                                  h_ref_list,
                                  rtol=0.0,
                                  atol=1.0e-5)
    print(hidden_ok)
    # The result used to be printed only, so 'fprop is verified' appeared
    # even on a mismatch; it is now enforced like the bprop checks below.
    assert hidden_ok

    print('fprop is verified')

    # now test the bprop
    print('Making sure neon GRU match numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    # Each entry is (message printed before the check, neon value, ref value);
    # the checks run in the same order and print the same lines as before.
    for banner, delta_neon, delta_ref in (
            ('====Verifying r deltas ====', dr_neon, dr_ref_list),
            ('====Verifying z deltas ====', dz_neon, dz_ref_list),
            ('====Verifying hcan deltas ====', dc_neon, dc_ref_list)):
        print(banner)
        assert allclose_with_out(delta_neon,
                                 delta_ref,
                                 rtol=0.0,
                                 atol=1.0e-5)

    print('====Verifying update on W_input====')
    for label, grad_neon, grad_ref in (
            ('dWxr', dWxr_neon, dWxr_ref),
            ('dWxz', dWxz_neon, dWxz_ref),
            ('dWxc', dWxc_neon, dWxc_ref)):
        print(label)
        assert allclose_with_out(grad_neon,
                                 grad_ref,
                                 rtol=0.0,
                                 atol=1.0e-5)

    print('====Verifying update on W_recur====')
    for label, grad_neon, grad_ref in (
            ('dWrr', dWrr_neon, dWrr_ref),
            ('dWrz', dWrz_neon, dWrz_ref),
            ('dWrc', dWrc_neon, dWrc_ref)):
        print(label)
        assert allclose_with_out(grad_neon,
                                 grad_ref,
                                 rtol=0.0,
                                 atol=1.0e-5)

    print('====Verifying update on bias====')
    for label, grad_neon, grad_ref in (
            ('dbr', dbr_neon, dbr_ref),
            ('dbz', dbz_neon, dbz_ref),
            ('dbc', dbc_neon, dbc_ref)):
        print(label)
        assert allclose_with_out(grad_neon,
                                 grad_ref,
                                 rtol=0.0,
                                 atol=1.0e-5)

    print('bprop is verified')
Beispiel #15
0
def check_gru(seq_len, input_size, hidden_size,
              batch_size, init_func, inp_moms=(0.0, 1.0)):
    """
    Compare the neon GRU layer against the numpy reference GRU.

    Both implementations are given the same weights, input and deltas.
    The hidden states from fprop, the r/z/hcan gate deltas and the
    gradients of W_input, W_recur and the bias from bprop must all agree
    to an absolute tolerance of 1e-5.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of hidden units
        batch_size: batch size; also written to the backend
        init_func: initializer for the model params
        inp_moms: (offset, scale) pair; the input is generated as
                  ``np.random.rand(...) * inp_moms[1] + inp_moms[0]``
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size,
              init_func,
              activation=Tanh(),
              gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()
    gru.set_deltas([gru.be.iobuf(gru.in_shape)])
    gru.fprop(inpa)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # the neon parameters stack the r, z and c gates along the first axis
    r_range = range(hidden_size)
    z_range = range(hidden_size, hidden_size * 2)
    c_range = range(hidden_size * 2, hidden_size * 3)

    # fetch each neon parameter once, then make the ref weights and
    # biases the same as the neon model gate by gate
    b_neon = gru.b.get()
    W_input_neon = gru.W_input.get()
    W_recur_neon = gru.W_recur.get()

    WGRU[gru_ref.weights_ind_br][:] = b_neon[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = b_neon[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = b_neon[c_range]

    WGRU[gru_ref.weights_ind_Wxr][:] = W_input_neon[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = W_input_neon[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = W_input_neon[c_range]

    WGRU[gru_ref.weights_ind_Rhr][:] = W_recur_neon[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = W_recur_neon[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = W_recur_neon[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    (dWGRU_ref, h_ref_list, dh_ref_list,
        dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref,
                                                                 deltas_ref)

    print('====Verifying hidden states====')
    # the comparison result is still printed, but is now also enforced so
    # that an fprop mismatch fails the check instead of passing silently
    fprop_ok = allclose_with_out(gru.outputs.get(),
                                 h_ref_list,
                                 rtol=0.0,
                                 atol=1.0e-5)
    print(fprop_ok)
    assert fprop_ok, 'hidden states differ between neon and reference GRU'

    print('fprop is verified')

    # now test the bprop
    print('Making sure neon GRU match numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    drzc_neon = gru.rzhcan_delta_buffer.get()

    # each section is (header, [(label or None, neon value, ref value)])
    sections = [
        ('====Verifying r deltas ====',
         [(None, drzc_neon[r_range], dr_ref_list)]),
        ('====Verifying z deltas ====',
         [(None, drzc_neon[z_range], dz_ref_list)]),
        ('====Verifying hcan deltas ====',
         [(None, drzc_neon[c_range], dc_ref_list)]),
        ('====Verifying update on W_input====',
         [('dWxr', dWinput_neon[r_range], dWGRU_ref[gru_ref.dW_ind_Wxr]),
          ('dWxz', dWinput_neon[z_range], dWGRU_ref[gru_ref.dW_ind_Wxz]),
          ('dWxc', dWinput_neon[c_range], dWGRU_ref[gru_ref.dW_ind_Wxc])]),
        ('====Verifying update on W_recur====',
         [('dWrr', dWrecur_neon[r_range], dWGRU_ref[gru_ref.dW_ind_Rhr]),
          ('dWrz', dWrecur_neon[z_range], dWGRU_ref[gru_ref.dW_ind_Rhz]),
          ('dWrc', dWrecur_neon[c_range], dWGRU_ref[gru_ref.dW_ind_Rhc])]),
        ('====Verifying update on bias====',
         [('dbr', db_neon[r_range], dWGRU_ref[gru_ref.dW_ind_br]),
          ('dbz', db_neon[z_range], dWGRU_ref[gru_ref.dW_ind_bz]),
          ('dbc', db_neon[c_range], dWGRU_ref[gru_ref.dW_ind_bc])]),
    ]

    for header, comparisons in sections:
        print(header)
        for label, neon_val, ref_val in comparisons:
            if label is not None:
                print(label)
            assert allclose_with_out(neon_val,
                                     ref_val,
                                     rtol=0.0,
                                     atol=1.0e-5), \
                '%s: neon and reference GRU differ' % (label or header)

    print('bprop is verified')
Beispiel #16
0
def check_rnn(seq_len,
              input_size,
              hidden_size,
              batch_size,
              init_func,
              inp_moms=(0.0, 1.0)):
    """
    Compare the neon Recurrent layer against the numpy reference RNN.

    Both implementations are given the same weights, input and deltas.
    The hidden states from fprop and the gradients of W_input, W_recur
    and the bias from bprop must agree to an absolute tolerance of 1e-5.

    Arguments:
        seq_len: number of time steps
        input_size: number of input features
        hidden_size: number of hidden units
        batch_size: batch size; also written to the backend
        init_func: initializer for the model params
        inp_moms: (offset, scale) pair; the input is generated as
                  ``np.random.rand(...) * inp_moms[1] + inp_moms[0]``
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size,
                                   input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size,
                                         hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list, dh_ref_list,
     d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print('====Verifying hidden states====')
    # the comparison result is still printed, but is now also enforced so
    # that an fprop mismatch fails the check instead of passing silently
    fprop_ok = allclose_with_out(rnn.h_buffer.get(),
                                 h_ref_list,
                                 rtol=0.0,
                                 atol=1.0e-5)
    print(fprop_ok)
    assert fprop_ok, 'hidden states differ between neon and reference RNN'
    print('fprop is verified')

    # each section is (header, [(label, neon value, ref value)])
    sections = [
        ('====Verifying update on W and b ====',
         [('dWxh', dWxh_neon, dWxh_ref),
          ('dWhh', dWhh_neon, dWhh_ref)]),
        ('====Verifying update on bias====',
         [('db', db_neon, db_ref)]),
    ]

    for header, comparisons in sections:
        print(header)
        for label, neon_val, ref_val in comparisons:
            print(label)
            assert allclose_with_out(neon_val, ref_val,
                                     rtol=0.0, atol=1.0e-5), \
                '%s: neon and reference RNN differ' % label

    print('bprop is verified')
def test_conv_layer(fargs_tests, device_id):
    """
    Check the GPU and CPU backend convolutions against a numpy reference.

    The same input, filter and error arrays are run through
    run_backend_conv on a NervanaGPU and a NervanaCPU backend, and the
    fprop, bprop and update results of each are compared with a direct
    numpy computation.

    Arguments:
        fargs_tests: indexable of five triples, unpacked in order as
                     (N, C, K), (D, H, W), (T, R, S), the d/h/w padding
                     and the d/h/w strides
        device_id: forwarded to NervanaGPU as ``device_id``
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # both backends build their layer from one shared argument tuple
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)
    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # the two backends must agree on the derived dimensions
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # =======GPU and CPU==========
    # the backends receive the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    gpu_elapsed = default_timer() - start_gpu

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    cpu_elapsed = default_timer() - start_cpu

    print("gputime: %s, cputime %s" % (gpu_elapsed, cpu_elapsed))

    # ======numpy===========
    # cpu output arrays
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    # visit every output position, with m outermost and q innermost
    for m, p, q in np.ndindex(M, P, Q):
        idx = pixel_indices(conv_nc,
                            m * str_d - pad_d,
                            p * str_h - pad_h,
                            q * str_w - pad_w)
        patch = cpuI[idx, :]
        err_here = cpuE[:, m, p, q, :]

        cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
        cpuB[idx, :] += np.dot(cpuF, err_here)
        cpuU += np.dot(patch, err_here.T)

    comparisons = (
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S),
    )
    for op, ngA, ncA, cpuA, w in comparisons:
        print(op)
        # largest absolute error of each backend relative to the
        # largest magnitude in the numpy reference
        ref_peak = abs(cpuA).max()
        cpu_err = abs(cpuA - ncA.get().astype(np.float32)).max()
        gpu_err = abs(cpuA - ngA.get().astype(np.float32)).max()

        assert cpu_err / ref_peak < 1e-5
        assert gpu_err / ref_peak < 1e-5
        assert allclose_with_out(ncA.get(), cpuA, rtol=0, atol=1e-4)
        assert allclose_with_out(ngA.get(), cpuA, rtol=0, atol=1e-3)

    del ng, nc
def test_branch_model(backend_gpu):
    """
    Check a branching Sequential model against hand-evaluated pathways.

    The model under test is main_branch() + inception(...) + top_branch().
    The reference runs a second main trunk carrying the same weights,
    feeds its output through each bare inception branch, stacks the branch
    outputs along the first axis and runs the shared top layers. Forward
    outputs and the deltas summed at the root of the branches must match.

    Arguments:
        backend_gpu: not referenced in the body (presumably a fixture
                     that installs the backend on NervanaObject.be)
    """
    np.random.seed(0)
    be = NervanaObject.be
    # size the input from the very value written to the backend, so the
    # array width cannot drift from the backend batch size
    batch_size = be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32, ), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32, ), (32, 32), ('max', 16)])

    # configure each branch as a continuation of the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference
    # NOTE(review): assumes the trunk is layers[:8] and the merge layer
    # is layers[8] -- confirm against main_branch()
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # back-propagate the per-branch error views through the bare branches
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()

    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model(backend_gpu):
    """
    Check a branching Sequential model against hand-evaluated pathways.

    The model under test is main_branch() + inception(...) + top_branch().
    The reference runs a second main trunk carrying the same weights,
    feeds its output through each bare inception branch, stacks the branch
    outputs along the first axis and runs the shared top layers. Forward
    outputs and the deltas summed at the root of the branches must match.

    Arguments:
        backend_gpu: not referenced in the body (presumably a fixture
                     that installs the backend on NervanaObject.be)
    """
    np.random.seed(0)
    be = NervanaObject.be
    # size the input from the very value written to the backend, so the
    # array width cannot drift from the backend batch size
    batch_size = be.bsz = 64
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure each branch as a continuation of the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference
    # NOTE(review): assumes the trunk is layers[:8] and the merge layer
    # is layers[8] -- confirm against main_branch()
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # back-propagate the per-branch error views through the bare branches
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()

    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model_fork(backend_gpu):
    """
    Check a two-output Tree model against hand-evaluated pathways.

    The first path is main_branch() + [BranchNode, inception] +
    top_branch(); the second path starts at the same BranchNode and runs
    another top_branch(), weighted by alpha2 in the Tree. The reference
    evaluates a second main trunk with the same weights, the bare
    inception branches and a copy of the second path, then compares both
    forward outputs and the deltas at the branch node and below it.

    Arguments:
        backend_gpu: not referenced in the body (presumably a fixture
                     that installs the backend on NervanaObject.be)
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    # size the input from the very value written to the backend, so the
    # array width cannot drift from the backend batch size
    batch_size = be.bsz = 64
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every pathway as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference
    # NOTE(review): assumes the trunk is the first 8 layers of path 0 --
    # confirm against main_branch()
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # second path: skip its leading branch node when pairing up layers
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    print("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # reference bprop through the first path's top layers
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    # reference bprop through the second path
    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # NOTE(review): layers[-5] is taken to be the merge layer of path 0
    # -- confirm against top_branch()
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    # continue the reference bprop down through the trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas, rtol=0)
def test_branch_model_fork(backend_gpu):
    """
    Check a two-output Tree model against hand-evaluated pathways.

    The first path is main_branch() + [BranchNode, inception] +
    top_branch(); the second path starts at the same BranchNode and runs
    another top_branch(), weighted by alpha2 in the Tree. The reference
    evaluates a second main trunk with the same weights, the bare
    inception branches and a copy of the second path, then compares both
    forward outputs and the deltas at the branch node and below it.

    Arguments:
        backend_gpu: not referenced in the body (presumably a fixture
                     that installs the backend on NervanaObject.be)
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    # size the input from the very value written to the backend, so the
    # array width cannot drift from the backend batch size
    batch_size = be.bsz = 64
    bnode = BranchNode()
    i1 = inception([(32, ), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])

    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32, ), (32, 32), ('max', 16)])

    # configure every pathway as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the model under test into the reference
    # NOTE(review): assumes the trunk is the first 8 layers of path 0 --
    # confirm against main_branch()
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # second path: skip its leading branch node when pairing up layers
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    print("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    # reference bprop through the first path's top layers
    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    # reference bprop through the second path
    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # NOTE(review): layers[-5] is taken to be the merge layer of path 0
    # -- confirm against top_branch()
    for bb, errb in zip((b1, b2, b3),
                        neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    # continue the reference bprop down through the trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas,
                             bottom_neon_ref_deltas,
                             rtol=0)
Beispiel #22
0
def test_conv_rand(backend_default, rand_convargs):
    """
    Compare the neon Convolution layer with ConvLayerRef on random data.

    fprop outputs, bprop deltas and weight updates are compared with a
    relative tolerance of 1e-4 and an absolute tolerance estimated from
    the reference itself: four times the largest difference between two
    reference runs, the second one made with ``permute=True``.

    Arguments:
        backend_default: not referenced in the body (presumably a
                         fixture that installs the backend)
        rand_convargs: 9-tuple unpacked as (indim, nifm, fshape, nofm,
                       batch_size, stride, rng_max, w_rng, pad)
    """
    (indim, nifm, fshape, nofm, batch_size,
     stride, rng_max, w_rng, pad) = rand_convargs
    NervanaObject.be.bsz = batch_size
    # inputs and errors are drawn uniformly from [rng_lo, rng_hi)
    rng_lo, rng_hi = 0.0, rng_max
    dtypeu = np.float32
    init_unif = Uniform(low=w_rng[0], high=w_rng[1])

    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    # generate neon conv layer
    conv_kwargs = dict(fshape=(fshape, fshape, nofm),
                       strides=stride,
                       padding=pad,
                       init=init_unif)
    neon_layer = Convolution(**conv_kwargs)

    # generate the reference layer
    ref_layer = ConvLayerRef(1, batch_size, identity,
                             inshape[0], inshape[1:3], (fshape, fshape),
                             nofm, stride, dtypeu,
                             padding=pad)

    # setup input in range [rng_lo, rng_hi)
    inpa = np.random.random((insize, batch_size))
    inpa *= rng_hi - rng_lo
    inpa += rng_lo
    inpa = inpa.astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # run fprop on neon
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # pull neon weights into ref layer weights
    ref_layer.weights = neon_layer.W.get().T
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.y)

    # a second reference fprop with permuted operation order gives an
    # empirical estimate of the numerical precision
    ref_layer.fprop(inpa.T, permute=True)
    fprop_spread = np.max(np.abs(ref_out - ref_layer.y))

    # compare ref and neon layer fprop outputs
    assert allclose_with_out(ref_out.T, neon_out,
                             atol=4 * fprop_spread, rtol=1.e-4)

    # generate random deltas array
    erra = np.random.random(neon_out.shape)
    erra *= (rng_hi - rng_lo)
    erra += rng_lo
    erra = erra.astype(dtypeu)
    err = neon_layer.be.array(erra)

    # run neon bprop
    neon_deltas = neon_layer.bprop(err).get()
    neon_dW = neon_layer.dW.get()

    # run ref code bprop
    ref_layer.bprop(erra.T, 1.0)
    ref_deltas = np.copy(ref_layer.berror_nopad.T)
    ref_dW = np.copy(ref_layer.updates)

    # same precision estimate for bprop: rerun with permuted op order
    ref_layer.bprop(erra.T, 1.0, permute=True)
    deltas_spread = np.max(np.abs(ref_deltas - ref_layer.berror_nopad.T))
    dW_spread = np.max(np.abs(ref_dW - ref_layer.updates))

    assert allclose_with_out(ref_deltas, neon_deltas,
                             atol=4 * deltas_spread, rtol=1.e-4)
    assert allclose_with_out(ref_dW.T, neon_dW,
                             atol=4 * dW_spread, rtol=1.e-4)
def test_bibn(backend_default, fargs):
    """
    Check the batch-norm helpers of BiBNRNN against the backend kernels.

    ``_fprop_bn`` is compared with ``be.compound_fprop_bn`` and
    ``_bprop_bn`` with ``be.compound_bprop_bn``, both on random data and
    with an absolute tolerance of 1e-5.

    Arguments:
        backend_default: not referenced in the body (presumably a
                         fixture that installs the backend)
        fargs: 4-tuple unpacked as (seq_len, input_size, hidden_size,
               batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size

    # setup the bi-directional rnn
    init_glorot = GlorotUniform()
    birnn = BiBNRNN(hidden_size, activation=Logistic(), init=init_glorot)
    birnn.configure(in_shape)
    birnn.prev_layer = True
    birnn.allocate()
    birnn.set_deltas([birnn.be.iobuf(birnn.in_shape)])

    be = birnn.be
    tolerance = dict(rtol=0.0, atol=1.0e-5)

    # ---- fprop ----
    # fill the feed-forward buffer with random values
    inp_np = np.random.random(birnn.h_ff_buffer.shape)
    inp_be = be.array(inp_np)
    birnn.h_ff_buffer[:] = inp_np

    # scratch tensors for the direct call to the backend batch norm
    xsum = be.zeros_like(birnn.xmean)
    xvar = be.zeros_like(birnn.xvar)
    gmean = be.zeros_like(birnn.gmean)
    gvar = be.zeros_like(birnn.gvar)
    gamma = be.ones(birnn.gamma.shape)
    beta = be.zeros_like(birnn.beta)
    grad_gamma = be.zeros_like(gamma)
    grad_beta = be.zeros_like(beta)
    out_ref = be.zeros_like(birnn.h_ff_buffer)

    xsum[:] = be.sum(birnn.h_ff_buffer, axis=1)
    be.compound_fprop_bn(birnn.h_ff_buffer, xsum, xvar, gmean, gvar,
                         gamma, beta, out_ref, birnn.eps, birnn.rho,
                         accumbeta=0, relu=False)

    # the layer's own batch norm must reproduce the backend result
    out_bn = birnn._fprop_bn(birnn.h_ff_buffer, inference=False)
    assert allclose_with_out(out_bn.get(), out_ref.get(), **tolerance)

    # ---- bprop ----
    err_np = np.random.random(birnn.h_ff_buffer.shape)
    err_be = be.array(err_np)

    err_out_ref = be.empty_like(err_be)
    be.compound_bprop_bn(err_out_ref, grad_gamma, grad_beta, err_be,
                         inp_be, xsum, xvar, gamma, birnn.eps)

    err_out_bn = birnn._bprop_bn(err_be, out_bn)
    assert allclose_with_out(err_out_bn.get(), err_out_ref.get(),
                             **tolerance)
def test_conv_rand(backend_default, rand_convargs):
    """
    Check the neon Convolution layer against ConvLayerRef on random data.

    Forward outputs, back-propagated deltas and weight updates are compared.
    The absolute tolerance of each comparison is 4x the largest difference
    observed when the reference computation is repeated with permute=True.

    Arguments:
        backend_default: not referenced in the body (presumably a fixture
                         that sets up the backend -- TODO confirm).
        rand_convargs: 9-tuple of (indim, nifm, fshape, nofm, batch_size,
                       stride, rng_max, w_rng, pad).
    """
    (indim, nifm, fshape, nofm, batch_size,
     stride, rng_max, w_rng, pad) = rand_convargs
    NervanaObject.be.bsz = batch_size

    dtypeu = np.float32
    lo, hi = 0.0, rng_max

    def rand_in_range(shape):
        # uniform samples rescaled to [lo, hi), stored as dtypeu
        sample = np.random.random(shape)
        return (sample * (hi - lo) + lo).astype(dtypeu)

    def perm_atol(first, second):
        # tolerance estimate: 4x the largest gap between two op orderings
        return 4 * np.max(np.abs(first - second))

    weight_init = Uniform(low=w_rng[0], high=w_rng[1])

    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    # layer under test
    neon_layer = Convolution(fshape=(fshape, fshape, nofm),
                             strides=stride,
                             padding=pad,
                             init=weight_init)

    # reference implementation with the same geometry
    ref_layer = ConvLayerRef(1,
                             batch_size,
                             identity,
                             nifm,
                             (indim, indim), (fshape, fshape),
                             nofm,
                             stride,
                             dtypeu,
                             padding=pad)

    # random input, one column per batch item
    inpa = rand_in_range((insize, batch_size))
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # forward pass through the neon layer
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # the reference layer reuses the neon weights (transposed)
    ref_layer.weights = neon_layer.W.get().T
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.y)

    # rerun the reference fprop with permuted op order to gauge precision
    ref_layer.fprop(inpa.T, permute=True)
    fprop_atol = perm_atol(ref_out, ref_layer.y)

    assert allclose_with_out(ref_out.T, neon_out, atol=fprop_atol, rtol=1.e-4)

    # random error signal with the same shape as the neon output
    erra = rand_in_range(neon_out.shape)
    err = neon_layer.be.array(erra)

    # backward pass through the neon layer
    neon_deltas = neon_layer.bprop(err).get()
    neon_dW = neon_layer.dW.get()

    # backward pass through the reference; copy the results because the
    # permuted rerun below reads the same attributes again
    ref_layer.bprop(erra.T, 1.0)
    ref_deltas = np.copy(ref_layer.berror_nopad.T)
    ref_dW = np.copy(ref_layer.updates)

    # permuted rerun of the reference bprop to gauge precision
    ref_layer.bprop(erra.T, 1.0, permute=True)
    ref_deltas_perm = ref_layer.berror_nopad.T
    ref_dW_perm = ref_layer.updates

    deltas_atol = perm_atol(ref_deltas, ref_deltas_perm)
    assert allclose_with_out(ref_deltas, neon_deltas,
                             atol=deltas_atol, rtol=1.e-4)

    dW_atol = perm_atol(ref_dW, ref_dW_perm)
    assert allclose_with_out(ref_dW.T, neon_dW, atol=dW_atol, rtol=1.e-4)
def test_conv_layer(fargs_tests, device_id):
    """
    Cross-check conv fprop, bprop and update between the NervanaGPU backend,
    the NervanaCPU backend and a direct numpy computation.

    Both backends build a conv layer from the same arguments and run on the
    same random inputs; the numpy result is the expected value for both.
    The wall-clock time of each backend run is printed.

    Arguments:
        fargs_tests: sequence whose first five entries are the triples
                     (N, C, K), (D, H, W), (T, R, S), padding (d, h, w)
                     and strides (d, h, w).
        device_id: passed to NervanaGPU as device_id.
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # identical positional arguments for both backends
    layer_args = (dtype,
                  N, C, K,
                  D, H, W,
                  T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)

    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # both backends must agree on the layer geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # host-side inputs, filters and errors
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # the last row of the numpy input is kept at zero; it is dropped again
    # before the data is handed to the backends
    cpuI[-1, :] = 0.0

    # ---- GPU and CPU backends ----
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    start_gpu = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    end_gpu = default_timer()

    start_cpu = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    end_cpu = default_timer()

    print("gputime: %s, cputime %s" %
          (end_gpu - start_gpu, end_cpu - start_cpu))

    # ---- numpy computation ----
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    M, P, Q = conv_nc.MPQ
    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        mt = m * str_d - pad_d
        for p in range(P):
            pr = p * str_h - pad_h
            for q in range(Q):
                qs = q * str_w - pad_w

                idx = pixel_indices(conv_nc, mt, pr, qs)
                in_rows = cpuI[idx, :]
                err_px = cpuE[:, m, p, q, :]

                # output pixel, input gradient and filter gradient
                cpuO[:, m, p, q, :] = np.dot(cpuF.T, in_rows)
                cpuB[idx, :] += np.dot(cpuF, err_px)
                cpuU += np.dot(in_rows, err_px.T)

    # (label, GPU result, CPU result, numpy expectation)
    comparisons = (
        ("fprop", ngO, ncO, cpuO),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI)),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF)),
    )

    for op, gpu_res, cpu_res, expected in comparisons:
        print(op)
        cpu_host = cpu_res.get().astype(np.float32)
        gpu_host = gpu_res.get().astype(np.float32)

        # largest deviation relative to the largest expected magnitude
        scale = abs(expected).max()
        cpu_ratio = abs(expected - cpu_host).max() / scale
        gpu_ratio = abs(expected - gpu_host).max() / scale

        assert cpu_ratio < 1e-5
        assert gpu_ratio < 1e-5
        assert allclose_with_out(cpu_res.get(), expected, rtol=0, atol=1e-4)
        assert allclose_with_out(gpu_res.get(), expected, rtol=0, atol=1e-3)

    del ng
    del nc