def tensors_allclose(a_tensors, b_tensors, rtol=0, atol=1e-7):
    """
    Compare two tensors, or two lists of tensors, pairwise for closeness.

    Any element that is a backend ``Tensor`` is unwrapped with ``get()``
    first. Each ``a`` element is then cast to the dtype of its ``b``
    counterpart and the pair is handed to ``allclose_with_out``.

    Arguments:
        a_tensors: list of tensors, or a tensor
        b_tensors: (another) list of tensors, or a tensor
        rtol (float, optional): Relative tolerance.
        atol (float, optional): Absolute tolerance.

    Returns:
        bool: True if every pair is all close. False if any pair differs,
              or if both arguments are lists of different lengths.
    """
    a_is_list = isinstance(a_tensors, list)
    b_is_list = isinstance(b_tensors, list)

    if not a_is_list and not b_is_list:
        # deal with individual tensor
        a_tensors = [a_tensors]
        b_tensors = [b_tensors]
    elif a_is_list and b_is_list and len(a_tensors) != len(b_tensors):
        # zip() stops at the shorter list, so without this guard the
        # unmatched tail would never be compared and the call could
        # wrongly report success.
        return False

    results = []
    for a_tensor, b_tensor in zip(a_tensors, b_tensors):
        if isinstance(a_tensor, Tensor):
            a_tensor = a_tensor.get()
        if isinstance(b_tensor, Tensor):
            b_tensor = b_tensor.get()
        results.append(allclose_with_out(a_tensor.astype(b_tensor.dtype),
                                         b_tensor,
                                         rtol=rtol,
                                         atol=atol))

    # Every pair is evaluated before reducing (no short-circuit), so
    # allclose_with_out runs for all of them.
    return all(results)
def test_padding(backend_default, poolargs):
    """
    Check Pooling fprop (with padding) against the ref_pooling reference.

    Arguments:
        backend_default: backend fixture (not referenced directly in the body)
        poolargs: tuple of (fshape, nifm, padding, stride, in_sz, batch_size)
    """
    fshape, nifm, padding, stride, in_sz, batch_size = poolargs
    NervanaObject.be.bsz = batch_size

    # basic sanity check with random inputs
    inshape = (nifm, in_sz, in_sz)
    insize = np.prod(inshape)
    neon_layer = Pooling(fshape=fshape, strides=stride, padding=padding)
    inp = neon_layer.be.array(np.random.random((insize, batch_size)))
    inp.lshape = inshape
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])

    out = neon_layer.fprop(inp).get()

    # Floor division: these values are handed to ref_pooling as ncheck and
    # must stay integers even when "/" means true division.
    ncheck = [0, batch_size // 2, batch_size - 1]

    (out_exp, check_inds) = ref_pooling(inp, inp.lshape,
                                        (fshape, fshape),
                                        padding,
                                        (stride, stride),
                                        neon_layer.be,
                                        ncheck=ncheck)

    # reshape the flat layer output to the reference layout plus a batch axis
    out_shape = list(out_exp.shape[0:3])
    out_shape.append(batch_size)
    outa = out.reshape(out_shape)

    # exact comparison: both tolerances are zero
    assert allclose_with_out(out_exp, outa[:, :, :, check_inds], atol=0.0, rtol=0.0)
def test_padding(backend_default, poolargs):
    """
    Run a padded Pooling layer forward and compare it with ref_pooling.

    Arguments:
        backend_default: backend fixture (not referenced directly in the body)
        poolargs: tuple of (fshape, nifm, padding, stride, in_sz, batch_size)
    """
    fshape, nifm, padding, stride, in_sz, batch_size = poolargs
    NervanaObject.be.bsz = batch_size

    # basic sanity check with random inputs
    inshape = (nifm, in_sz, in_sz)
    insize = np.prod(inshape)
    neon_layer = Pooling(fshape=fshape, strides=stride, padding=padding)
    inp = neon_layer.be.array(np.random.random((insize, batch_size)))
    inp.lshape = inshape
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])

    out = neon_layer.fprop(inp).get()

    # "//" rather than "/": the middle entry must remain an integer under
    # true division (Python 3, or "from __future__ import division").
    ncheck = [0, batch_size // 2, batch_size - 1]

    (out_exp, check_inds) = ref_pooling(inp, inp.lshape,
                                        (fshape, fshape),
                                        padding,
                                        (stride, stride),
                                        neon_layer.be,
                                        ncheck=ncheck)

    # first three reference dims followed by the batch dimension
    out_shape = list(out_exp.shape[0:3])
    out_shape.append(batch_size)
    outa = out.reshape(out_shape)

    # zero tolerances: the selected outputs must match exactly
    assert allclose_with_out(out_exp, outa[:, :, :, check_inds], atol=0.0, rtol=0.0)
def tensors_allclose(a_tensors, b_tensors, rtol=0, atol=1e-7):
    """
    Report whether two tensors (or two lists of tensors) are pairwise close.

    Elements that are backend ``Tensor`` instances are unwrapped via
    ``get()``; the ``a`` element is cast to the ``b`` element's dtype and
    the pair is compared with ``allclose_with_out``.

    Arguments:
        a_tensors: list of tensors, or a tensor
        b_tensors: (another) list of tensors, or a tensor
        rtol (float, optional): Relative tolerance.
        atol (float, optional): Absolute tolerance.

    Returns:
        bool: True only if all pairs are close. Two lists of different
              lengths are never considered close.
    """
    both_lists = isinstance(a_tensors, list) and isinstance(b_tensors, list)
    neither_list = not isinstance(a_tensors, list) and not isinstance(b_tensors, list)

    if neither_list:
        # deal with individual tensor
        a_tensors, b_tensors = [a_tensors], [b_tensors]
    elif both_lists and len(a_tensors) != len(b_tensors):
        # Without this check zip() would drop the surplus elements of the
        # longer list and the mismatch would go unnoticed.
        return False

    results = []
    for a_tensor, b_tensor in zip(a_tensors, b_tensors):
        if isinstance(a_tensor, Tensor):
            a_tensor = a_tensor.get()
        if isinstance(b_tensor, Tensor):
            b_tensor = b_tensor.get()
        pair_ok = allclose_with_out(a_tensor.astype(b_tensor.dtype), b_tensor,
                                    rtol=rtol, atol=atol)
        results.append(pair_ok)

    # Reduce only after all pairs were compared, so allclose_with_out is
    # invoked for every pair.
    return all(results)
def test_biSum(backend_default, fargs):
    """
    Check BiSum fprop and bprop against a direct computation.

    The fprop output is compared with the sum of the first and second half
    of the input rows; each half of the bprop result is compared with the
    deltas that were passed in (here, the fprop output itself).

    Arguments:
        backend_default: backend fixture (not referenced directly in the body)
        fargs: tuple of (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    input_size *= 2
    # Floor division: the split point is used as a slice bound and must be
    # an integer even when "/" means true division.
    half = input_size // 2

    in_shape = (input_size, seq_len)
    NervanaObject.be.bsz = batch_size
    bisum = BiSum()
    bisum.configure(in_shape)
    bisum.prev_layer = True
    bisum.allocate()
    bisum.set_deltas([bisum.be.iobuf(bisum.in_shape)])

    # inputs
    inp_np = np.random.random((input_size, seq_len * batch_size))
    inp_be = bisum.be.array(inp_np)

    # outputs
    out_be = bisum.fprop(inp_be)
    del_be = bisum.bprop(out_be)

    out_ref = bisum.be.empty_like(out_be)
    out_ref[:] = inp_be[:half] + inp_be[half:]

    assert out_be.shape[0] * 2 == inp_be.shape[0]
    assert allclose_with_out(out_be.get(), out_ref.get(), rtol=0.0, atol=1.0e-5)
    assert allclose_with_out(del_be[:half].get(), out_be.get(), rtol=0.0, atol=1.0e-5)
    assert allclose_with_out(del_be[half:].get(), out_be.get(), rtol=0.0, atol=1.0e-5)
def mergesum_test_config(be, modfunc, use_stride=1):
    """
    Compare a merge-sum module against two explicitly built reference paths.

    A trunk Conv followed by the layers from ``modfunc`` is run forward and
    backward. The same computation is rebuilt as two separate Sequential
    paths (from module_factory_copy) whose outputs / deltas are summed by
    hand, and both results are asserted to match.

    Arguments:
        be: backend used to allocate and combine tensors
        modfunc: factory called as modfunc(16, use_stride); its result is
                 concatenated to a list of layers
        use_stride (int, optional): forwarded to modfunc and
                                    module_factory_copy
    """
    def _bprop_through(layers, delta):
        # push delta backwards through the given layers, last layer first
        for layer in reversed(layers):
            delta = layer.bprop(delta)
        return delta

    trunk = Conv(**conv_params(3, 16))
    neon_layer = modfunc(16, use_stride)
    inshape = (16, 32, 32)
    insize = np.prod(inshape)
    # batch_size is read from the enclosing module scope
    inpa = np.random.random((insize, batch_size))

    neon_seq = Sequential([trunk] + neon_layer)
    neon_seq.configure(inshape)
    inp = be.array(inpa)
    neon_seq.allocate()
    neon_seq.allocate_deltas()

    neon_out = neon_seq.fprop(inp).get()

    # Now make the reference pathways:
    p1, p2 = module_factory_copy(neon_layer, modfunc, 16, use_stride)
    trunk_copy1 = Conv(**conv_params(3, 16))
    trunk_copy2 = Conv(**conv_params(3, 16))

    # give both trunk copies the parameters of the original trunk
    for trunk_copy in (trunk_copy1, trunk_copy2):
        for dst_layer, src_layer in zip(trunk_copy, trunk):
            if dst_layer.has_params:
                dst_layer.set_params(src_layer.get_params_serialize())

    path1 = Sequential([trunk_copy1] + p1)
    path2 = Sequential([trunk_copy2] + p2)
    for path in (path1, path2):
        path.configure(inshape)
        path.allocate()
        path.allocate_deltas()

    o1 = path1.fprop(inp)
    o2 = path2.fprop(inp)
    neon_out_ref = be.empty_like(o1)
    neon_out_ref[:] = be.maximum(o1 + o2, 0)

    # need to have bsum false for this test to be valid
    assert allclose_with_out(neon_out_ref.get(), neon_out, rtol=0)
    print("Fprop matching")
    print("Beginning Back prop")

    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    # back-propagate through the last two layers of the merged model
    ebr = neon_seq.layers[-1].bprop(err)
    ebr = neon_seq.layers[-2].bprop(ebr)
    trunk_neon = ebr.get()

    # reference: mask the error where the reference output is not positive
    err = be.array(erra)
    err[:] = be.greater(neon_out_ref, 0) * err

    # skip the trunk layers at the front of each reference path
    pstart = len(trunk)
    eb1 = _bprop_through(path1.layers[pstart:], err)
    eb2 = _bprop_through(path2.layers[pstart:], err)

    err_ref = be.empty_like(eb1)
    err_ref[:] = eb1 + eb2

    assert allclose_with_out(err_ref.get(), trunk_neon, rtol=0)
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon LSTM layer against the numpy reference LSTM (RefLSTM).

    Both fprop buffers (gates, cell states, hidden states) and bprop
    results (weight/bias gradients and output deltas) are asserted to
    match within an absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # (row 0 holds the bias, then the input weights, then the recurrent weights)
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, _cprev, _hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These comparisons are asserted: previously their return values were
    # discarded, so 'fprop is verified' was printed even on a mismatch.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.h_buffer.get(), Hout_ref, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, _dc0_ref, _dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon Recurrent layer against the numpy reference RNN.

    The hidden states produced by fprop and the weight/bias gradients
    produced by bprop are asserted to match the reference within an
    absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     _dh_ref_list, _d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print('====Verifying hidden states====')
    # Asserted rather than printed: a bare print of the result let an fprop
    # mismatch pass while still reporting 'fprop is verified'.
    assert allclose_with_out(rnn.h_buffer.get(), h_ref_list, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    print('====Verifying update on W and b ====')
    print('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    print('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon LSTM layer against the numpy reference LSTM (RefLSTM).

    The layer is configured and allocated explicitly before fprop. Both
    fprop buffers (gates, cell states, outputs) and bprop results
    (weight/bias gradients and output deltas) are asserted to match the
    reference within an absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model
    # (row 0 holds the bias, then the input weights, then the recurrent weights)
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, _cprev, _hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size, hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    # These comparisons are asserted: previously their return values were
    # discarded, so 'fprop is verified' was printed even on a mismatch.
    print('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(), Hout_ref, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, _dc0_ref, _dh0_ref) = lstm_ref.backward(deltas_ref, batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    print('Making sure neon LSTM match numpy LSTM in bprop')
    print('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def gradient_check_ref(seq_len, input_size, hidden_size, batch_size,
                       epsilon=1.0e-5, dtypeu=np.float64, threshold=1e-4):
    """
    Numerically check the input gradients of the reference LSTM itself.

    The loss is sum(rand_scale * Hout). Its gradient with respect to each
    input element is estimated by a central difference of step ``epsilon``
    and compared with the deltas returned by the reference backward pass.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        epsilon (float, optional): perturbation added to / subtracted from
                                   each input element
        dtypeu (dtype, optional): dtype used for the input and the scaling
                                  matrix
        threshold (float, optional): relative tolerance of the comparison
    """
    NervanaObject.be.bsz = batch_size
    NervanaObject.be.batch_size = batch_size

    input_shape = (seq_len, input_size, batch_size)

    # The sparse matrix is superseded by the dense draw on the next line;
    # the call is kept so the sequence of calls matches the original.
    inp_bl, _nz_inds = sparse_rand(input_shape, frac=1.0 / input_size)
    inp_bl = np.random.randn(*input_shape)

    # convert input matrix from neon to ref code format
    inp_bl = inp_bl.swapaxes(1, 2).astype(dtypeu)

    # generate reference LSTM; init() only supplies the weight shape, the
    # values are replaced by a fresh normal draw
    lstm_ref = RefLSTM()
    weight_shape = lstm_ref.init(input_size, hidden_size).astype(dtypeu).shape
    WLSTM = np.random.randn(*weight_shape)

    Hout, _, _, cache = lstm_ref.forward(inp_bl, WLSTM)

    # random matrix in [-1, 1) that turns Hout into a scalar loss
    rand_scale = dtypeu(np.random.random(Hout.shape) * 2.0 - 1.0)

    # run bprop, input deltas is rand_scale
    dX_bl, _, _, _ = lstm_ref.backward(rand_scale, cache)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()

    for flat_idx in range(inp_bl.size):
        original_val = inp_pert.flat[flat_idx]

        # fprop with this one element nudged up, then nudged down
        inp_pert.flat[flat_idx] = original_val + epsilon
        Hout_pos, _, _, _ = lstm_ref.forward(inp_pert, WLSTM)

        inp_pert.flat[flat_idx] = original_val - epsilon
        Hout_neg, _, _, _ = lstm_ref.forward(inp_pert, WLSTM)

        # central-difference estimate of d(loss)/d(input element)
        loss_pos = np.sum(rand_scale * Hout_pos)
        loss_neg = np.sum(rand_scale * Hout_neg)
        grads_est.flat[flat_idx] = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[flat_idx] = original_val

    # assert that gradient estimates within rel threshold of
    # bprop calculated deltas
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
def gradient_check_ref(seq_len, input_size, hidden_size, batch_size,
                       epsilon=1.0e-5, dtypeu=np.float64, threshold=1e-4):
    """
    Finite-difference check of the reference LSTM's input deltas.

    With loss = sum(rand_scale * Hout), every input element is perturbed by
    +/- ``epsilon`` in turn, and the resulting central-difference gradient
    is asserted to agree with the reference backward pass to within the
    relative tolerance ``threshold``.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        epsilon (float, optional): size of the perturbation
        dtypeu (dtype, optional): dtype used for the input and the scaling
                                  matrix
        threshold (float, optional): relative tolerance of the comparison
    """
    NervanaObject.be.bsz = batch_size
    NervanaObject.be.batch_size = batch_size

    input_shape = (seq_len, input_size, batch_size)

    # Result is overwritten right below; the call itself is preserved so
    # that the same calls happen in the same order as before.
    (inp_bl, _unused_inds) = sparse_rand(input_shape, frac=1.0 / input_shape[1])
    dense_input = np.random.randn(*input_shape)

    # convert input matrix from neon to ref code format
    inp_bl = dense_input.swapaxes(1, 2).astype(dtypeu)

    # generate reference LSTM
    lstm_ref = RefLSTM()
    init_weights = lstm_ref.init(input_size, hidden_size).astype(dtypeu)
    # only the shape of the initial weights is used
    WLSTM = np.random.randn(*init_weights.shape)

    forward_out = lstm_ref.forward(inp_bl, WLSTM)
    (Hout, _cprev, _hprev, cache) = forward_out

    # scale Hout by random matrix; sum(rand_scale * Hout) acts as the loss
    rand_scale = np.random.random(Hout.shape) * 2.0 - 1.0
    rand_scale = dtypeu(rand_scale)

    # run bprop, input deltas is rand_scale
    (dX_bl, _dWLSTM_bl, _dc0, _dh0) = lstm_ref.backward(rand_scale, cache)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()

    # inp_pert is restored after every step, so the value saved from the
    # untouched baseline equals the value currently stored in inp_pert
    for pert_ind, save_val in enumerate(inp_bl.flat):
        # add perturbation and run fprop
        inp_pert.flat[pert_ind] = save_val + epsilon
        (Hout_pos, _c, _h, _cache) = lstm_ref.forward(inp_pert, WLSTM)

        # subtract perturbation and run fprop
        inp_pert.flat[pert_ind] = save_val - epsilon
        (Hout_neg, _c, _h, _cache) = lstm_ref.forward(inp_pert, WLSTM)

        # calculate the loss on outputs
        loss_pos = np.sum(rand_scale * Hout_pos)
        loss_neg = np.sum(rand_scale * Hout_neg)
        grads_est.flat[pert_ind] = 0.5 * (loss_pos - loss_neg) / epsilon

        # reset input
        inp_pert.flat[pert_ind] = save_val

    # assert that gradient estimates within rel threshold of
    # bprop calculated deltas
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
def test_pool_layer(poolargs, device_id):
    """
    Compare pooling fprop/bprop across the GPU backend, the CPU backend and
    the numpy reference (run_numpy_pool).

    Arguments:
        poolargs: sequence whose first element is the pooling op name
        device_id: device id passed to NervanaGPU
    """
    op = poolargs[0]
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)
    nc = NervanaCPU()

    N, C = 32, 32
    D, H, W = 1, 32, 32
    J, T, R, S = 2, 1, 3, 3
    padding = (0, 0, 0, 0)  # padding_j, padding_d, padding_h, padding_w
    strides = (2, 1, 2, 2)  # strides_j, strides_d, strides_h, strides_w

    # both backends receive exactly the same positional configuration
    layer_args = (dtype, op, N, C, D, H, W, J, T, R, S) + padding + strides
    pool_ng = ng.pool_layer(*layer_args)
    pool_nc = nc.pool_layer(*layer_args)

    assert pool_ng.dimI == pool_nc.dimI
    assert pool_ng.dimO == pool_nc.dimO

    dimI = pool_ng.dimI
    dimO = pool_ng.dimO

    # generating input arrays for inputs and errors
    # (inputs take a round trip through float16 before the working dtype)
    cpuI = np.random.uniform(0.0, 1.0, sliceable(dimI, 1))
    cpuI = cpuI.astype(np.float16).astype(dtype)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(dtype)

    # zero pad the last row of cpu input for the sake of numpy
    # (for "max" the pad value is the most negative finite value instead)
    cpuI[-1, :] = np.finfo(dtype).min if op == "max" else 0

    # =========GPU and CPU and numpy ==========
    # the backends see the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beE = cpuE

    ngO, ngB = run_backend_pool(ng, pool_ng, beI, beE, dtype)
    ncO, ncB = run_backend_pool(nc, pool_nc, beI, beE, dtype)
    cpuO, cpuB = run_numpy_pool(op, cpuI, cpuE, dtype, pool_ng)

    comparisons = (
        ("fprop", ngO, ncO, cpuO),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI)),
    )
    for stage, gpu_result, cpu_result, numpy_result in comparisons:
        print(stage)
        assert allclose_with_out(gpu_result.get(), cpu_result.get(), rtol=0, atol=1e-4)
        assert allclose_with_out(cpu_result.get(), numpy_result, rtol=0, atol=1e-5)

    del ng, nc
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon GRU layer against the numpy reference GRU (RefGRU).

    The hidden states from fprop, the r/z/hcan gate deltas and the
    weight/bias gradients from bprop are asserted to match the reference
    within an absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.allocate()
    gru.fprop(inpa)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    # (the neon buffers are indexed as three consecutive blocks of
    # hidden_size rows: r, z, then c)
    r_range = range(hidden_size)
    z_range = range(hidden_size, hidden_size * 2)
    c_range = range(hidden_size * 2, hidden_size * 3)
    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]
    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]
    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2)

    (dWGRU_ref, h_ref_list, _dh_ref_list,
     dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref)

    print('====Verifying hidden states====')
    # Asserted rather than printed: a bare print of the result let an fprop
    # mismatch pass while still reporting 'fprop is verified'.
    assert allclose_with_out(gru.h_buffer.get(), h_ref_list, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    print('Making sure neon GRU match numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    print('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_input====')
    print('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    print('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    print('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_recur====')
    print('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    print('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    print('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    print('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    print('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon GRU layer against the numpy reference GRU (RefGRU).

    The layer is configured, allocated and given a delta buffer before
    fprop. The outputs from fprop, the r/z/hcan gate deltas and the
    weight/bias gradients from bprop are asserted to match the reference
    within an absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon GRU
    gru = GRU(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic())

    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = gru.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # run neon fprop
    gru.configure((input_size, seq_len))
    gru.prev_layer = True
    gru.allocate()
    gru.set_deltas([gru.be.iobuf(gru.in_shape)])
    gru.fprop(inpa)

    # reference numpy GRU
    gru_ref = RefGRU(input_size, hidden_size)
    WGRU = gru_ref.weights

    # make ref weights and biases the same with neon model
    # (the neon buffers are indexed as three consecutive blocks of
    # hidden_size rows: r, z, then c)
    r_range = range(hidden_size)
    z_range = range(hidden_size, hidden_size * 2)
    c_range = range(hidden_size * 2, hidden_size * 3)
    WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range]
    WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range]
    WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range]
    WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range]
    WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range]
    WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range]
    WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range]
    WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range]
    WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range]

    # transpose input X and do fprop
    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2)

    (dWGRU_ref, h_ref_list, _dh_ref_list,
     dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref)

    print('====Verifying hidden states====')
    # Asserted rather than printed: a bare print of the result let an fprop
    # mismatch pass while still reporting 'fprop is verified'.
    assert allclose_with_out(gru.outputs.get(), h_ref_list, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    # now test the bprop
    print('Making sure neon GRU match numpy GRU in bprop')
    gru.bprop(gru.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = gru.dW_input.get()
    dWrecur_neon = gru.dW_recur.get()
    db_neon = gru.db.get()
    dWxr_neon = dWinput_neon[r_range]
    dWxz_neon = dWinput_neon[z_range]
    dWxc_neon = dWinput_neon[c_range]
    dWrr_neon = dWrecur_neon[r_range]
    dWrz_neon = dWrecur_neon[z_range]
    dWrc_neon = dWrecur_neon[c_range]
    dbr_neon = db_neon[r_range]
    dbz_neon = db_neon[z_range]
    dbc_neon = db_neon[c_range]

    drzc_neon = gru.rzhcan_delta_buffer.get()
    dr_neon = drzc_neon[r_range]
    dz_neon = drzc_neon[z_range]
    dc_neon = drzc_neon[c_range]

    dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr]
    dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz]
    dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc]
    dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr]
    dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz]
    dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc]
    dbr_ref = dWGRU_ref[gru_ref.dW_ind_br]
    dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz]
    dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc]

    print('====Verifying r deltas ====')
    assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying z deltas ====')
    assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying hcan deltas ====')
    assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_input====')
    print('dWxr')
    assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5)
    print('dWxz')
    assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5)
    print('dWxc')
    assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on W_recur====')
    print('dWrr')
    assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5)
    print('dWrz')
    assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5)
    print('dWrc')
    assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('dbr')
    assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5)
    print('dbz')
    assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5)
    print('dbc')
    assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]):
    """
    Compare the neon Recurrent layer against the numpy reference RNN
    (RefRecurrent).

    The hidden states produced by fprop and the weight/bias gradients
    produced by bprop are asserted to match the reference within an
    absolute tolerance of 1e-5.

    Arguments:
        seq_len (int): number of time steps
        input_size (int): number of input features
        hidden_size (int): number of hidden units
        batch_size (int): batch size, also written to the backend
        init_func: initializer for the model params
        inp_moms (list, optional): [offset, scale] applied to the uniform
            random input (described in the original code as mean and
            std dev). Only read, never modified.
    """
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     _dh_ref_list, _d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    print('====Verifying hidden states====')
    # The comparison result used to be printed only, so a failing fprop
    # check could not fail the test; it is now asserted.
    assert allclose_with_out(rnn.h_buffer.get(), h_ref_list, rtol=0.0, atol=1.0e-5)

    print('fprop is verified')

    print('====Verifying update on W and b ====')
    print('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    print('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    print('====Verifying update on bias====')
    print('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    print('bprop is verified')
def test_conv_layer(fargs_tests, device_id):
    """
    Check the NervanaGPU and NervanaCPU convolution results (fprop, bprop
    and update) against a direct numpy computation.

    Arguments:
        fargs_tests: indexable of five triples, read as
            [0] (N, C, K), [1] (D, H, W), [2] (T, R, S),
            [3] padding (d, h, w) and [4] strides (d, h, w)
        device_id: device id handed to NervanaGPU
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # identical positional arguments for both backends
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)
    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # both backends must agree on the layer geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # ======= GPU and CPU backends =======
    # the backends get the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    tic = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    gpu_elapsed = default_timer() - tic

    tic = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    cpu_elapsed = default_timer() - tic

    print("gputime: %s, cputime %s" % (gpu_elapsed, cpu_elapsed))

    # ======= numpy reference =======
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        for p in range(P):
            for q in range(Q):
                # input rows seen by output position (m, p, q)
                idx = pixel_indices(conv_nc,
                                    m * str_d - pad_d,
                                    p * str_h - pad_h,
                                    q * str_w - pad_w)
                patch = cpuI[idx, :]
                err_slice = cpuE[:, m, p, q, :]

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
                cpuB[idx, :] += np.dot(cpuF, err_slice)
                cpuU += np.dot(patch, err_slice.T)

    # (name, gpu result, cpu result, numpy reference, extent); the last
    # entry is carried along but not used by the checks
    comparisons = (
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S),
    )
    for op, gpu_res, cpu_res, expected, _extent in comparisons:
        print(op)
        cpu_host = cpu_res.get().astype(np.float32)
        gpu_host = gpu_res.get().astype(np.float32)

        # largest deviation relative to the largest reference magnitude
        peak = abs(expected).max()
        cpu_ratio = abs(expected - cpu_host).max() / peak
        gpu_ratio = abs(expected - gpu_host).max() / peak

        assert cpu_ratio < 1e-5
        assert gpu_ratio < 1e-5

        assert allclose_with_out(cpu_res.get(), expected, rtol=0, atol=1e-4)
        assert allclose_with_out(gpu_res.get(), expected, rtol=0, atol=1e-3)

    del ng
    del nc
def test_branch_model(backend_gpu):
    """
    Compare a Sequential model containing an inception (branching) block
    with a hand-assembled reference built from the same layers.

    The reference runs a separate copy of the main branch (weights copied
    from the first 8 layers of the neon model), feeds its output through
    each bare inception branch, stacks the branch outputs along axis 0 and
    continues through the top branch. Forward outputs are compared, then
    the deltas at the root of the branch layer after back propagation.

    Arguments:
        backend_gpu: not referenced in the body; presumably a pytest
            fixture that sets up the GPU backend -- confirm in conftest
    """
    np.random.seed(0)
    be = NervanaObject.be
    # bind the batch size once so the backend setting and the input width
    # cannot diverge (the input previously used a name not bound here)
    batch_size = 64
    be.bsz = batch_size
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every branch as a continuation of the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the neon model into the reference trunk
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # the reference branches start from the branch layer's error views
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model(backend_gpu):
    """
    Compare a Sequential model containing an inception (branching) block
    with a hand-assembled reference built from the same layers.

    The reference runs a separate copy of the main branch (weights copied
    from the first 8 layers of the neon model), feeds its output through
    each bare inception branch, stacks the branch outputs along axis 0 and
    continues through the top branch. Forward outputs are compared, then
    the deltas at the root of the branch layer after back propagation.

    Arguments:
        backend_gpu: not referenced in the body; presumably a pytest
            fixture that sets up the GPU backend -- confirm in conftest
    """
    np.random.seed(0)
    be = NervanaObject.be
    # bind the batch size once so the backend setting and the input width
    # cannot diverge (the input previously used a name not bound here)
    batch_size = 64
    be.bsz = batch_size
    main1 = main_branch()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top = top_branch()
    neon_layer = Sequential(main1 + i1 + top)

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].set_deltas([be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every branch as a continuation of the reference trunk
    for bb in (b1, b2, b3):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the neon model into the reference trunk
    main1_trunk = neon_layer.layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])
    for bb in (b1, b2, b3):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[8].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)

    # run each branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = x
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out, neon_out_ref, rtol=0)

    print("Beginning Back prop")
    erra = np.random.random(neon_out.shape)
    err = be.array(erra)
    for ll in reversed(neon_layer.layers[8:]):
        err = ll.bprop(err)

    neon_deltas = err.get()
    # the reference branches start from the branch layer's error views
    for bb, errb in zip((b1, b2, b3), neon_layer.layers[8].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = b3[0].deltas + b2[0].deltas + b1[0].deltas

    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(neon_deltas, neon_ref_deltas, rtol=0)
def test_branch_model_fork(backend_gpu):
    """
    Compare a two-path Tree model that forks at a BranchNode with a
    hand-assembled reference built from the same layers.

    Path 1 is main branch -> branch node -> inception -> top branch; path 2
    starts at the same branch node and continues through a second top
    branch, weighted by alpha2 = 0.3. The reference copies the weights from
    the neon model, reproduces both forward outputs and then checks the
    deltas at the branch node ("middle") and at the second layer of the
    trunk ("bottom") after back propagation.

    Arguments:
        backend_gpu: not referenced in the body; presumably a pytest
            fixture that sets up the GPU backend -- confirm in conftest
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    # bind the batch size once so the backend setting and the input width
    # cannot diverge (the input previously used a name not bound here)
    batch_size = 64
    be.bsz = batch_size
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every branch as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the neon model into the reference trunk
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # same for the second path; its first layer (the branch node) is skipped
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    print("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # the reference branches start from the neon branch layer's error views
    for bb, errb in zip((b1, b2, b3),
                        neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    # push the summed deltas back through the reference trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas,
                             rtol=0)
def test_branch_model_fork(backend_gpu):
    """
    Compare a two-path Tree model that forks at a BranchNode with a
    hand-assembled reference built from the same layers.

    Path 1 is main branch -> branch node -> inception -> top branch; path 2
    starts at the same branch node and continues through a second top
    branch, weighted by alpha2 = 0.3. The reference copies the weights from
    the neon model, reproduces both forward outputs and then checks the
    deltas at the branch node ("middle") and at the second layer of the
    trunk ("bottom") after back propagation.

    Arguments:
        backend_gpu: not referenced in the body; presumably a pytest
            fixture that sets up the GPU backend -- confirm in conftest
    """
    from neon.layers import BranchNode, Tree
    np.random.seed(0)
    be = NervanaObject.be
    # bind the batch size once so the backend setting and the input width
    # cannot diverge (the input previously used a name not bound here)
    batch_size = 64
    be.bsz = batch_size
    bnode = BranchNode()
    i1 = inception([(32,), (32, 32), ('max', 16)])
    top1 = top_branch()
    top2 = top_branch()
    p1 = Sequential(main_branch() + [bnode, i1] + top1)
    p2 = [bnode] + top2

    alpha2 = 0.3
    neon_layer = Tree([p1, p2], alphas=[1.0, alpha2])

    inshape = (3, 224, 224)
    insize = np.prod(inshape)
    inpa = np.random.random((insize, batch_size))
    neon_layer.configure(inshape)
    inp = neon_layer.be.array(inpa)

    neon_layer.allocate()
    print(neon_layer.nested_str())
    neon_layer.layers[0].layers[0].prev_layer = True
    neon_layer.allocate_deltas()
    neon_layer.layers[0].layers[0].set_deltas([be.iobuf(inshape)])
    neon_out_dev = neon_layer.fprop(inp)
    neon_out = [d.get() for d in neon_out_dev]

    # Now make the reference pathways:
    main_trunk2 = Sequential(main_branch())
    main_trunk2.configure(inshape)
    main2 = main_trunk2.layers
    main2[0].prev_layer = True
    main2[0].set_deltas([be.iobuf(inshape)])
    branch2 = Sequential(top_branch())
    lbranch2 = branch2.layers
    (b1, b2, b3) = inception_bare(i1, [(32,), (32, 32), ('max', 16)])

    # configure every branch as a continuation of the reference trunk
    for bb in (b1, b2, b3, lbranch2):
        oshape = inshape
        for ll in main2 + bb:
            oshape = ll.configure(oshape)

    # copy the trunk weights of the neon model into the reference trunk
    main1_trunk = neon_layer.layers[0].layers[:8]
    for ll, lo in zip(main2, main1_trunk):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})
        ll.allocate()
        ll.set_deltas([be.iobuf(ll.in_shape)])

    # same for the second path; its first layer (the branch node) is skipped
    for ll, lo in zip(lbranch2, neon_layer.layers[1].layers[1:]):
        if ll.has_params:
            ll.set_params({'params': {'W': lo.W.get()}})

    for bb in (b1, b2, b3, lbranch2):
        for ll in bb:
            ll.allocate()
            ll.set_deltas([be.iobuf(ll.in_shape)])

    # Create the combined output buffer
    merge_output = be.empty_like(neon_layer.layers[0].layers[9].outputs)

    x = inp
    for ll in main2:
        x = ll.fprop(x)
    main2_out = x

    # run each inception branch on the trunk output and stack the results
    start = 0
    for bb in (b1, b2, b3):
        xb = main2_out
        for ll in bb:
            xb = ll.fprop(xb)
        end = start + xb.shape[0]
        merge_output[start:end] = xb
        start = end

    x = merge_output

    top_trunk = Sequential(top1).layers
    for ll in top_trunk:
        x = ll.fprop(x)

    neon_out_ref = x.get()
    assert allclose_with_out(neon_out_ref, neon_out[0], rtol=0)

    # Now do second branch
    neon_out_ref2 = branch2.fprop(main2_out).get()
    assert allclose_with_out(neon_out_ref2, neon_out[1])

    print("Beginning Back prop")
    erra = [np.random.random(d.shape) for d in neon_out]
    err = [be.array(d) for d in erra]
    neon_layer.layers[0].layers[0].deltas = be.iobuf(inshape)
    neon_layer.bprop(err)

    bottom_neon_deltas = neon_layer.layers[0].layers[1].deltas.get()
    middle_neon_deltas = neon_layer.layers[1].layers[1].deltas.get()

    err0 = err[0]
    for ll in reversed(top_trunk):
        err0 = ll.bprop(err0)

    err1 = err[1]
    for ll in reversed(lbranch2):
        err1 = ll.bprop(err1)

    # the reference branches start from the neon branch layer's error views
    for bb, errb in zip((b1, b2, b3),
                        neon_layer.layers[0].layers[-5].error_views):
        for ll in reversed(bb):
            errb = ll.bprop(errb)

    # Now sum up the deltas at the root of the branch layer and compare
    ref_deltas = be.zeros_like(b1[0].deltas)
    ref_deltas[:] = alpha2 * lbranch2[0].deltas
    ref_deltas[:] = ref_deltas + b3[0].deltas + b2[0].deltas + b1[0].deltas
    neon_ref_deltas = ref_deltas.get()
    assert allclose_with_out(middle_neon_deltas, neon_ref_deltas, rtol=0)

    # push the summed deltas back through the reference trunk
    x = ref_deltas
    main2[0].deltas = be.iobuf(inshape)

    for ll in reversed(main2):
        x = ll.bprop(x)

    bottom_neon_ref_deltas = main2[1].deltas.get()
    assert allclose_with_out(bottom_neon_deltas, bottom_neon_ref_deltas,
                             rtol=0)
def test_conv_rand(backend_default, rand_convargs):
    """
    Compare the neon Convolution layer with the reference ConvLayerRef on
    random inputs, for fprop, bprop deltas and weight updates.

    The absolute tolerance of each comparison is measured rather than
    fixed: the reference computation is repeated with permute=True and
    four times the largest difference between the two reference results
    is used as atol (rtol is 1e-4 throughout).

    Arguments:
        backend_default: not referenced in the body; presumably a pytest
            backend fixture -- confirm in conftest
        rand_convargs: 9-tuple (indim, nifm, fshape, nofm, batch_size,
            stride, rng_max, w_rng, pad)
    """
    (indim, nifm, fshape, nofm, batch_size,
     stride, rng_max, w_rng, pad) = rand_convargs
    NervanaObject.be.bsz = batch_size

    def measured_atol(first, second):
        # 4x the largest element-wise gap between two reference results
        return 4 * np.max(np.abs(first - second))

    dtypeu = np.float32
    lo, hi = 0.0, rng_max
    span = hi - lo
    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    # neon conv layer under test, weights drawn uniformly from w_rng
    neon_layer = Convolution(fshape=(fshape, fshape, nofm),
                             strides=stride,
                             padding=pad,
                             init=Uniform(low=w_rng[0], high=w_rng[1]))

    # reference layer with the same geometry
    ref_layer = ConvLayerRef(1, batch_size, identity,
                             nifm, (indim, indim), (fshape, fshape),
                             nofm, stride, dtypeu, padding=pad)

    # random input scaled into [lo, hi)
    inpa = (np.random.random((insize, batch_size)) * span + lo).astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # neon fprop
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # reference fprop using the weights the neon layer initialized
    ref_layer.weights = neon_layer.W.get().T
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.y)

    # repeat with permuted op order to estimate numerical precision
    ref_layer.fprop(inpa.T, permute=True)
    fprop_atol = measured_atol(ref_out, ref_layer.y)
    assert allclose_with_out(ref_out.T, neon_out,
                             atol=fprop_atol, rtol=1.e-4)

    # random deltas in the same range as the input
    erra = (np.random.random(neon_out.shape) * span + lo).astype(dtypeu)
    err = neon_layer.be.array(erra)

    # neon bprop
    neon_deltas = neon_layer.bprop(err).get()
    neon_dW = neon_layer.dW.get()

    # reference bprop; copies are taken because the permuted run follows
    ref_layer.bprop(erra.T, 1.0)
    ref_deltas = np.copy(ref_layer.berror_nopad.T)
    ref_dW = np.copy(ref_layer.updates)

    # permuted reference bprop for the precision estimate
    ref_layer.bprop(erra.T, 1.0, permute=True)
    ref_deltas_perm = ref_layer.berror_nopad.T
    ref_dW_perm = ref_layer.updates

    deltas_atol = measured_atol(ref_deltas, ref_deltas_perm)
    assert allclose_with_out(ref_deltas, neon_deltas,
                             atol=deltas_atol, rtol=1.e-4)

    dW_atol = measured_atol(ref_dW, ref_dW_perm)
    assert allclose_with_out(ref_dW.T, neon_dW, atol=dW_atol, rtol=1.e-4)
def test_bibn(backend_default, fargs):
    """
    Check the batch-norm helpers of BiBNRNN (_fprop_bn and _bprop_bn)
    against the backend calls compound_fprop_bn and compound_bprop_bn.

    Arguments:
        backend_default: not referenced in the body; presumably a pytest
            backend fixture -- confirm in conftest
        fargs: 4-tuple (seq_len, input_size, hidden_size, batch_size)
    """
    seq_len, input_size, hidden_size, batch_size = fargs
    NervanaObject.be.bsz = batch_size

    # set up the bi-directional rnn
    birnn = BiBNRNN(hidden_size, activation=Logistic(), init=GlorotUniform())
    birnn.configure((input_size, seq_len))
    birnn.prev_layer = True
    birnn.allocate()
    be = birnn.be
    birnn.set_deltas([be.iobuf(birnn.in_shape)])

    # ---- fprop ----
    # fill the feed-forward buffer with random values
    inp_np = np.random.random(birnn.h_ff_buffer.shape)
    inp_be = be.array(inp_np)
    birnn.h_ff_buffer[:] = inp_np

    # fresh statistics/parameter buffers for the direct backend call
    xsum = be.zeros_like(birnn.xmean)
    xvar = be.zeros_like(birnn.xvar)
    gmean = be.zeros_like(birnn.gmean)
    gvar = be.zeros_like(birnn.gvar)
    gamma = be.ones(birnn.gamma.shape)
    beta = be.zeros_like(birnn.beta)
    grad_gamma = be.zeros_like(gamma)
    grad_beta = be.zeros_like(beta)
    out_ref = be.zeros_like(birnn.h_ff_buffer)

    xsum[:] = be.sum(birnn.h_ff_buffer, axis=1)
    be.compound_fprop_bn(birnn.h_ff_buffer, xsum, xvar, gmean, gvar,
                         gamma, beta, out_ref, birnn.eps, birnn.rho,
                         accumbeta=0, relu=False)

    # same computation through the layer's own helper
    out_bn = birnn._fprop_bn(birnn.h_ff_buffer, inference=False)

    assert allclose_with_out(out_bn.get(), out_ref.get(),
                             rtol=0.0, atol=1.0e-5)

    # ---- bprop ----
    err_np = np.random.random(birnn.h_ff_buffer.shape)
    err_be = be.array(err_np)
    err_out_ref = be.empty_like(err_be)
    be.compound_bprop_bn(err_out_ref, grad_gamma, grad_beta, err_be,
                         inp_be, xsum, xvar, gamma, birnn.eps)

    err_out_bn = birnn._bprop_bn(err_be, out_bn)

    assert allclose_with_out(err_out_bn.get(), err_out_ref.get(),
                             rtol=0.0, atol=1.0e-5)
def test_conv_rand(backend_default, rand_convargs):
    """
    Compare the neon Convolution layer with the reference ConvLayerRef on
    random inputs, for fprop, bprop deltas and weight updates.

    The absolute tolerance of each comparison is measured rather than
    fixed: the reference computation is repeated with permute=True and
    four times the largest difference between the two reference results
    is used as atol (rtol is 1e-4 throughout).

    Arguments:
        backend_default: not referenced in the body; presumably a pytest
            backend fixture -- confirm in conftest
        rand_convargs: 9-tuple (indim, nifm, fshape, nofm, batch_size,
            stride, rng_max, w_rng, pad)
    """
    (indim, nifm, fshape, nofm, batch_size,
     stride, rng_max, w_rng, pad) = rand_convargs
    NervanaObject.be.bsz = batch_size

    def measured_atol(first, second):
        # 4x the largest element-wise gap between two reference results
        return 4 * np.max(np.abs(first - second))

    dtypeu = np.float32
    lo, hi = 0.0, rng_max
    span = hi - lo
    inshape = (nifm, indim, indim)
    insize = np.prod(inshape)

    # neon conv layer under test, weights drawn uniformly from w_rng
    neon_layer = Convolution(fshape=(fshape, fshape, nofm),
                             strides=stride,
                             padding=pad,
                             init=Uniform(low=w_rng[0], high=w_rng[1]))

    # reference layer with the same geometry
    ref_layer = ConvLayerRef(1, batch_size, identity,
                             nifm, (indim, indim), (fshape, fshape),
                             nofm, stride, dtypeu, padding=pad)

    # random input scaled into [lo, hi)
    inpa = (np.random.random((insize, batch_size)) * span + lo).astype(dtypeu)
    inp = neon_layer.be.array(inpa)
    inp.lshape = inshape

    # neon fprop
    neon_layer.configure(inshape)
    neon_layer.prev_layer = True
    neon_layer.allocate()
    neon_layer.set_deltas([neon_layer.be.iobuf(inshape)])
    neon_out = neon_layer.fprop(inp).get()

    # reference fprop using the weights the neon layer initialized
    ref_layer.weights = neon_layer.W.get().T
    ref_layer.fprop(inpa.T)
    ref_out = np.copy(ref_layer.y)

    # repeat with permuted op order to estimate numerical precision
    ref_layer.fprop(inpa.T, permute=True)
    fprop_atol = measured_atol(ref_out, ref_layer.y)
    assert allclose_with_out(ref_out.T, neon_out,
                             atol=fprop_atol, rtol=1.e-4)

    # random deltas in the same range as the input
    erra = (np.random.random(neon_out.shape) * span + lo).astype(dtypeu)
    err = neon_layer.be.array(erra)

    # neon bprop
    neon_deltas = neon_layer.bprop(err).get()
    neon_dW = neon_layer.dW.get()

    # reference bprop; copies are taken because the permuted run follows
    ref_layer.bprop(erra.T, 1.0)
    ref_deltas = np.copy(ref_layer.berror_nopad.T)
    ref_dW = np.copy(ref_layer.updates)

    # permuted reference bprop for the precision estimate
    ref_layer.bprop(erra.T, 1.0, permute=True)
    ref_deltas_perm = ref_layer.berror_nopad.T
    ref_dW_perm = ref_layer.updates

    deltas_atol = measured_atol(ref_deltas, ref_deltas_perm)
    assert allclose_with_out(ref_deltas, neon_deltas,
                             atol=deltas_atol, rtol=1.e-4)

    dW_atol = measured_atol(ref_dW, ref_dW_perm)
    assert allclose_with_out(ref_dW.T, neon_dW, atol=dW_atol, rtol=1.e-4)
def test_conv_layer(fargs_tests, device_id):
    """
    Check the NervanaGPU and NervanaCPU convolution results (fprop, bprop
    and update) against a direct numpy computation.

    Arguments:
        fargs_tests: indexable of five triples, read as
            [0] (N, C, K), [1] (D, H, W), [2] (T, R, S),
            [3] padding (d, h, w) and [4] strides (d, h, w)
        device_id: device id handed to NervanaGPU
    """
    dtype = np.float32

    ng = NervanaGPU(stochastic_round=False, bench=True, device_id=device_id)

    N, C, K = fargs_tests[0]
    D, H, W = fargs_tests[1]
    T, R, S = fargs_tests[2]
    padding_d, padding_h, padding_w = fargs_tests[3]
    strides_d, strides_h, strides_w = fargs_tests[4]

    # identical positional arguments for both backends
    layer_args = (dtype, N, C, K, D, H, W, T, R, S,
                  padding_d, padding_h, padding_w,
                  strides_d, strides_h, strides_w)
    conv_ng = ng.conv_layer(*layer_args)

    nc = NervanaCPU()
    conv_nc = nc.conv_layer(*layer_args)

    # both backends must agree on the layer geometry
    for attr in ("dimI", "dimF", "dimO", "M"):
        assert getattr(conv_nc, attr) == getattr(conv_ng, attr)

    dimI, dimF, dimO = conv_ng.dimI, conv_ng.dimF, conv_ng.dimO

    # cpu input arrays
    cpuI = np.random.uniform(-0.8, 0.8, slicable(dimI, 1)).astype(np.float32)
    cpuF = np.random.uniform(0.0, 0.3, slicable(dimF)).astype(np.float32)
    cpuE = np.random.uniform(-0.2, 0.2, dimO).astype(np.float32)

    # zero pad the last row of cpu input for the sake of numpy
    cpuI[-1, :] = 0.0

    # ======= GPU and CPU backends =======
    # the backends get the input without the extra padding row
    beI = cpuI[:-1, :].reshape(dimI)
    beF = cpuF.reshape(dimF)
    beE = cpuE

    tic = default_timer()
    ngO, ngB, ngU = run_backend_conv(ng, conv_ng, beI, beF, beE, dtype)
    gpu_elapsed = default_timer() - tic

    tic = default_timer()
    ncO, ncB, ncU = run_backend_conv(nc, conv_nc, beI, beF, beE, dtype)
    cpu_elapsed = default_timer() - tic

    print("gputime: %s, cputime %s" % (gpu_elapsed, cpu_elapsed))

    # ======= numpy reference =======
    cpuO = np.zeros(dimO, dtype=dtype)
    cpuB = np.zeros(slicable(dimI, 1), dtype=dtype)
    cpuU = np.zeros(slicable(dimF), dtype=dtype)

    D, H, W = conv_nc.DHW
    T, R, S = conv_nc.TRS
    M, P, Q = conv_nc.MPQ

    pad_d, pad_h, pad_w = conv_nc.padding
    str_d, str_h, str_w = conv_nc.strides

    for m in range(M):
        for p in range(P):
            for q in range(Q):
                # input rows seen by output position (m, p, q)
                idx = pixel_indices(conv_nc,
                                    m * str_d - pad_d,
                                    p * str_h - pad_h,
                                    q * str_w - pad_w)
                patch = cpuI[idx, :]
                err_slice = cpuE[:, m, p, q, :]

                cpuO[:, m, p, q, :] = np.dot(cpuF.T, patch)
                cpuB[idx, :] += np.dot(cpuF, err_slice)
                cpuU += np.dot(patch, err_slice.T)

    # (name, gpu result, cpu result, numpy reference, extent); the last
    # entry is carried along but not used by the checks
    comparisons = (
        ("fprop", ngO, ncO, cpuO, Q),
        ("bprop", ngB, ncB.reshape(dimI), cpuB[:-1, :].reshape(dimI), W),
        ("update", ngU, ncU.reshape(dimF), cpuU.reshape(dimF), S),
    )
    for op, gpu_res, cpu_res, expected, _extent in comparisons:
        print(op)
        cpu_host = cpu_res.get().astype(np.float32)
        gpu_host = gpu_res.get().astype(np.float32)

        # largest deviation relative to the largest reference magnitude
        peak = abs(expected).max()
        cpu_ratio = abs(expected - cpu_host).max() / peak
        gpu_ratio = abs(expected - gpu_host).max() / peak

        assert cpu_ratio < 1e-5
        assert gpu_ratio < 1e-5

        assert allclose_with_out(cpu_res.get(), expected, rtol=0, atol=1e-4)
        assert allclose_with_out(gpu_res.get(), expected, rtol=0, atol=1e-3)

    del ng
    del nc