import numpy as np

from neon import NervanaObject, logger as neon_logger
from neon.initializers import Gaussian
from neon.layers.recurrent import Recurrent
from neon.transforms import Tanh

# reset_rnn, RefRecurrent, allclose_with_out and DeltasTree are assumed to be
# provided by the accompanying test utilities / neon layer module.


def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon rnn instance
    rnn = Recurrent(hidden_size, Gaussian(), activation=Tanh())
    inpa = rnn.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()
    rnn.set_deltas([rnn.be.iobuf(rnn.in_shape)])
    out_bl = rnn.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = rnn.bprop(rnn.be.array(np.copy(rand_scale))).get()

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        # perturb the input element in the positive direction
        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_pos = rnn.fprop(rnn.be.array(inp_pert)).get()

        # perturb the input element in the negative direction
        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_rnn(rnn)
        rnn.allocate()
        out_neg = rnn.fprop(rnn.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the central-difference gradient estimate
        grad = 0.5 * (loss_pos - loss_neg) / epsilon
        grads_est.flat[pert_ind] = grad

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del rnn
    return (grads_est, deltas_neon)
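# Hypothetical driver for the finite-difference check above; a minimal sketch
# only. The gen_backend arguments, layer sizes, epsilon and tolerance are
# illustrative assumptions, not values prescribed by gradient_calc itself.
# Double precision is used so the finite-difference estimate is not dominated
# by rounding noise.
from neon.backends import gen_backend


def run_gradient_check():
    # assumed backend configuration (CPU, batch size 1, fixed seed, float64)
    gen_backend(backend='cpu', batch_size=1, rng_seed=0,
                datatype=np.float64)

    # central-difference step size (illustrative)
    epsilon = 1.0e-5

    grads_est, deltas_neon = gradient_calc(seq_len=3, input_size=5,
                                           hidden_size=10, batch_size=1,
                                           epsilon=epsilon)

    # the analytic input deltas from bprop should agree with the numerical
    # estimate up to finite-difference error
    assert np.allclose(grads_est, deltas_neon, rtol=0.0, atol=1.0e-6)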
def check_rnn(seq_len, input_size, hidden_size, batch_size,
              init_func, inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())
    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =================
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= running models ==========
    # run neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    rnn.fprop(inpa)

    # weights are only initialized after doing fprop, so now
    # make ref weights and biases the same with neon model
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from gradient buffer
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # comparing outputs
    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(rnn.outputs.get(), h_ref_list,
                             rtol=0.0, atol=1.0e-5)
    neon_logger.display('fprop is verified')

    neon_logger.display('====Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('====Verifying update on bias====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('bprop is verified')
    return
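# Hypothetical driver showing one way check_rnn might be exercised; a sketch
# only. The gen_backend arguments, the Gaussian initializer settings and the
# size combinations are illustrative assumptions.
from neon.backends import gen_backend


def run_rnn_checks():
    # assumed backend configuration (CPU, fixed seed)
    gen_backend(backend='cpu', batch_size=16, rng_seed=0)

    init = Gaussian(loc=0.0, scale=0.1)  # illustrative initializer

    # (seq_len, input_size, hidden_size, batch_size) combinations to test
    for seq_len, input_size, hidden_size, batch_size in [(3, 5, 10, 16),
                                                         (5, 10, 20, 16)]:
        check_rnn(seq_len, input_size, hidden_size, batch_size,
                  init_func=init, inp_moms=[0.0, 1.0])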