def gradient_calc(seq_len, input_size, hidden_size, batch_size,
                  epsilon=None, rand_scale=None, inp_bl=None):
    """Compare neon LSTM input deltas with a finite-difference estimate.

    A fake scalar loss ``np.sum(rand_scale * output)`` is defined on the LSTM
    output. The gradient of that loss with respect to every input element is
    obtained twice: once from ``lstm.bprop`` (using ``rand_scale`` as the
    incoming errors) and once numerically, by a central difference with step
    ``epsilon`` on each input element in turn.

    Side effect: sets ``NervanaObject.be.bsz`` and
    ``NervanaObject.be.batch_size`` to ``batch_size``.

    Arguments:
        seq_len: number of time steps passed to ``lstm.configure``.
        input_size: number of input features per time step.
        hidden_size: number of hidden units given to the ``LSTM`` layer.
        batch_size: batch size installed on the backend.
        epsilon: perturbation added to / subtracted from each input element.
            Required in practice; the ``None`` default is rejected.
        rand_scale: weights of the fake loss. When ``None``, values are drawn
            uniformly from [-1, 1) with the shape of the baseline output.
        inp_bl: baseline input array. When ``None``, a standard-normal array
            of shape ``(input_size, seq_len * batch_size)`` is generated.

    Returns:
        tuple ``(grads_est, deltas_neon)``: the finite-difference gradient
        estimate and the deltas returned by ``lstm.bprop``.

    Raises:
        TypeError: if ``epsilon`` is ``None``.
    """
    # Fail fast with a clear message instead of crashing on arithmetic with
    # None after the layer has already been built and run.
    if epsilon is None:
        raise TypeError("gradient_calc() requires a numeric epsilon for the "
                        "finite-difference step, got None")

    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    input_shape = (input_size, seq_len * batch_size)

    # generate input if one is not given
    if inp_bl is None:
        inp_bl = np.random.randn(*input_shape)

    # neon lstm instance
    lstm = LSTM(hidden_size, Gaussian(), activation=Tanh(),
                gate_activation=Logistic())
    inpa = lstm.be.array(np.copy(inp_bl))

    # run fprop on the baseline input
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    out_bl = lstm.fprop(inpa).get()

    # random scaling/hash to generate fake loss
    if rand_scale is None:
        rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0
    # loss function would be:
    # loss_bl = np.sum(rand_scale * out_bl)

    # run back prop with rand_scale as the errors
    # use copy to avoid any interactions
    deltas_neon = lstm.bprop(lstm.be.array(np.copy(rand_scale))).get()

    # central-difference factor 1 / (2 * epsilon); constant across the loop
    half_inv_eps = 0.5 / float(epsilon)

    # add a perturbation to each input element
    grads_est = np.zeros(inpa.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inpa.size):
        save_val = inp_pert.flat[pert_ind]

        # forward pass with the element nudged up
        inp_pert.flat[pert_ind] = save_val + epsilon
        reset_lstm(lstm)
        lstm.allocate()
        out_pos = lstm.fprop(lstm.be.array(inp_pert)).get()

        # forward pass with the element nudged down
        inp_pert.flat[pert_ind] = save_val - epsilon
        reset_lstm(lstm)
        lstm.allocate()
        out_neg = lstm.fprop(lstm.be.array(inp_pert)).get()

        # calculate the loss with perturbations
        loss_pos = np.sum(rand_scale * out_pos)
        loss_neg = np.sum(rand_scale * out_neg)

        # compute the gradient estimate
        grads_est.flat[pert_ind] = half_inv_eps * (loss_pos - loss_neg)

        # reset the perturbed input element
        inp_pert.flat[pert_ind] = save_val

    del lstm
    return (grads_est, deltas_neon)
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               inp_moms=(0.0, 1.0)):
    """Check neon LSTM fprop and bprop against the numpy reference LSTM.

    A neon ``LSTM`` layer is built and run on random input; its weights and
    bias are then copied into a ``RefLSTM`` so both implementations compute
    with identical parameters. Gate activations, cell states, hidden states,
    weight/bias gradients and input deltas are compared with an absolute
    tolerance of 1.5e-5.

    Side effect: sets ``NervanaObject.be.bsz`` and
    ``NervanaObject.be.batch_size`` to ``batch_size``.

    Arguments:
        seq_len: number of time steps.
        input_size: number of input features per time step.
        hidden_size: number of hidden units.
        batch_size: batch size installed on the backend.
        init_func: initializer for the neon model parameters.
        inp_moms: pair ``(offset, scale)`` applied to the random input as
            ``np.random.rand(...) * scale + offset``. The samples are
            uniform on [0, 1), so these act as an offset and a width rather
            than a true mean and standard deviation.

    Raises:
        AssertionError: if any compared tensor differs beyond the tolerance.
    """
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # one tolerance shared by every comparison below
    tol = {'rtol': 0.0, 'atol': 1.5e-5}

    # neon LSTM
    lstm = LSTM(hidden_size, init_func, activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # Hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make ref weights and biases with neon model:
    # row 0 is the bias, the next input_size rows are the input weights,
    # and the remaining rows are the recurrent weights
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, _, _, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref, **tol)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref, **tol)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(), Hout_ref, **tol)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate random deltas tensor
    deltas = np.random.randn(*hidden_shape)

    lstm.bprop(lstm.be.array(deltas))
    # grab the delta W from gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, _, _) = lstm_ref.backward(deltas_ref, batch_cache)

    # split the reference gradient using the same row layout as WLSTM
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare results
    neon_logger.display('Making sure neon LSTM match numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T, **tol)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T, **tol)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref, **tol)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref, **tol)

    neon_logger.display('bprop is verified')