def gradient_calc(seq_len, input_size, hidden_size, batch_size, epsilon=None, rand_scale=None, inp_bl=None): NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size input_shape = (input_size, seq_len * batch_size) # generate input if one is not given if inp_bl is None: inp_bl = np.random.randn(*input_shape) # neon gru instance gru = GRU(hidden_size, init=Gaussian(), activation=Tanh(), gate_activation=Logistic()) inpa = gru.be.array(np.copy(inp_bl)) # run fprop on the baseline input gru.configure((input_size, seq_len)) gru.prev_layer = True gru.allocate() gru.set_deltas([gru.be.iobuf(gru.in_shape)]) out_bl = gru.fprop(inpa).get() # random scaling/hash to generate fake loss if rand_scale is None: rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0 # loss function would be: # loss_bl = np.sum(rand_scale * out_bl) # run back prop with rand_scale as the errors # use copy to avoid any interactions deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get() # add a perturbation to each input element grads_est = np.zeros(inpa.shape) inp_pert = inp_bl.copy() for pert_ind in range(inpa.size): save_val = inp_pert.flat[pert_ind] inp_pert.flat[pert_ind] = save_val + epsilon reset_gru(gru) gru.allocate() out_pos = gru.fprop(gru.be.array(inp_pert)).get() inp_pert.flat[pert_ind] = save_val - epsilon reset_gru(gru) gru.allocate() out_neg = gru.fprop(gru.be.array(inp_pert)).get() # calculate the loss with perturbations loss_pos = np.sum(rand_scale*out_pos) loss_neg = np.sum(rand_scale*out_neg) # compute the gradient estimate grad = 0.5*(loss_pos-loss_neg)/epsilon grads_est.flat[pert_ind] = grad # reset the perturbed input element inp_pert.flat[pert_ind] = save_val del gru return (grads_est, deltas_neon)
def gradient_calc(seq_len, input_size, hidden_size, batch_size, epsilon=None, rand_scale=None, inp_bl=None): NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size input_shape = (input_size, seq_len * batch_size) # generate input if one is not given if inp_bl is None: inp_bl = np.random.randn(*input_shape) # neon gru instance gru = GRU(hidden_size, init=Gaussian(), activation=Tanh(), gate_activation=Logistic()) inpa = gru.be.array(np.copy(inp_bl)) # run fprop on the baseline input gru.configure((input_size, seq_len)) gru.prev_layer = True gru.allocate() gru.set_deltas([gru.be.iobuf(gru.in_shape)]) out_bl = gru.fprop(inpa).get() # random scaling/hash to generate fake loss if rand_scale is None: rand_scale = np.random.random(out_bl.shape) * 2.0 - 1.0 # loss function would be: # loss_bl = np.sum(rand_scale * out_bl) # run back prop with rand_scale as the errors # use copy to avoid any interactions deltas_neon = gru.bprop(gru.be.array(np.copy(rand_scale))).get() # add a perturbation to each input element grads_est = np.zeros(inpa.shape) inp_pert = inp_bl.copy() for pert_ind in range(inpa.size): save_val = inp_pert.flat[pert_ind] inp_pert.flat[pert_ind] = save_val + epsilon reset_gru(gru) gru.allocate() out_pos = gru.fprop(gru.be.array(inp_pert)).get() inp_pert.flat[pert_ind] = save_val - epsilon reset_gru(gru) gru.allocate() out_neg = gru.fprop(gru.be.array(inp_pert)).get() # calculate the loss with perturbations loss_pos = np.sum(rand_scale * out_pos) loss_neg = np.sum(rand_scale * out_neg) # compute the gradient estimate grad = 0.5 / float(epsilon) * (loss_pos - loss_neg) grads_est.flat[pert_ind] = grad # reset the perturbed input element inp_pert.flat[pert_ind] = save_val del gru return (grads_est, deltas_neon)
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0]): # init_func is the initializer for the model params # inp_moms is the [ mean, std dev] of the random input input_shape = (input_size, seq_len * batch_size) output_shape = (hidden_size, seq_len * batch_size) NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size # neon GRU gru = GRU(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic()) # generate random input tensor inp = np.random.rand(*input_shape)*inp_moms[1] + inp_moms[0] inpa = gru.be.array(inp) # generate random deltas tensor deltas = np.random.randn(*output_shape) # run neon fprop gru.configure((input_size, seq_len)) gru.prev_layer = True gru.allocate() gru.set_deltas([gru.be.iobuf(gru.in_shape)]) gru.fprop(inpa) # reference numpy GRU gru_ref = RefGRU(input_size, hidden_size) WGRU = gru_ref.weights # make ref weights and biases the same with neon model r_range = range(hidden_size) z_range = range(hidden_size, hidden_size * 2) c_range = range(hidden_size * 2, hidden_size * 3) WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range] WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range] WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range] WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range] WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range] WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range] WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range] WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range] WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range] # transpose input X and do fprop # the reference code expects these shapes: # input_shape: (seq_len, input_size, batch_size) # output_shape: (seq_len, hidden_size, batch_size) inp_ref = inp.copy().T.reshape( seq_len, batch_size, input_size).swapaxes(1, 2) deltas_ref = deltas.copy().T.reshape( seq_len, batch_size, hidden_size).swapaxes(1, 2) (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref) print '====Verifying hidden states====' print allclose_with_out(gru.outputs.get(), h_ref_list, rtol=0.0, atol=1.0e-5) print 'fprop is verified' # now test the bprop print 'Making sure neon GRU match numpy GRU in bprop' gru.bprop(gru.be.array(deltas)) # grab the delta W from gradient buffer dWinput_neon = gru.dW_input.get() dWrecur_neon = gru.dW_recur.get() db_neon = gru.db.get() dWxr_neon = dWinput_neon[r_range] dWxz_neon = dWinput_neon[z_range] dWxc_neon = dWinput_neon[c_range] dWrr_neon = dWrecur_neon[r_range] dWrz_neon = dWrecur_neon[z_range] dWrc_neon = dWrecur_neon[c_range] dbr_neon = db_neon[r_range] dbz_neon = db_neon[z_range] dbc_neon = db_neon[c_range] drzc_neon = gru.rzhcan_delta_buffer.get() dr_neon = drzc_neon[r_range] dz_neon = drzc_neon[z_range] dc_neon = drzc_neon[c_range] dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr] dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz] dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc] dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr] dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz] dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc] dbr_ref = dWGRU_ref[gru_ref.dW_ind_br] dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz] dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc] # print '====Verifying hidden deltas ====' print '====Verifying r deltas ====' assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5) print '====Verifying z deltas ====' assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5) print '====Verifying hcan deltas ====' assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5) print '====Verifying update on W_input====' print 'dWxr' assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5) print 'dWxz' assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5) print 'dWxc' assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5) print '====Verifying update on W_recur====' print 'dWrr' assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5) print 'dWrz' assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5) print 'dWrc' assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5) print '====Verifying update on bias====' print 'dbr' assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5) print 'dbz' assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5) print 'dbc' assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5) print 'bprop is verified' return
def check_gru(seq_len, input_size, hidden_size, batch_size, init_func, inp_moms=[0.0, 1.0], add_init_state=False): # init_func is the initializer for the model params # inp_moms is the [ mean, std dev] of the random input input_shape = (input_size, seq_len * batch_size) output_shape = (hidden_size, seq_len * batch_size) slice_shape = (hidden_size, batch_size) NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size # neon GRU gru = GRU(hidden_size, init_func, activation=Tanh(), gate_activation=Logistic()) # generate random input tensor inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0] inp_dev = gru.be.array(inp) # generate random deltas tensor deltas = np.random.randn(*output_shape) # run neon fprop gru.configure((input_size, seq_len)) gru.prev_layer = True gru.allocate() test_buffer = DeltasTree() gru.allocate_deltas(test_buffer) test_buffer.allocate_buffers() gru.set_deltas(test_buffer) if add_init_state: init_state = np.random.rand(*slice_shape) * inp_moms[1] + inp_moms[0] init_state_dev = gru.be.array(init_state) gru.fprop(inp_dev, init_state=init_state_dev) else: gru.fprop(inp_dev) # reference numpy GRU gru_ref = RefGRU(input_size, hidden_size) WGRU = gru_ref.weights # make ref weights and biases the same with neon model r_range = list(range(hidden_size)) z_range = list(range(hidden_size, hidden_size * 2)) c_range = list(range(hidden_size * 2, hidden_size * 3)) WGRU[gru_ref.weights_ind_br][:] = gru.b.get()[r_range] WGRU[gru_ref.weights_ind_bz][:] = gru.b.get()[z_range] WGRU[gru_ref.weights_ind_bc][:] = gru.b.get()[c_range] WGRU[gru_ref.weights_ind_Wxr][:] = gru.W_input.get()[r_range] WGRU[gru_ref.weights_ind_Wxz][:] = gru.W_input.get()[z_range] WGRU[gru_ref.weights_ind_Wxc][:] = gru.W_input.get()[c_range] WGRU[gru_ref.weights_ind_Rhr][:] = gru.W_recur.get()[r_range] WGRU[gru_ref.weights_ind_Rhz][:] = gru.W_recur.get()[z_range] WGRU[gru_ref.weights_ind_Rhc][:] = gru.W_recur.get()[c_range] # transpose input X and do fprop # the reference code expects these shapes: # input_shape: (seq_len, input_size, batch_size) # output_shape: (seq_len, hidden_size, batch_size) inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size).swapaxes(1, 2) deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size).swapaxes(1, 2) if add_init_state: init_state_ref = init_state.copy() (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref, init_state_ref) else: (dWGRU_ref, h_ref_list, dh_ref_list, dr_ref_list, dz_ref_list, dc_ref_list) = gru_ref.lossFun(inp_ref, deltas_ref) neon_logger.display('====Verifying hidden states====') assert allclose_with_out(gru.outputs.get(), h_ref_list, rtol=0.0, atol=1.0e-5) neon_logger.display('fprop is verified') # now test the bprop neon_logger.display('Making sure neon GRU matches numpy GRU in bprop') gru.bprop(gru.be.array(deltas)) # grab the delta W from gradient buffer dWinput_neon = gru.dW_input.get() dWrecur_neon = gru.dW_recur.get() db_neon = gru.db.get() dWxr_neon = dWinput_neon[r_range] dWxz_neon = dWinput_neon[z_range] dWxc_neon = dWinput_neon[c_range] dWrr_neon = dWrecur_neon[r_range] dWrz_neon = dWrecur_neon[z_range] dWrc_neon = dWrecur_neon[c_range] dbr_neon = db_neon[r_range] dbz_neon = db_neon[z_range] dbc_neon = db_neon[c_range] drzc_neon = gru.rzhcan_delta_buffer.get() dr_neon = drzc_neon[r_range] dz_neon = drzc_neon[z_range] dc_neon = drzc_neon[c_range] dWxr_ref = dWGRU_ref[gru_ref.dW_ind_Wxr] dWxz_ref = dWGRU_ref[gru_ref.dW_ind_Wxz] dWxc_ref = dWGRU_ref[gru_ref.dW_ind_Wxc] dWrr_ref = dWGRU_ref[gru_ref.dW_ind_Rhr] dWrz_ref = dWGRU_ref[gru_ref.dW_ind_Rhz] dWrc_ref = dWGRU_ref[gru_ref.dW_ind_Rhc] dbr_ref = dWGRU_ref[gru_ref.dW_ind_br] dbz_ref = dWGRU_ref[gru_ref.dW_ind_bz] dbc_ref = dWGRU_ref[gru_ref.dW_ind_bc] # neon_logger.display '====Verifying hidden deltas ====' neon_logger.display('====Verifying r deltas ====') assert allclose_with_out(dr_neon, dr_ref_list, rtol=0.0, atol=1.0e-5) neon_logger.display('====Verifying z deltas ====') assert allclose_with_out(dz_neon, dz_ref_list, rtol=0.0, atol=1.0e-5) neon_logger.display('====Verifying hcan deltas ====') assert allclose_with_out(dc_neon, dc_ref_list, rtol=0.0, atol=1.0e-5) neon_logger.display('====Verifying update on W_input====') neon_logger.display('dWxr') assert allclose_with_out(dWxr_neon, dWxr_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dWxz') assert allclose_with_out(dWxz_neon, dWxz_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dWxc') assert allclose_with_out(dWxc_neon, dWxc_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('====Verifying update on W_recur====') neon_logger.display('dWrr') assert allclose_with_out(dWrr_neon, dWrr_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dWrz') assert allclose_with_out(dWrz_neon, dWrz_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dWrc') assert allclose_with_out(dWrc_neon, dWrc_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('====Verifying update on bias====') neon_logger.display('dbr') assert allclose_with_out(dbr_neon, dbr_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dbz') assert allclose_with_out(dbz_neon, dbz_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('dbc') assert allclose_with_out(dbc_neon, dbc_ref, rtol=0.0, atol=1.0e-5) neon_logger.display('bprop is verified') return