def check_stacked_lstm(seq_len, input_size, hidden_size, batch_size,
                       init_func, return_seq=True, backward=False,
                       reset_cells=False, num_iter=2):

    Cin = ng.make_axis(input_size, name='Feature')
    REC = ng.make_axis(seq_len, name='REC')
    N = ng.make_axis(batch_size, name='N')

    with ExecutorFactory() as ex:
        np.random.seed(0)

        inp_ng = ng.placeholder([Cin, REC, N])

        lstm_ng_1 = LSTM(hidden_size, init_func, activation=Tanh(),
                         gate_activation=Logistic(), reset_cells=reset_cells,
                         return_sequence=return_seq, backward=backward)
        lstm_ng_2 = LSTM(hidden_size + 1, init_func, activation=Tanh(),
                         gate_activation=Logistic(), reset_cells=reset_cells,
                         return_sequence=return_seq, backward=backward)

        out_ng_1 = lstm_ng_1(inp_ng)
        out_ng_2 = lstm_ng_2(out_ng_1)

        fprop_neon_fun_2 = ex.executor(out_ng_2, inp_ng)

        gates = ['i', 'f', 'o', 'g']
        Wxh_neon_1_fun = copier_T(
            ex.executor(list(lstm_ng_1.W_input[k] for k in gates)))
        Whh_neon_1_fun = copier_T(
            ex.executor(list(lstm_ng_1.W_recur[k] for k in gates)))
        bh_neon_1_fun = copier(ex.executor(list(lstm_ng_1.b[k] for k in gates)))
        Wxh_neon_2_fun = copier_T(
            ex.executor(list(lstm_ng_2.W_input[k] for k in gates)))
        Whh_neon_2_fun = copier_T(
            ex.executor(list(lstm_ng_2.W_recur[k] for k in gates)))
        bh_neon_2_fun = copier(ex.executor(list(lstm_ng_2.b[k] for k in gates)))

        h_init_fun = ex.executor(lstm_ng_2.h_init)

        # fprop on random inputs for multiple iterations
        fprop_neon_2_list = []
        input_value_list = []
        for i in range(num_iter):
            input_value = rng.uniform(-1, 1, inp_ng.axes)
            fprop_neon_2 = fprop_neon_fun_2(input_value).copy()

            # comparing outputs
            if return_seq is True:
                fprop_neon_2 = fprop_neon_2[:, :, 0]

            input_value_list.append(input_value)
            fprop_neon_2_list.append(fprop_neon_2)

            if reset_cells is False:
                # look at the last hidden states
                h_init_neon = fprop_neon_2[:, -1].reshape(-1, 1)
                h_init_ng = h_init_fun()
                ng.testing.assert_allclose(h_init_neon, h_init_ng,
                                           rtol=rtol, atol=atol)

        # after the rnn graph has been executed, we can get the W values. Get
        # copies so shared values don't confuse derivatives
        # concatenate weights for i, f, o, g together (in this order)
        gates = ['i', 'f', 'o', 'g']
        Wxh_neon_1 = np.concatenate(Wxh_neon_1_fun(), 1)
        Whh_neon_1 = np.concatenate(Whh_neon_1_fun(), 1)
        bh_neon_1 = np.concatenate(bh_neon_1_fun())
        Wxh_neon_2 = np.concatenate(Wxh_neon_2_fun(), 1)
        Whh_neon_2 = np.concatenate(Whh_neon_2_fun(), 1)
        bh_neon_2 = np.concatenate(bh_neon_2_fun())

        # reference numpy LSTM
        lstm_ref_1 = RefLSTM()
        lstm_ref_2 = RefLSTM()
        WLSTM_1 = lstm_ref_1.init(input_size, hidden_size)
        WLSTM_2 = lstm_ref_2.init(hidden_size, hidden_size + 1)

        # make the ref weights and biases the same as the neon model's
        WLSTM_1[0, :] = bh_neon_1
        WLSTM_1[1:input_size + 1, :] = Wxh_neon_1
        WLSTM_1[input_size + 1:] = Whh_neon_1
        WLSTM_2[0, :] = bh_neon_2
        WLSTM_2[1:hidden_size + 1, :] = Wxh_neon_2
        WLSTM_2[hidden_size + 1:] = Whh_neon_2

        # transpose input X and do fprop
        fprop_ref_2_list = []
        c0_1 = h0_1 = None
        c0_2 = h0_2 = None
        for i in range(num_iter):
            input_value = input_value_list[i]
            inp_ref = input_value.copy().transpose([1, 2, 0])
            (Hout_ref_1, cprev_1, hprev_1,
             batch_cache) = lstm_ref_1.forward(inp_ref, WLSTM_1, c0_1, h0_1)
            (Hout_ref_2, cprev_2, hprev_2,
             batch_cache) = lstm_ref_2.forward(Hout_ref_1, WLSTM_2, c0_2, h0_2)
            if reset_cells is False:
                c0_1 = cprev_1
                h0_1 = hprev_1
                c0_2 = cprev_2
                h0_2 = hprev_2

            # the output needs a transpose as well
            Hout_ref_2 = Hout_ref_2.reshape(seq_len * batch_size,
                                            hidden_size + 1).T
            fprop_ref_2_list.append(Hout_ref_2)

        for i in range(num_iter):
            ng.testing.assert_allclose(fprop_neon_2_list[i],
                                       fprop_ref_2_list[i],
                                       rtol=rtol, atol=atol)
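# A minimal, hypothetical driver for the stacked check above. It assumes the
# surrounding module already defines the `rng`, `rtol`, and `atol` globals the
# check relies on, and that an initializer such as GaussianInit from the
# ngraph neon frontend is importable; the sizes below are illustrative only.
def test_stacked_lstm_example():
    check_stacked_lstm(seq_len=5, input_size=3, hidden_size=4, batch_size=2,
                       init_func=GaussianInit(0.0, 0.1),
                       return_seq=True, reset_cells=True, num_iter=2)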
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               return_seq=True, backward=False, reset_cells=False, num_iter=2):

    Cin = ng.make_axis(input_size, name='Feature')
    REC = ng.make_axis(seq_len, name='REC')
    N = ng.make_axis(batch_size, name='N')

    with ExecutorFactory() as ex:
        np.random.seed(0)

        inp_ng = ng.placeholder([Cin, REC, N])

        lstm_ng = LSTM(hidden_size, init_func, activation=Tanh(),
                       gate_activation=Logistic(), reset_cells=reset_cells,
                       return_sequence=return_seq, backward=backward)

        out_ng = lstm_ng(inp_ng)

        fprop_neon_fun = copier(ex.executor((out_ng, lstm_ng.h_init), inp_ng))

        gates = ['i', 'f', 'o', 'g']
        Wxh_neon_fun = copier_T(
            ex.executor(list(lstm_ng.W_input[k] for k in gates)))
        Whh_neon_fun = copier_T(
            ex.executor(list(lstm_ng.W_recur[k] for k in gates)))
        bh_neon_fun = copier(ex.executor(list(lstm_ng.b[k] for k in gates)))

        fprop_neon_list = []
        input_value_list = []
        for i in range(num_iter):
            # fprop on random inputs
            input_value = rng.uniform(-1, 1, inp_ng.axes)
            fprop_neon, h_init_neon = fprop_neon_fun(input_value)

            if return_seq is True:
                fprop_neon = fprop_neon[:, :, 0]

            input_value_list.append(input_value)
            fprop_neon_list.append(fprop_neon)

            if reset_cells is False:
                # look at the last hidden states
                ng.testing.assert_allclose(fprop_neon[:, -1].reshape(-1, 1),
                                           h_init_neon,
                                           rtol=rtol, atol=atol)

        # after the rnn graph has been executed, we can get the W values. Get
        # copies so shared values don't confuse derivatives
        # concatenate weights for i, f, o, g together (in this order)
        Wxh_neon = Wxh_neon_fun()
        Whh_neon = Whh_neon_fun()
        bh_neon = bh_neon_fun()

        # reference numpy LSTM
        lstm_ref = RefLSTM()
        WLSTM = lstm_ref.init(input_size, hidden_size)

        # make the ref weights and biases the same as the neon model's
        WLSTM[0, :] = np.concatenate(bh_neon)
        WLSTM[1:input_size + 1, :] = np.concatenate(Wxh_neon, 1)
        WLSTM[input_size + 1:] = np.concatenate(Whh_neon, 1)

        # transpose input X and do fprop
        fprop_ref_list = []
        c0 = h0 = None
        for i in range(num_iter):
            input_value = input_value_list[i]
            inp_ref = input_value.copy().transpose([1, 2, 0])
            (Hout_ref, cprev, hprev,
             batch_cache) = lstm_ref.forward(inp_ref, WLSTM, c0, h0)
            if reset_cells is False:
                c0 = cprev
                h0 = hprev

            # the output needs a transpose as well
            Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
            fprop_ref_list.append(Hout_ref)

        for i in range(num_iter):
            ng.testing.assert_allclose(fprop_neon_list[i],
                                       fprop_ref_list[i],
                                       rtol=rtol, atol=atol)
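# Hypothetical usage sketch for check_lstm above, sweeping the flags it
# exposes. The pytest import and the GaussianInit initializer are assumptions
# about the surrounding test module; the concrete sizes are illustrative only.
import pytest


@pytest.mark.parametrize("reset_cells", [True, False])
@pytest.mark.parametrize("return_seq", [True, False])
def test_lstm_fprop_example(return_seq, reset_cells):
    check_lstm(seq_len=6, input_size=3, hidden_size=4, batch_size=2,
               init_func=GaussianInit(0.0, 0.1),
               return_seq=return_seq, reset_cells=reset_cells, num_iter=2)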
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # hack to force allocating a delta buffer
    lstm.allocate()
    lstm.set_deltas([lstm.be.iobuf(lstm.in_shape)])
    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make the ref weights and biases the same as the neon model's
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs a transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare fprop results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(), Hout_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate a random deltas tensor
    deltas = np.random.randn(*hidden_shape)
    lstm.bprop(lstm.be.array(deltas))

    # grab the delta W from the gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare bprop results
    neon_logger.display('Making sure neon LSTM matches numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return
def gradient_check_ref(seq_len, input_size, hidden_size, batch_size,
                       epsilon=1.0e-5, dtypeu=np.float64, threshold=1e-4):
    # this is a check of the reference code itself: it estimates the gradients
    # by adding perturbations to the input and compares them to the values
    # calculated in bprop

    # generate a sparse random input matrix
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size
    input_shape = (seq_len, input_size, batch_size)
    # hidden_shape = (seq_len, hidden_size, batch_size)
    (inp_bl, nz_inds) = sparse_rand(input_shape,
                                    frac=1.0 / float(input_shape[1]))
    inp_bl = np.random.randn(*input_shape)

    # convert the input matrix from neon to ref code format
    inp_bl = inp_bl.swapaxes(1, 2).astype(dtypeu)

    # generate the reference LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size).astype(dtypeu)

    # init parameters as done for neon
    WLSTM = np.random.randn(*WLSTM.shape)

    (Hout, cprev, hprev, cache) = lstm_ref.forward(inp_bl, WLSTM)

    # scale Hout by a random matrix
    rand_scale = np.random.random(Hout.shape) * 2.0 - 1.0
    rand_scale = dtypeu(rand_scale)

    # the line below would be the loss function
    # loss_bl = np.sum(rand_scale * Hout)

    # run bprop; the input deltas are rand_scale
    (dX_bl, dWLSTM_bl, dc0, dh0) = lstm_ref.backward(rand_scale, cache)

    grads_est = np.zeros(dX_bl.shape)
    inp_pert = inp_bl.copy()
    for pert_ind in range(inp_bl.size):
        save_val = inp_pert.flat[pert_ind]

        # add/subtract perturbations to the input
        inp_pert.flat[pert_ind] = save_val + epsilon
        # and run fprop on the perturbed input
        (Hout_pos, cprev, hprev, cache) = lstm_ref.forward(inp_pert, WLSTM)

        inp_pert.flat[pert_ind] = save_val - epsilon
        (Hout_neg, cprev, hprev, cache) = lstm_ref.forward(inp_pert, WLSTM)

        # calculate the loss on the outputs
        loss_pos = np.sum(rand_scale * Hout_pos)
        loss_neg = np.sum(rand_scale * Hout_neg)
        grads_est.flat[pert_ind] = 0.5 / float(epsilon) * (loss_pos - loss_neg)

        # reset the input
        inp_pert.flat[pert_ind] = save_val

    # assert that the gradient estimates are within the relative threshold of
    # the deltas calculated in bprop
    assert allclose_with_out(grads_est, dX_bl, rtol=threshold, atol=0.0)
    return
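# Hypothetical invocation of the reference-code gradient check. The small
# sizes keep the finite-difference loop cheap; they are illustrative only and
# assume a neon backend has already been generated so NervanaObject.be is set.
def run_ref_gradient_check_example():
    gradient_check_ref(seq_len=3, input_size=5, hidden_size=10, batch_size=1)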
def check_lstm(seq_len, input_size, hidden_size, batch_size, init_func,
               inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    hidden_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # neon LSTM
    lstm = LSTM(hidden_size,
                init_func,
                activation=Tanh(),
                gate_activation=Logistic())

    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = lstm.be.array(inp)

    # run neon fprop
    lstm.configure((input_size, seq_len))
    lstm.prev_layer = True  # hack to force allocating a delta buffer
    lstm.allocate()

    dtree = DeltasTree()
    lstm.allocate_deltas(dtree)
    dtree.allocate_buffers()
    lstm.set_deltas(dtree)

    lstm.fprop(inpa)

    # reference numpy LSTM
    lstm_ref = RefLSTM()
    WLSTM = lstm_ref.init(input_size, hidden_size)

    # make the ref weights and biases the same as the neon model's
    WLSTM[0, :] = lstm.b.get().T
    WLSTM[1:input_size + 1, :] = lstm.W_input.get().T
    WLSTM[input_size + 1:] = lstm.W_recur.get().T

    # transpose input X and do fprop
    inp_ref = inp.copy().T.reshape(seq_len, batch_size, input_size)
    (Hout_ref, cprev, hprev, batch_cache) = lstm_ref.forward(inp_ref, WLSTM)

    # the output needs a transpose as well
    Hout_ref = Hout_ref.reshape(seq_len * batch_size, hidden_size).T
    IFOGf_ref = batch_cache['IFOGf'].reshape(seq_len * batch_size,
                                             hidden_size * 4).T
    Ct_ref = batch_cache['Ct'].reshape(seq_len * batch_size, hidden_size).T

    # compare fprop results
    neon_logger.display('====Verifying IFOG====')
    assert allclose_with_out(lstm.ifog_buffer.get(), IFOGf_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying cell states====')
    assert allclose_with_out(lstm.c_act_buffer.get(), Ct_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying hidden states====')
    assert allclose_with_out(lstm.outputs.get(), Hout_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('fprop is verified')

    # now test the bprop
    # generate a random deltas tensor
    deltas = np.random.randn(*hidden_shape)
    lstm.bprop(lstm.be.array(deltas))

    # grab the delta W from the gradient buffer
    dWinput_neon = lstm.dW_input.get()
    dWrecur_neon = lstm.dW_recur.get()
    db_neon = lstm.db.get()

    deltas_ref = deltas.copy().T.reshape(seq_len, batch_size, hidden_size)
    (dX_ref, dWLSTM_ref, dc0_ref, dh0_ref) = lstm_ref.backward(deltas_ref,
                                                               batch_cache)
    dWrecur_ref = dWLSTM_ref[-hidden_size:, :]
    dWinput_ref = dWLSTM_ref[1:input_size + 1, :]
    db_ref = dWLSTM_ref[0, :]
    dX_ref = dX_ref.reshape(seq_len * batch_size, input_size).T

    # compare bprop results
    neon_logger.display('Making sure neon LSTM matches numpy LSTM in bprop')
    neon_logger.display('====Verifying update on W_recur====')
    assert allclose_with_out(dWrecur_neon, dWrecur_ref.T,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying update on W_input====')
    assert allclose_with_out(dWinput_neon, dWinput_ref.T,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying update on bias====')
    assert allclose_with_out(db_neon.flatten(), db_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('====Verifying output delta====')
    assert allclose_with_out(lstm.out_deltas_buffer.get(), dX_ref,
                             rtol=0.0, atol=1.5e-5)

    neon_logger.display('bprop is verified')

    return
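# Hypothetical driver for the neon-side check_lstm above. It assumes a neon
# backend can be generated with neon.backends.gen_backend and that the
# Gaussian initializer from neon.initializers is available; the sizes and
# input moments below are illustrative only.
def run_neon_lstm_check_example():
    from neon.backends import gen_backend
    from neon.initializers import Gaussian

    gen_backend(backend='cpu', batch_size=4)
    check_lstm(seq_len=5, input_size=3, hidden_size=4, batch_size=4,
               init_func=Gaussian(scale=0.1), inp_moms=[0.0, 1.0])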