def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func,
              inp_moms=[0.0, 1.0]):
    # init_func is the initializer for the model params
    # inp_moms is the [mean, std dev] of the random input
    input_shape = (input_size, seq_len * batch_size)
    output_shape = (hidden_size, seq_len * batch_size)
    NervanaObject.be.bsz = NervanaObject.be.batch_size = batch_size

    # ======== create models ========
    # neon RNN
    rnn = Recurrent(hidden_size, init_func, activation=Tanh())

    # reference numpy RNN
    rnn_ref = RefRecurrent(input_size, hidden_size)
    Wxh = rnn_ref.Wxh
    Whh = rnn_ref.Whh
    bh = rnn_ref.bh

    # ========= generate data =========
    # generate random input tensor
    inp = np.random.rand(*input_shape) * inp_moms[1] + inp_moms[0]
    inpa = rnn.be.array(inp)
    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    inp_ref = inp.copy().T.reshape(
        seq_len, batch_size, input_size).swapaxes(1, 2)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)

    # ========= run models =========
    # neon fprop
    rnn.configure((input_size, seq_len))
    rnn.prev_layer = True
    rnn.allocate()

    dtree = DeltasTree()
    rnn.allocate_deltas(dtree)
    dtree.allocate_buffers()
    rnn.set_deltas(dtree)

    rnn.fprop(inpa)

    # weights are only initialized after fprop, so now make the reference
    # weights and biases the same as the neon model's
    Wxh[:] = rnn.W_input.get()
    Whh[:] = rnn.W_recur.get()
    bh[:] = rnn.b.get()

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref)

    # now test the bprop
    rnn.bprop(rnn.be.array(deltas))
    # grab the delta W from the gradient buffers
    dWxh_neon = rnn.dW_input.get()
    dWhh_neon = rnn.dW_recur.get()
    db_neon = rnn.db.get()

    # compare outputs
    neon_logger.display('==== Verifying hidden states ====')
    assert allclose_with_out(rnn.outputs.get(), h_ref_list,
                             rtol=0.0, atol=1.0e-5)
    neon_logger.display('fprop is verified')

    neon_logger.display('==== Verifying update on W and b ====')
    neon_logger.display('dWxh')
    assert allclose_with_out(dWxh_neon, dWxh_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('dWhh')
    assert allclose_with_out(dWhh_neon, dWhh_ref, rtol=0.0, atol=1.0e-5)

    neon_logger.display('==== Verifying update on bias ====')
    neon_logger.display('db')
    assert allclose_with_out(db_neon, db_ref, rtol=0.0, atol=1.0e-5)
    neon_logger.display('bprop is verified')

    return
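# --- example usage (a minimal, hypothetical driver for the check above) ---
# This sketch assumes neon's usual test conventions: the `backend_default`
# pytest fixture from neon's test conftest and the Gaussian initializer from
# neon.initializers. The shapes below are illustrative, not from the
# original test file.
from neon.initializers import Gaussian


def test_rnn(backend_default):
    seq_len = 5
    input_size = 3
    hidden_size = 10
    batch_size = 32
    # fprop/bprop of the neon RNN must match the numpy reference
    check_rnn(seq_len, input_size, hidden_size, batch_size,
              Gaussian(loc=0.0, scale=1.0))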
# rng, delta, rtol, and atol are referenced inside check_rnn below but are
# defined at module level in the original test file; plausible definitions
# (assumed here, with RandomTensorGenerator from ngraph.testing) would be:
rng = RandomTensorGenerator(0, np.float32)  # seeded random tensor generator
delta = 1e-3                                # numeric-derivative step size
rtol = atol = 1e-5                          # gradient-check tolerances


def check_rnn(seq_len, input_size, hidden_size, batch_size, init_func,
              return_seq=True):
    # init_func is the initializer for the model params
    assert batch_size == 1, \
        "the recurrent reference implementation only supports batch size 1"

    # ========== neon model ==========
    Cin = ng.make_axis(input_size)
    REC = ng.make_axis(seq_len, recurrent=True)
    N = ng.make_axis(batch_size, batch=True)
    H = ng.make_axis(hidden_size)
    ax_s = ng.make_axes([H, N])

    ex = ExecutorFactory()
    np.random.seed(0)

    rnn_ng = Recurrent(hidden_size, init_func, activation=Tanh(),
                       reset_cells=True, return_sequence=return_seq)

    inp_ng = ng.placeholder([Cin, REC, N])
    init_state_ng = ng.placeholder(ax_s)

    # fprop graph
    out_ng = rnn_ng.train_outputs(inp_ng, init_state=init_state_ng)
    out_ng.input = True

    rnn_W_input = rnn_ng.W_input
    rnn_W_input.input = True
    rnn_W_recur = rnn_ng.W_recur
    rnn_W_recur.input = True
    rnn_b = rnn_ng.b
    rnn_b.input = True

    fprop_neon_fun = ex.executor(out_ng, inp_ng, init_state_ng)

    dWrecur_s_fun = ex.derivative(out_ng, rnn_W_recur,
                                  inp_ng, rnn_W_input, rnn_b)
    dWrecur_n_fun = ex.numeric_derivative(out_ng, rnn_W_recur, delta,
                                          inp_ng, rnn_W_input, rnn_b)
    dWinput_s_fun = ex.derivative(out_ng, rnn_W_input,
                                  inp_ng, rnn_W_recur, rnn_b)
    dWinput_n_fun = ex.numeric_derivative(out_ng, rnn_W_input, delta,
                                          inp_ng, rnn_W_recur, rnn_b)
    dWb_s_fun = ex.derivative(out_ng, rnn_b, inp_ng, rnn_W_input, rnn_W_recur)
    dWb_n_fun = ex.numeric_derivative(out_ng, rnn_b, delta,
                                      inp_ng, rnn_W_input, rnn_W_recur)

    # fprop on random inputs
    input_value = rng.uniform(-1, 1, inp_ng.axes)
    init_state_value = rng.uniform(-1, 1, init_state_ng.axes)
    fprop_neon = fprop_neon_fun(input_value, init_state_value).copy()

    # after the rnn graph has been executed, we can read out the W values;
    # take copies so shared values don't confuse the derivatives
    Wxh_neon = rnn_ng.W_input.value.get(None).copy()
    Whh_neon = rnn_ng.W_recur.value.get(None).copy()
    bh_neon = rnn_ng.b.value.get(None).copy()

    # bprop derivatives: compare symbolic against numeric for each param
    dWrecur_s = dWrecur_s_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    dWrecur_n = dWrecur_n_fun(Whh_neon, input_value, Wxh_neon, bh_neon)
    np.testing.assert_allclose(dWrecur_s, dWrecur_n, rtol=rtol, atol=atol)

    dWb_s = dWb_s_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    dWb_n = dWb_n_fun(bh_neon, input_value, Wxh_neon, Whh_neon)
    np.testing.assert_allclose(dWb_s, dWb_n, rtol=rtol, atol=atol)

    dWinput_s = dWinput_s_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    dWinput_n = dWinput_n_fun(Wxh_neon, input_value, Whh_neon, bh_neon)
    np.testing.assert_allclose(dWinput_s, dWinput_n, rtol=rtol, atol=atol)

    # ========= reference model ==========
    output_shape = (hidden_size, seq_len * batch_size)

    # generate random deltas tensor
    deltas = np.random.randn(*output_shape)

    # the reference code expects these shapes:
    # input_shape: (seq_len, input_size, batch_size)
    # output_shape: (seq_len, hidden_size, batch_size)
    deltas_ref = deltas.copy().T.reshape(
        seq_len, batch_size, hidden_size).swapaxes(1, 2)
    inp_ref = input_value.transpose([1, 0, 2])

    # reference numpy RNN, seeded with the neon weights
    rnn_ref = RefRecurrent(input_size, hidden_size)
    rnn_ref.Wxh[:] = Wxh_neon
    rnn_ref.Whh[:] = Whh_neon
    rnn_ref.bh[:] = bh_neon.reshape(rnn_ref.bh.shape)

    (dWxh_ref, dWhh_ref, db_ref, h_ref_list,
     dh_ref_list, d_out_ref) = rnn_ref.lossFun(inp_ref, deltas_ref,
                                               init_states=init_state_value)

    # compare fprop outputs
    if return_seq is False:
        h_ref_list = h_ref_list[:, -1].reshape(-1, 1)
    else:
        fprop_neon = fprop_neon[:, :, 0]
    np.testing.assert_allclose(fprop_neon, h_ref_list, rtol=0.0, atol=1.0e-5)

    return
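# --- example usage (a minimal, hypothetical driver for the graph check) ---
# This sketch assumes ngraph's test conventions: the `transformer_factory`
# pytest fixture and the GaussianInit initializer from ngraph.frontends.neon.
# batch_size must be 1 because of the assertion at the top of check_rnn, and
# both output modes are exercised. Names and shapes are illustrative.
from ngraph.frontends.neon import GaussianInit


def test_rnn_ng(transformer_factory):
    seq_len = 3
    input_size = 5
    hidden_size = 10
    batch_size = 1
    # check both the full-sequence and final-hidden-state output paths
    for return_seq in (True, False):
        check_rnn(seq_len, input_size, hidden_size, batch_size,
                  GaussianInit(), return_seq=return_seq)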