def forward(self, input, kernel, recurrent_kernel):
    """
    # Arguments
        inputs: [input numpy array with shape (batch, in_features),
                 state numpy array with shape (batch, units)]

    # Returns
        outputs: numpy array with shape (batch, units)
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    #####################################################################################
    # code here
    # reset gate
    x_r = sigmoid(np.dot(prev_h, recurrent_kernel_r) + np.dot(x, kernel_r))
    # update gate
    x_z = sigmoid(np.dot(prev_h, recurrent_kernel_z) + np.dot(x, kernel_z))
    # new gate
    x_h = np.tanh(np.dot(x_r * prev_h, recurrent_kernel_h) + np.dot(x, kernel_h))
    #####################################################################################
    output = (1 - x_z) * x_h + x_z * prev_h
    return output
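# The cells in this file call sigmoid() and np without defining them. Below is a minimal
# sketch of the assumed helpers, plus a shape check for the GRU forward pass above.
# `gru_cell` and the 3/8/16 sizes are illustrative assumptions, not part of the original code.
import numpy as np

def sigmoid(x):
    # plain logistic function, as the gate computations above and below assume
    return 1.0 / (1.0 + np.exp(-x))

def gru_forward_shape_check(gru_cell, batch=3, in_features=8, units=16):
    # gru_cell: hypothetical instance exposing the forward() shown above
    x = np.random.randn(batch, in_features)
    prev_h = np.random.randn(batch, units)
    kernel = np.random.randn(in_features, 3 * units)      # stacked [W_z | W_r | W_h]
    recurrent_kernel = np.random.randn(units, 3 * units)  # stacked [U_z | U_r | U_h]
    h_next = gru_cell.forward([x, prev_h], kernel, recurrent_kernel)
    assert h_next.shape == (batch, units)
    return h_next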
def forward(self, input, kernel, recurrent_kernel):
    """
    # Arguments
        inputs: [input numpy array with shape (batch, in_features),
                 cell state numpy array with shape (batch, units),
                 hidden state numpy array with shape (batch, units)]

    # Returns
        outputs: [new hidden state numpy array with shape (batch, units),
                  new cell state numpy array with shape (batch, units)]

    Note: we assume no bias term in the LSTM.
    """
    x, prev_c, prev_h = input  # prev_c: previous cell state; prev_h: previous hidden state
    _, all_units = kernel.shape
    units = all_units // 4
    kernel_i, kernel_f, kernel_c, kernel_o = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:3*units], kernel[:, 3*units:all_units]
    recurrent_kernel_i = recurrent_kernel[:, :units]             # recurrent weight of input gate
    recurrent_kernel_f = recurrent_kernel[:, units:2*units]      # recurrent weight of forget gate
    recurrent_kernel_c = recurrent_kernel[:, 2*units:3*units]    # recurrent weight of cell gate
    recurrent_kernel_o = recurrent_kernel[:, 3*units:all_units]  # recurrent weight of output gate
    #################### To do ####################
    f = sigmoid(np.matmul(x, kernel_f) + np.matmul(prev_h, recurrent_kernel_f))
    i = sigmoid(np.matmul(x, kernel_i) + np.matmul(prev_h, recurrent_kernel_i))
    o = sigmoid(np.matmul(x, kernel_o) + np.matmul(prev_h, recurrent_kernel_o))
    cell = np.multiply(f, prev_c) + np.multiply(i, np.tanh(np.matmul(x, kernel_c) + np.matmul(prev_h, recurrent_kernel_c)))
    hidden = np.multiply(o, np.tanh(cell))
    ###############################################
    return hidden, cell
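# Shape sketch for the LSTM forward pass above: kernels are stacked along the last axis
# as [W_i | W_f | W_c | W_o], so they carry 4 * units columns. `lstm_cell` and the sizes
# below are illustrative assumptions, not part of the original code.
def lstm_forward_shape_check(lstm_cell, batch=3, in_features=8, units=16):
    x = np.random.randn(batch, in_features)
    prev_c = np.random.randn(batch, units)
    prev_h = np.random.randn(batch, units)
    kernel = np.random.randn(in_features, 4 * units)          # stacked [W_i | W_f | W_c | W_o]
    recurrent_kernel = np.random.randn(units, 4 * units)      # stacked [U_i | U_f | U_c | U_o]
    hidden, cell_state = lstm_cell.forward([x, prev_c, prev_h], kernel, recurrent_kernel)
    assert hidden.shape == (batch, units) and cell_state.shape == (batch, units)
    return hidden, cell_state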
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: [gradient to the output hidden state, gradient to the output cell state]
        input: [input numpy array with shape (batch, in_features),
                cell state numpy array with shape (batch, units),
                hidden state numpy array with shape (batch, units)]

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to cell state numpy array with shape (batch, units),
                  gradients to hidden state numpy array with shape (batch, units)]
    """
    x, prev_c, prev_h = input  # prev_c: previous cell state; prev_h: previous hidden state
    _, all_units = kernel.shape
    units = all_units // 4
    kernel_i, kernel_f, kernel_c, kernel_o = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:3*units], kernel[:, 3*units:all_units]
    recurrent_kernel_i = recurrent_kernel[:, :units]
    recurrent_kernel_f = recurrent_kernel[:, units:2*units]
    recurrent_kernel_c = recurrent_kernel[:, 2*units:3*units]
    recurrent_kernel_o = recurrent_kernel[:, 3*units:all_units]
    h_grad, c_grad = out_grad

    # Recompute the forward-pass gate activations and states.
    x_f = sigmoid(x.dot(kernel_f) + prev_h.dot(recurrent_kernel_f))
    x_i = sigmoid(x.dot(kernel_i) + prev_h.dot(recurrent_kernel_i))
    x_o = sigmoid(x.dot(kernel_o) + prev_h.dot(recurrent_kernel_o))
    x_c = np.tanh(x.dot(kernel_c) + prev_h.dot(recurrent_kernel_c))
    c = x_i * x_c + x_f * prev_c
    h = x_o * np.tanh(c)

    # Total gradient reaching the new cell state: the upstream cell gradient plus the
    # hidden-state path h = o * tanh(c).
    dc_total = c_grad + h_grad * x_o * (1 - np.tanh(c) * np.tanh(c))
    do = h_grad * np.tanh(c)
    df = dc_total * prev_c
    dc = dc_total * x_i   # gradient to the candidate cell value x_c
    di = dc_total * x_c
    # Back through the gate nonlinearities.
    dAc = dc * (1 - x_c * x_c)
    dAi = di * x_i * (1 - x_i)
    dAf = df * x_f * (1 - x_f)
    dAo = do * x_o * (1 - x_o)

    x_grad = dAc.dot(kernel_c.T) + dAi.dot(kernel_i.T) + dAf.dot(kernel_f.T) + dAo.dot(kernel_o.T)
    kernel_c_grad = x.T.dot(dAc)
    kernel_i_grad = x.T.dot(dAi)
    kernel_f_grad = x.T.dot(dAf)
    kernel_o_grad = x.T.dot(dAo)
    prev_h_grad = dAc.dot(recurrent_kernel_c.T) + dAi.dot(recurrent_kernel_i.T) + dAf.dot(recurrent_kernel_f.T) + dAo.dot(recurrent_kernel_o.T)
    recurrent_kernel_c_grad = prev_h.T.dot(dAc)
    recurrent_kernel_i_grad = prev_h.T.dot(dAi)
    recurrent_kernel_f_grad = prev_h.T.dot(dAf)
    recurrent_kernel_o_grad = prev_h.T.dot(dAo)
    prev_c_grad = dc_total * x_f

    in_grad = [x_grad, prev_c_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_i_grad, kernel_f_grad, kernel_c_grad, kernel_o_grad], axis=-1)
    recurrent_kernel_grad = np.concatenate([recurrent_kernel_i_grad, recurrent_kernel_f_grad, recurrent_kernel_c_grad, recurrent_kernel_o_grad], axis=-1)
    return in_grad, kernel_grad, recurrent_kernel_grad
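# A finite-difference check of the analytic kernel gradient from the LSTM backward above:
# a minimal sketch assuming `lstm_cell` is a hypothetical instance exposing the forward/backward
# pair shown here. With both upstream gradients set to ones, kernel_grad should match the
# numerical gradient of sum(hidden) + sum(cell).
def lstm_grad_check(lstm_cell, eps=1e-6):
    batch, in_features, units = 2, 5, 4
    rng = np.random.RandomState(0)
    x = rng.randn(batch, in_features)
    prev_c = rng.randn(batch, units)
    prev_h = rng.randn(batch, units)
    kernel = rng.randn(in_features, 4 * units)
    r_kernel = rng.randn(units, 4 * units)

    ones = np.ones((batch, units))
    _, kernel_grad, _ = lstm_cell.backward([ones, ones], [x, prev_c, prev_h], kernel, r_kernel)

    num_grad = np.zeros_like(kernel)
    for idx in np.ndindex(*kernel.shape):
        k_plus, k_minus = kernel.copy(), kernel.copy()
        k_plus[idx] += eps
        k_minus[idx] -= eps
        h_p, c_p = lstm_cell.forward([x, prev_c, prev_h], k_plus, r_kernel)
        h_m, c_m = lstm_cell.forward([x, prev_c, prev_h], k_minus, r_kernel)
        num_grad[idx] = ((h_p.sum() + c_p.sum()) - (h_m.sum() + c_m.sum())) / (2 * eps)
    return np.max(np.abs(num_grad - kernel_grad))  # should be ~1e-8 or smaller if the backward is correct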
def forward(self, input, kernel, recurrent_kernel):
    """
    # Arguments
        inputs: [input numpy array with shape (batch, in_features),
                 state numpy array with shape (batch, units)]

    # Returns
        outputs: numpy array with shape (batch, units)
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z = kernel[:, :units]
    kernel_r = kernel[:, units:2*units]
    kernel_h = kernel[:, 2*units:]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:]
    # code here
    #####################################################################################
    # Initialize
    x_z = None
    x_r = None
    x_h = None
    # Compute the reset, update and new gates (matrix1 + matrix2)
    x_z = sigmoid(np.matmul(x, kernel_z) + np.matmul(prev_h, recurrent_kernel_z))
    x_r = sigmoid(np.matmul(x, kernel_r) + np.matmul(prev_h, recurrent_kernel_r))
    x_h = np.tanh(np.matmul(x, kernel_h) + np.matmul(x_r * prev_h, recurrent_kernel_h))
    #####################################################################################
    output = (1 - x_z) * x_h + x_z * prev_h
    return output
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    #####################################################################################
    # code here
    # reset gate
    x_r = sigmoid(np.dot(np.nan_to_num(x), kernel_r) + np.dot(prev_h, recurrent_kernel_r))
    # update gate
    x_z = sigmoid(np.dot(np.nan_to_num(x), kernel_z) + np.dot(prev_h, recurrent_kernel_z))
    # new gate
    x_h = np.tanh(np.dot(np.nan_to_num(x), kernel_h) + np.dot(np.multiply(x_r, prev_h), recurrent_kernel_h))

    d_h = out_grad * (1 - x_z) * (1 - np.square(x_h))
    d_z = out_grad * (prev_h - x_h) * (x_z * (1 - x_z))
    d_r = np.dot(d_h, recurrent_kernel_h.T) * prev_h * (x_r * (1 - x_r))

    x_grad = np.nan_to_num(np.dot(d_z, kernel_z.T) + np.dot(d_r, kernel_r.T) + np.dot(d_h, kernel_h.T))
    prev_h_grad = np.nan_to_num(out_grad * x_z + np.dot(d_h, recurrent_kernel_h.T) * x_r + np.dot(d_z, recurrent_kernel_z.T) + np.dot(d_r, recurrent_kernel_r.T))
    kernel_r_grad = np.nan_to_num(np.dot(x.T, d_r))
    kernel_z_grad = np.nan_to_num(np.dot(x.T, d_z))
    kernel_h_grad = np.nan_to_num(np.dot(x.T, d_h))
    recurrent_kernel_r_grad = np.nan_to_num(np.dot(prev_h.T, d_r))
    recurrent_kernel_z_grad = np.nan_to_num(np.dot(prev_h.T, d_z))
    recurrent_kernel_h_grad = np.nan_to_num(np.dot(np.multiply(x_r, prev_h).T, d_h))
    #####################################################################################
    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    #####################################################################################
    # code here
    # https://towardsdatascience.com/forward-and-backpropagation-in-grus-derived-deep-learning-5764f374f3f5
    zt = sigmoid(np.dot(prev_h, recurrent_kernel_z) + np.dot(x, kernel_z))
    rt = sigmoid(np.dot(prev_h, recurrent_kernel_r) + np.dot(x, kernel_r))
    h_hat = np.tanh(np.dot(rt * prev_h, recurrent_kernel_h) + np.dot(x, kernel_h))

    d0 = out_grad
    d1 = d0 * zt
    d2 = d0 * prev_h
    d3 = d0 * h_hat
    d4 = -1 * d3
    d5 = d2 + d4
    d6 = d0 * (1 - zt)
    d7 = d5 * (zt * (1 - zt))
    d8 = d6 * (1 - h_hat**2)
    d9 = np.dot(d8, kernel_h.T)
    d10 = np.dot(d8, recurrent_kernel_h.T)
    d11 = np.dot(d7, kernel_z.T)
    d12 = np.dot(d7, recurrent_kernel_z.T)
    d14 = d10 * rt
    d15 = d10 * prev_h
    d16 = d15 * (rt * (1 - rt))
    d13 = np.dot(d16, kernel_r.T)
    d17 = np.dot(d16, recurrent_kernel_r.T)

    x_grad = np.nan_to_num(d9 + d11 + d13)
    prev_h_grad = np.nan_to_num(d12 + d14 + d1 + d17)
    kernel_r_grad = np.nan_to_num(np.dot(x.T, d16))
    kernel_z_grad = np.nan_to_num(np.dot(x.T, d7))
    kernel_h_grad = np.nan_to_num(np.dot(x.T, d8))
    recurrent_kernel_r_grad = np.nan_to_num(np.dot(prev_h.T, d16))
    recurrent_kernel_z_grad = np.nan_to_num(np.dot(prev_h.T, d7))
    recurrent_kernel_h_grad = np.nan_to_num(np.dot((rt * prev_h).T, d8))
    #####################################################################################
    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    # reset gate
    x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
    # update gate
    x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
    # new gate
    x_h = np.tanh(x.dot(kernel_h) + (x_r * prev_h).dot(recurrent_kernel_h))

    x_h_tanh_grad = np.nan_to_num((1 - x_z) * out_grad * (1 - np.square(x_h)))
    x_r_sig_grad = np.matmul(x_h_tanh_grad, np.transpose(recurrent_kernel_h)) * x_r * (1 - x_r) * prev_h
    x_z_sig_grad = out_grad * x_z * (1 - x_z) * (prev_h - x_h)

    x_grad = np.matmul(x_h_tanh_grad, np.transpose(kernel_h)) + np.matmul(x_r_sig_grad, np.transpose(kernel_r)) + np.matmul(x_z_sig_grad, kernel_z.T)
    prev_h_grad = np.matmul(x_h_tanh_grad, recurrent_kernel_h.T) * x_r + x_z * out_grad + np.matmul(x_r_sig_grad, recurrent_kernel_r.T) + np.matmul(x_z_sig_grad, recurrent_kernel_z.T)
    kernel_r_grad = np.matmul(x.T, x_r_sig_grad)
    kernel_z_grad = np.matmul(x.T, x_z_sig_grad)
    kernel_h_grad = np.matmul(x.T, x_h_tanh_grad)
    recurrent_kernel_r_grad = np.matmul(prev_h.T, x_r_sig_grad)
    recurrent_kernel_z_grad = np.matmul(prev_h.T, x_z_sig_grad)
    recurrent_kernel_h_grad = np.matmul(np.multiply(x_r, prev_h).T, x_h_tanh_grad)

    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    x = np.nan_to_num(x)
    prev_h = np.nan_to_num(prev_h)
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    #####################################################################################
    # code here
    # reset gate: x (batch, features), kernel_r (features, units), prev_h (batch, units),
    # recurrent_kernel_r (units, units) => x_r (batch, units)
    x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
    # update gate => x_z (batch, units)
    x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
    # new gate => x_h (batch, units)
    x_h = np.tanh(x.dot(kernel_h) + (x_r * prev_h).dot(recurrent_kernel_h))

    x_z_grad = out_grad * (-x_h + prev_h)                        # (batch, units)
    x_h_grad = out_grad * (1 - x_z)                              # (batch, units)
    tanh_h_grad = x_h_grad * (1 - np.square(x_h))                # (batch, units)
    x_r_grad = tanh_h_grad.dot(recurrent_kernel_h.T) * prev_h    # (batch, units)
    sig_r_grad = x_r_grad * x_r * (1 - x_r)                      # (batch, units)
    sig_z_grad = x_z_grad * x_z * (1 - x_z)                      # (batch, units)

    x_grad = sig_r_grad.dot(kernel_r.T) + sig_z_grad.dot(kernel_z.T) + tanh_h_grad.dot(kernel_h.T)  # (batch, features)
    prev_h_grad = sig_r_grad.dot(recurrent_kernel_r.T) + sig_z_grad.dot(recurrent_kernel_z.T) + (tanh_h_grad.dot(recurrent_kernel_h.T) * x_r) + out_grad * x_z  # (batch, units)
    kernel_r_grad = ((sig_r_grad.T).dot(x)).T                    # (features, units)
    kernel_z_grad = ((sig_z_grad.T).dot(x)).T                    # (features, units)
    kernel_h_grad = ((tanh_h_grad.T).dot(x)).T                   # (features, units)
    recurrent_kernel_r_grad = ((sig_r_grad.T).dot(prev_h)).T     # (units, units)
    recurrent_kernel_z_grad = ((sig_z_grad.T).dot(prev_h)).T     # (units, units)
    recurrent_kernel_h_grad = ((tanh_h_grad.T).dot(x_r * prev_h)).T  # x_h_grad * (1 - np.square(x_h)) projected through x_r * prev_h
    #####################################################################################
    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z = kernel[:, :units]
    kernel_r = kernel[:, units:2*units]
    kernel_h = kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    # code here
    #####################################################################################
    # Initialize
    x_grad, prev_h_grad = np.zeros_like(x), np.zeros_like(prev_h)
    kernel_z_grad, recurrent_kernel_z_grad = np.zeros_like(kernel_z), np.zeros_like(recurrent_kernel_z)
    kernel_r_grad, recurrent_kernel_r_grad = np.zeros_like(kernel_r), np.zeros_like(recurrent_kernel_r)
    kernel_h_grad, recurrent_kernel_h_grad = np.zeros_like(kernel_h), np.zeros_like(recurrent_kernel_h)

    # Recompute the gate activations from the forward pass
    x_z = sigmoid(np.matmul(x, kernel_z) + np.matmul(prev_h, recurrent_kernel_z))
    x_r = sigmoid(np.matmul(x, kernel_r) + np.matmul(prev_h, recurrent_kernel_r))
    x_h = np.tanh(np.matmul(x, kernel_h) + np.matmul(prev_h * x_r, recurrent_kernel_h))

    # Compute for the new gate
    tmp_h = out_grad * (1 - x_h**2) * (1 - x_z)
    matrix1_h = np.matmul(tmp_h, np.transpose(kernel_h))
    matrix2_h = np.matmul(tmp_h, np.transpose(recurrent_kernel_h))
    # Compute for the update gate
    tmp_z = out_grad * (prev_h - x_h) * x_z * (1 - x_z)
    matrix1_z = np.matmul(tmp_z, np.transpose(kernel_z))
    matrix2_z = np.matmul(tmp_z, np.transpose(recurrent_kernel_z))
    # Compute for the reset gate
    tmp_r = matrix2_h * prev_h * (x_r * (1 - x_r))
    matrix1_r = np.matmul(tmp_r, np.transpose(kernel_r))
    matrix2_r = np.matmul(tmp_r, np.transpose(recurrent_kernel_r))

    # Gradient of the input and previous hidden state
    x_grad = matrix1_z + matrix1_r + matrix1_h
    prev_h_grad = matrix2_z + matrix2_r + matrix2_h * x_r + out_grad * x_z
    # Gradient of the kernel
    kernel_r_grad = np.matmul(np.transpose(x), tmp_r)
    kernel_z_grad = np.matmul(np.transpose(x), tmp_z)
    kernel_h_grad = np.matmul(np.transpose(x), tmp_h)
    # Gradient of the recurrent kernel
    recurrent_kernel_r_grad = np.matmul(np.transpose(prev_h), tmp_r)
    recurrent_kernel_z_grad = np.matmul(np.transpose(prev_h), tmp_z)
    recurrent_kernel_h_grad = np.matmul(np.transpose(prev_h * x_r), tmp_h)
    #####################################################################################
    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    recurrent_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, recurrent_kernel_grad
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    # reset gate
    x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
    # update gate
    x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
    # new gate
    x_h = np.tanh(x.dot(kernel_h) + np.dot(x_r * prev_h, recurrent_kernel_h))
    output = (1 - x_z) * x_h + x_z * prev_h

    sig_grad_xr = x_r * (1 - x_r)
    sig_grad_xz = x_z * (1 - x_z)
    tanh_grad_xh = 1 - np.square(x_h)

    x_grad = (out_grad * (prev_h - x_h) * sig_grad_xz).dot(kernel_z.T) + \
             (out_grad * (1 - x_z) * tanh_grad_xh).dot(kernel_h.T) + \
             ((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr).dot(kernel_r.T)
    prev_h_grad = out_grad * x_z + (out_grad * (prev_h - x_h) * sig_grad_xz).dot(recurrent_kernel_z.T) + \
                  (out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * x_r + \
                  ((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr).dot(recurrent_kernel_r.T)
    kernel_r_grad = x.T.dot((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr)
    kernel_z_grad = x.T.dot(out_grad * (prev_h - x_h) * sig_grad_xz)
    kernel_h_grad = x.T.dot(out_grad * (1 - x_z) * tanh_grad_xh)
    recurrent_kernel_r_grad = prev_h.T.dot((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr)
    recurrent_kernel_z_grad = prev_h.T.dot(out_grad * (prev_h - x_h) * sig_grad_xz)
    recurrent_kernel_h_grad = (x_r * prev_h).T.dot(out_grad * (1 - x_z) * tanh_grad_xh)

    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
def sigmoid(self):
    return F.sigmoid(self)
def backward(self, out_grad, input, kernel, recurrent_kernel):
    """
    # Arguments
        out_grad: numpy array with shape (batch, units), gradients to the outputs
        input: [input numpy array with shape (batch, in_features),
                state numpy array with shape (batch, units)], same as the forward inputs

    # Returns
        in_grad: [gradients to input numpy array with shape (batch, in_features),
                  gradients to state numpy array with shape (batch, units)]
    """
    x, prev_h = input
    _, all_units = kernel.shape
    units = all_units // 3
    kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:all_units]
    recurrent_kernel_z = recurrent_kernel[:, :units]
    recurrent_kernel_r = recurrent_kernel[:, units:2*units]
    recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]
    #####################################################################################
    # code here
    x_z = sigmoid(prev_h.dot(recurrent_kernel_z) + x.dot(kernel_z))
    x_r = sigmoid(prev_h.dot(recurrent_kernel_r) + x.dot(kernel_r))
    prev_h_in = x_r * prev_h
    x_h = np.tanh(prev_h_in.dot(recurrent_kernel_h) + x.dot(kernel_h))

    d1 = x_z * out_grad
    d2 = prev_h * out_grad
    d3 = x_h * out_grad
    d4 = -1 * d3
    d5 = d2 + d4
    d6 = (1 - x_z) * out_grad
    d7 = d5 * (x_z * (1 - x_z))
    d8 = d6 * (1 - np.square(x_h))
    d9 = d8.dot(kernel_h.T)
    d10 = d8.dot(recurrent_kernel_h.T)
    d11 = d7.dot(kernel_z.T)
    d12 = d7.dot(recurrent_kernel_z.T)
    d14 = d10 * x_r
    d15 = d10 * prev_h
    d16 = d15 * (x_r * (1 - x_r))
    d13 = d16.dot(kernel_r.T)
    d17 = d16.dot(recurrent_kernel_r.T)

    x_grad = d9 + d11 + d13
    prev_h_grad = d12 + d14 + d1 + d17
    kernel_r_grad = np.dot(x.T, d16)
    kernel_z_grad = np.dot(x.T, d7)
    kernel_h_grad = np.dot(x.T, d8)
    recurrent_kernel_r_grad = np.dot(prev_h.T, d16)
    recurrent_kernel_z_grad = np.dot(prev_h.T, d7)
    recurrent_kernel_h_grad = np.dot((prev_h * x_r).T, d8)
    #####################################################################################
    in_grad = [x_grad, prev_h_grad]
    kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
    r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)
    return in_grad, kernel_grad, r_kernel_grad
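# The same kind of finite-difference check for the GRU backward passes above; `gru_cell` is a
# hypothetical instance exposing the forward/backward pair shown in this file. With out_grad
# set to ones, kernel_grad should match the numerical gradient of sum(forward(...)).
def gru_grad_check(gru_cell, eps=1e-6):
    batch, in_features, units = 2, 5, 4
    rng = np.random.RandomState(1)
    x = rng.randn(batch, in_features)
    prev_h = rng.randn(batch, units)
    kernel = rng.randn(in_features, 3 * units)
    r_kernel = rng.randn(units, 3 * units)

    out_grad = np.ones((batch, units))  # gradient of loss = sum(output)
    _, kernel_grad, _ = gru_cell.backward(out_grad, [x, prev_h], kernel, r_kernel)

    num_grad = np.zeros_like(kernel)
    for idx in np.ndindex(*kernel.shape):
        k_plus, k_minus = kernel.copy(), kernel.copy()
        k_plus[idx] += eps
        k_minus[idx] -= eps
        out_p = gru_cell.forward([x, prev_h], k_plus, r_kernel)
        out_m = gru_cell.forward([x, prev_h], k_minus, r_kernel)
        num_grad[idx] = (out_p.sum() - out_m.sum()) / (2 * eps)
    return np.max(np.abs(num_grad - kernel_grad))  # should be ~1e-8 or smaller if the backward is correct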