Example #1
    def forward(self, input, kernel, recurrent_kernel):
        """
        # Arguments
            inputs: [input numpy array with shape (batch, in_features), 
                    state numpy array with shape (batch, units)]

        # Returns
            outputs: numpy array with shape (batch, units)
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units],  kernel[:, 2*units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2*units]
        recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]

        #####################################################################################
        # code here
        # reset gate
        x_r = sigmoid(np.dot(prev_h, recurrent_kernel_r) + np.dot(x, kernel_r))
        # update gate
        x_z = sigmoid(np.dot(prev_h, recurrent_kernel_z) + np.dot(x, kernel_z))
        # new gate
        x_h = np.tanh(np.dot(x_r * prev_h, recurrent_kernel_h) + np.dot(x, kernel_h))
        #####################################################################################

        output = (1 - x_z) * x_h + x_z * prev_h
        
        return output
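
The snippets in this collection call a sigmoid helper that is not shown on this page. A minimal NumPy version consistent with how it is used (elementwise, on pre-activation arrays) might look like the sketch below; the exact name and placement in the original module are assumptions.

import numpy as np

def sigmoid(z):
    # Elementwise logistic function, as the cell implementations above expect.
    return 1.0 / (1.0 + np.exp(-z))
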
Example #2
    def forward(self, input, kernel, recurrent_kernel):
        """
        # Arguments
            inputs: [input numpy array with shape (batch, in_features), 
                    cell state numpy array with shape (batch, units),
                    hidden state numpy array with shape (batch, units)]

        # Returns
            outputs: [New hidden state numpy array with shape (batch, units),
                      New cell_state numpy array with shape (batch, units)]
                    
        Note: We assume no bias term in the LSTM.
        """
        x, prev_c, prev_h = input  #prev_c: previous cell state; prev_h: previous hidden state
        _, all_units = kernel.shape
        units = all_units // 4
        kernel_i = kernel[:, :units]
        kernel_f = kernel[:, units:2 * units]
        kernel_c = kernel[:, 2 * units:3 * units]
        kernel_o = kernel[:, 3 * units:all_units]
        recurrent_kernel_i = recurrent_kernel[:, :units]               # recurrent weight of input gate
        recurrent_kernel_f = recurrent_kernel[:, units:2 * units]      # recurrent weight of forget gate
        recurrent_kernel_c = recurrent_kernel[:, 2 * units:3 * units]  # recurrent weight of cell gate
        recurrent_kernel_o = recurrent_kernel[:, 3 * units:all_units]  # recurrent weight of output gate

        #################### To do ####################
        f = sigmoid(
            np.matmul(x, kernel_f) + np.matmul(prev_h, recurrent_kernel_f))
        i = sigmoid(
            np.matmul(x, kernel_i) + np.matmul(prev_h, recurrent_kernel_i))
        o = sigmoid(
            np.matmul(x, kernel_o) + np.matmul(prev_h, recurrent_kernel_o))
        cell = np.multiply(f, prev_c) + np.multiply(
            i,
            np.tanh(
                np.matmul(x, kernel_c) +
                np.matmul(prev_h, recurrent_kernel_c)))
        hidden = np.multiply(o, np.tanh(cell))
        ###############################################

        return hidden, cell
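
The slicing in this LSTM forward assumes that kernel (shape (in_features, 4 * units)) and recurrent_kernel (shape (units, 4 * units)) store the gate weights as contiguous column blocks in the order [input | forget | cell | output]. A small sketch of that layout, with made-up sizes:

import numpy as np

# Hypothetical sizes, purely to illustrate the column layout the slicing relies on.
in_features, units = 5, 3
kernel = np.arange(in_features * 4 * units).reshape(in_features, 4 * units)

kernel_i = kernel[:, :units]               # input gate block
kernel_f = kernel[:, units:2 * units]      # forget gate block
kernel_c = kernel[:, 2 * units:3 * units]  # cell (candidate) block
kernel_o = kernel[:, 3 * units:]           # output gate block

assert kernel_i.shape == kernel_f.shape == kernel_c.shape == kernel_o.shape == (in_features, units)
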
Example #3
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: [gradient to output_hidden state, gradient to output_cell_state]
            inputs: [input numpy array with shape (batch, in_features), 
                    cell state numpy array with shape (batch, units),
                    hidden state numpy array with shape (batch, units)]

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                        gradients to cell state numpy array with shape (batch, units),
                        gradients to hidden state numpy array with shape (batch, units)]
        """
        x, prev_c, prev_h = input #prev_c: previous cell state; prev_h: previous hidden state
        _, all_units = kernel.shape
        units = all_units // 4
        kernel_i, kernel_f, kernel_c, kernel_o = kernel[:, :units], kernel[:, units:2*units], kernel[:, 2*units:3*units], kernel[:, 3*units:all_units]
        recurrent_kernel_i = recurrent_kernel[:, :units]
        recurrent_kernel_f = recurrent_kernel[:, units:2*units]
        recurrent_kernel_c = recurrent_kernel[:, 2*units:3*units]
        recurrent_kernel_o = recurrent_kernel[:, 3*units:all_units]
        h_grad, c_grad = out_grad
        x_f = sigmoid(x.dot(kernel_f) + prev_h.dot(recurrent_kernel_f))
        x_i = sigmoid(x.dot(kernel_i) + prev_h.dot(recurrent_kernel_i))
        x_o = sigmoid(x.dot(kernel_o) + prev_h.dot(recurrent_kernel_o))
        x_c = np.tanh(x.dot(kernel_c) + prev_h.dot(recurrent_kernel_c))
        c = x_i * x_c + x_f * prev_c
        h = x_o * np.tanh(c)
        do = h_grad * np.tanh(c)
        # Total gradient reaching the cell state: the external c_grad plus the
        # path through h = o * tanh(c).
        dc_total = c_grad + h_grad * x_o * (1 - np.tanh(c) * np.tanh(c))
        df = dc_total * prev_c
        dc = dc_total * x_i
        di = dc_total * x_c
        dAc = dc * (1 - x_c * x_c)
        dAi = di * x_i * (1 - x_i)
        dAf = df * x_f * (1 - x_f)
        dAo = do * x_o * (1 - x_o)
        x_grad = dAc.dot(kernel_c.T) + dAi.dot(kernel_i.T) + dAf.dot(kernel_f.T) + dAo.dot(kernel_o.T)
        kernel_c_grad = x.T.dot(dAc)
        kernel_i_grad = x.T.dot(dAi)
        kernel_f_grad = x.T.dot(dAf)
        kernel_o_grad = x.T.dot(dAo)
        prev_h_grad = dAc.dot(recurrent_kernel_c.T) + dAi.dot(recurrent_kernel_i.T) + dAf.dot(recurrent_kernel_f.T) + dAo.dot(recurrent_kernel_o.T)
        recurrent_kernel_c_grad = prev_h.T.dot(dAc)
        recurrent_kernel_i_grad = prev_h.T.dot(dAi)
        recurrent_kernel_f_grad = prev_h.T.dot(dAf)
        recurrent_kernel_o_grad = prev_h.T.dot(dAo)
        prev_c_grad = dc_total * x_f

        in_grad = [x_grad, prev_c_grad, prev_h_grad]
        kernel_grad = np.concatenate([kernel_i_grad, kernel_f_grad, kernel_c_grad,kernel_o_grad], axis=-1)
        recurrent_kernel_grad = np.concatenate([recurrent_kernel_i_grad, recurrent_kernel_f_grad, recurrent_kernel_c_grad,recurrent_kernel_o_grad], axis=-1)

        return in_grad, kernel_grad, recurrent_kernel_grad
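
One way to sanity-check a backward like this is a finite-difference comparison against the forward pass. The sketch below is self-contained (it restates the LSTM cell forward shown earlier as a plain function) and estimates the numerical gradient of sum(h) with respect to x; with out_grad = [np.ones((batch, units)), np.zeros((batch, units))], the x_grad returned by a correct backward should match num_x_grad closely. The names, sizes, and the driver itself are illustrative assumptions, not part of the original code.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_forward(x, prev_c, prev_h, kernel, recurrent_kernel):
    # Plain-function restatement of the LSTM cell forward above, for checking only.
    ki, kf, kc, ko = np.split(kernel, 4, axis=1)
    ri, rf, rc, ro = np.split(recurrent_kernel, 4, axis=1)
    i = sigmoid(x @ ki + prev_h @ ri)
    f = sigmoid(x @ kf + prev_h @ rf)
    o = sigmoid(x @ ko + prev_h @ ro)
    g = np.tanh(x @ kc + prev_h @ rc)
    c = f * prev_c + i * g
    h = o * np.tanh(c)
    return h, c

rng = np.random.default_rng(0)
batch, in_features, units = 2, 4, 3
x = rng.standard_normal((batch, in_features))
prev_c = rng.standard_normal((batch, units))
prev_h = rng.standard_normal((batch, units))
kernel = rng.standard_normal((in_features, 4 * units))
recurrent_kernel = rng.standard_normal((units, 4 * units))

eps = 1e-6
num_x_grad = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    hp, _ = lstm_forward(xp, prev_c, prev_h, kernel, recurrent_kernel)
    hm, _ = lstm_forward(xm, prev_c, prev_h, kernel, recurrent_kernel)
    num_x_grad[idx] = (hp.sum() - hm.sum()) / (2 * eps)
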
Example #4
	def forward(self, input, kernel, recurrent_kernel):
		"""
		# Arguments
			inputs: [input numpy array with shape (batch, in_features), 
					 state numpy array with shape (batch, units)]

		# Returns
			outputs: numpy array with shape (batch, units)
		"""
		x, prev_h = input
		_, all_units = kernel.shape
		units = all_units // 3

		kernel_z = kernel[:, :units]
		kernel_r = kernel[:, units:2*units]
		kernel_h = kernel[:, 2*units:]

		recurrent_kernel_z = recurrent_kernel[:, :units]
		recurrent_kernel_r = recurrent_kernel[:, units:2*units]
		recurrent_kernel_h = recurrent_kernel[:, 2*units:]
		
		# code here
		#####################################################################################
		# Initialize
		x_z = None
		x_r = None
		x_h = None

		# Compute for reset, update and new gate (matrix1 + matrix2)
		x_z = sigmoid(np.matmul(x, kernel_z) + np.matmul(prev_h, recurrent_kernel_z))
		x_r = sigmoid(np.matmul(x, kernel_r) + np.matmul(prev_h, recurrent_kernel_r))
		x_h = np.tanh(np.matmul(x, kernel_h) + np.matmul(x_r * prev_h, recurrent_kernel_h))
		#####################################################################################
		
		output = (1 - x_z) * x_h + x_z * prev_h
		return output
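
The GRU forwards assume the same kind of block layout, but with three blocks: kernel has shape (in_features, 3 * units) with columns ordered [update (z) | reset (r) | candidate (h)], and recurrent_kernel has shape (units, 3 * units). A tiny illustration of that assumption with made-up sizes:

import numpy as np

# Hypothetical sizes, purely to illustrate the [z | r | h] column layout the GRU slicing relies on.
in_features, units = 5, 3
kernel = np.arange(in_features * 3 * units).reshape(in_features, 3 * units)

kernel_z = kernel[:, :units]           # update gate block
kernel_r = kernel[:, units:2 * units]  # reset gate block
kernel_h = kernel[:, 2 * units:]       # candidate block

assert kernel_z.shape == kernel_r.shape == kernel_h.shape == (in_features, units)
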
Example #5
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2 * units], kernel[:, 2 * units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2 * units]
        recurrent_kernel_h = recurrent_kernel[:, 2 * units:all_units]

        #####################################################################################
        # code here
        x_r = sigmoid(
            np.dot(np.nan_to_num(x), kernel_r) +
            np.dot(prev_h, recurrent_kernel_r))
        # update gate
        x_z = sigmoid(
            np.dot(np.nan_to_num(x), kernel_z) +
            np.dot(prev_h, recurrent_kernel_z))
        # new gate
        x_h = np.tanh(
            np.dot(np.nan_to_num(x), kernel_h) +
            np.dot(np.multiply(x_r, prev_h), recurrent_kernel_h))

        d_h = out_grad * (1 - x_z) * (1 - np.square(x_h))
        d_z = out_grad * (prev_h - x_h) * (x_z * (1 - x_z))
        d_r = np.dot(d_h, recurrent_kernel_h.T) * prev_h * (x_r * (1 - x_r))

        x_grad = np.nan_to_num(
            np.dot(d_z, kernel_z.T) + np.dot(d_r, kernel_r.T) +
            np.dot(d_h, kernel_h.T))
        prev_h_grad = np.nan_to_num(out_grad * x_z +
                                    np.dot(d_h, recurrent_kernel_h.T) * x_r +
                                    np.dot(d_z, recurrent_kernel_z.T) +
                                    np.dot(d_r, recurrent_kernel_r.T))

        kernel_r_grad = np.nan_to_num(np.dot(x.T, d_r))
        kernel_z_grad = np.nan_to_num(np.dot(x.T, d_z))
        kernel_h_grad = np.nan_to_num(np.dot(x.T, d_h))

        recurrent_kernel_r_grad = np.nan_to_num(np.dot(prev_h.T, d_r))
        recurrent_kernel_z_grad = np.nan_to_num(np.dot(prev_h.T, d_z))
        recurrent_kernel_h_grad = np.nan_to_num(
            np.dot(np.multiply(x_r, prev_h).T, d_h))
        #####################################################################################

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate(
            [kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([
            recurrent_kernel_z_grad, recurrent_kernel_r_grad,
            recurrent_kernel_h_grad
        ],
                                       axis=-1)

        return in_grad, kernel_grad, r_kernel_grad
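
The GRU backward examples in this collection appear to compute the same gradients, just with different intermediate variable names. A compact restatement of those shared formulas as one function, kept here only as a reference sketch (the argument list simply mirrors quantities the examples already compute):

import numpy as np

def gru_backward_sketch(out_grad, x, prev_h, x_z, x_r, x_h,
                        kernel_z, kernel_r, kernel_h,
                        recurrent_kernel_z, recurrent_kernel_r, recurrent_kernel_h):
    # Gradients w.r.t. the gate pre-activations.
    d_h = out_grad * (1 - x_z) * (1 - np.square(x_h))                # candidate
    d_z = out_grad * (prev_h - x_h) * x_z * (1 - x_z)                # update gate
    d_r = d_h.dot(recurrent_kernel_h.T) * prev_h * x_r * (1 - x_r)   # reset gate

    x_grad = d_z.dot(kernel_z.T) + d_r.dot(kernel_r.T) + d_h.dot(kernel_h.T)
    prev_h_grad = (out_grad * x_z + d_h.dot(recurrent_kernel_h.T) * x_r
                   + d_z.dot(recurrent_kernel_z.T) + d_r.dot(recurrent_kernel_r.T))
    kernel_grad = np.concatenate([x.T.dot(d_z), x.T.dot(d_r), x.T.dot(d_h)], axis=-1)
    recurrent_kernel_grad = np.concatenate(
        [prev_h.T.dot(d_z), prev_h.T.dot(d_r), (x_r * prev_h).T.dot(d_h)], axis=-1)
    return [x_grad, prev_h_grad], kernel_grad, recurrent_kernel_grad
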
Example #6
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2*units],  kernel[:, 2*units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2*units]
        recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]

        #####################################################################################
        # code here
        # https://towardsdatascience.com/forward-and-backpropagation-in-grus-derived-deep-learning-5764f374f3f5
        
        zt = sigmoid(np.dot(prev_h, recurrent_kernel_z) + np.dot(x, kernel_z))
        rt = sigmoid(np.dot(prev_h, recurrent_kernel_r) + np.dot(x, kernel_r))
        h_hat = np.tanh(np.dot(rt*prev_h, recurrent_kernel_h) + np.dot(x, kernel_h))

        d0 = out_grad
        d1 = d0 * zt
        d2 = d0 * prev_h
        d3 = d0 * h_hat
        d4 = -1 * d3
        d5 = d2 + d4
        d6 = d0 * (1- zt)
        d7 = d5 * (zt * (1 - zt))
        d8 = d6 * (1 - h_hat**2)
        d9 = np.dot(d8, kernel_h.T)
        d10 = np.dot(d8, recurrent_kernel_h.T)
        d11 = np.dot(d7, kernel_z.T)
        d12 = np.dot(d7, recurrent_kernel_z.T)
        d14 = d10 * rt
        d15 = d10 * prev_h
        d16 = d15 * (rt * (1 - rt))
        d13 = np.dot(d16, kernel_r.T)
        d17 = np.dot(d16, recurrent_kernel_r.T)

        x_grad = np.nan_to_num(d9 + d11 + d13)
        prev_h_grad = np.nan_to_num(d12 + d14 + d1 + d17)

        kernel_r_grad = np.nan_to_num(np.dot(x.T, d16))
        kernel_z_grad = np.nan_to_num(np.dot(x.T, d7))
        kernel_h_grad = np.nan_to_num(np.dot(x.T, d8))

        recurrent_kernel_r_grad = np.nan_to_num(np.dot(prev_h.T, d16))
        recurrent_kernel_z_grad = np.nan_to_num(np.dot(prev_h.T, d7))
        recurrent_kernel_h_grad = np.nan_to_num(np.dot((rt*prev_h).T, d8))
        #####################################################################################

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)

        return in_grad, kernel_grad, r_kernel_grad
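
As with the LSTM, a finite-difference check is a quick way to validate these GRU backwards. The sketch below estimates the numerical gradient of sum(h_new) with respect to x; with out_grad = np.ones((batch, units)), the x_grad returned by the backward methods above should match num_x_grad. Everything here (names, sizes, the driver) is illustrative, not part of the original code.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_forward(x, prev_h, kernel, recurrent_kernel):
    # Plain-function restatement of the GRU cell forward, columns ordered [z | r | h].
    kz, kr, kh = np.split(kernel, 3, axis=1)
    rz, rr, rh = np.split(recurrent_kernel, 3, axis=1)
    z = sigmoid(x @ kz + prev_h @ rz)
    r = sigmoid(x @ kr + prev_h @ rr)
    h_hat = np.tanh(x @ kh + (r * prev_h) @ rh)
    return (1 - z) * h_hat + z * prev_h

rng = np.random.default_rng(1)
batch, in_features, units = 2, 4, 3
x = rng.standard_normal((batch, in_features))
prev_h = rng.standard_normal((batch, units))
kernel = rng.standard_normal((in_features, 3 * units))
recurrent_kernel = rng.standard_normal((units, 3 * units))

eps = 1e-6
num_x_grad = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    num_x_grad[idx] = (gru_forward(xp, prev_h, kernel, recurrent_kernel).sum()
                       - gru_forward(xm, prev_h, kernel, recurrent_kernel).sum()) / (2 * eps)
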
Example #7
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2 * units], kernel[:, 2 * units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2 * units]
        recurrent_kernel_h = recurrent_kernel[:, 2 * units:all_units]

        # reset gate
        x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
        # update gate
        x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
        # new gate
        x_h = np.tanh(x.dot(kernel_h) + (x_r * prev_h).dot(recurrent_kernel_h))

        x_h_tanh_grad = np.nan_to_num(
            (1 - x_z) * out_grad * (1 - np.square(x_h)))
        x_r_sig_grad = np.matmul(
            x_h_tanh_grad,
            np.transpose(recurrent_kernel_h)) * x_r * (1 - x_r) * prev_h
        x_z_sig_grad = out_grad * x_z * (1 - x_z) * (prev_h - x_h)

        x_grad = np.matmul(x_h_tanh_grad, np.transpose(kernel_h)) + np.matmul(
            x_r_sig_grad, np.transpose(kernel_r)) + np.matmul(
                x_z_sig_grad, kernel_z.T)
        prev_h_grad = np.matmul(
            x_h_tanh_grad,
            recurrent_kernel_h.T) * x_r + x_z * out_grad + np.matmul(
                x_r_sig_grad, recurrent_kernel_r.T) + np.matmul(
                    x_z_sig_grad, recurrent_kernel_z.T)

        kernel_r_grad = np.matmul(x.T, x_r_sig_grad)
        kernel_z_grad = np.matmul(x.T, x_z_sig_grad)
        kernel_h_grad = np.matmul(x.T, x_h_tanh_grad)

        recurrent_kernel_r_grad = np.matmul(prev_h.T, x_r_sig_grad)
        recurrent_kernel_z_grad = np.matmul(prev_h.T, x_z_sig_grad)
        recurrent_kernel_h_grad = np.matmul(
            np.multiply(x_r, prev_h).T, x_h_tanh_grad)

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate(
            [kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([
            recurrent_kernel_z_grad, recurrent_kernel_r_grad,
            recurrent_kernel_h_grad
        ],
                                       axis=-1)

        return in_grad, kernel_grad, r_kernel_grad
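
The examples mix np.dot, np.matmul, and the .dot array method; for the 2-D arrays used here these are interchangeable, which is why Example #7 computes the same quantities as the others despite the different call style. A quick numerical confirmation:

import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal((2, 3))
b = rng.standard_normal((3, 4))

# For 2-D arrays, all of these compute the same matrix product.
assert np.allclose(np.dot(a, b), np.matmul(a, b))
assert np.allclose(a.dot(b), a @ b)
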
Example #8
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        x = np.nan_to_num(x)
        prev_h = np.nan_to_num(prev_h)
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2 * units], kernel[:, 2 * units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2 * units]
        recurrent_kernel_h = recurrent_kernel[:, 2 * units:all_units]

        #####################################################################################
        # code here

        # reset gate
        x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
        # x:batch,features | kernel_r:feature, units | prev_h:batch,units | recurent_kernel_r:units, units| =>x_r:batch,units

        # update gate
        x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
        # x:batch,features | kernel_z:feature, units | prev_h:batch,units | recurent_kernel_z:units, units| =>x_z:batch,units

        # new gate
        x_h = np.tanh(x.dot(kernel_h) + (x_r * prev_h).dot(recurrent_kernel_h))
        # x:batch,features | kernel_z:feature, units | recurent_kernel_z:units, units | x_r:batch,units | prev_h:batch,units | x_h:batch,units

        x_z_grad = out_grad * (-x_h + prev_h)                      # (batch, units)
        x_h_grad = out_grad * (1 - x_z)                            # (batch, units)

        tanh_h_grad = x_h_grad * (1 - np.square(x_h))              # (batch, units)
        x_r_grad = tanh_h_grad.dot(recurrent_kernel_h.T) * prev_h  # (batch, units)

        sig_r_grad = x_r_grad * x_r * (1 - x_r)                    # (batch, units)
        sig_z_grad = x_z_grad * x_z * (1 - x_z)                    # (batch, units)

        x_grad = sig_r_grad.dot(kernel_r.T) + sig_z_grad.dot(kernel_z.T) \
            + tanh_h_grad.dot(kernel_h.T)                          # (batch, in_features)
        prev_h_grad = sig_r_grad.dot(recurrent_kernel_r.T) + sig_z_grad.dot(recurrent_kernel_z.T) \
            + (tanh_h_grad.dot(recurrent_kernel_h.T) * x_r) + out_grad * x_z  # (batch, units)

        kernel_r_grad = ((sig_r_grad.T).dot(x)).T                  # (in_features, units)
        kernel_z_grad = ((sig_z_grad.T).dot(x)).T                  # (in_features, units)
        kernel_h_grad = ((tanh_h_grad.T).dot(x)).T                 # (in_features, units)

        recurrent_kernel_r_grad = ((sig_r_grad.T).dot(prev_h)).T   # (units, units)
        recurrent_kernel_z_grad = ((sig_z_grad.T).dot(prev_h)).T   # (units, units)
        recurrent_kernel_h_grad = ((tanh_h_grad.T).dot(x_r * prev_h)).T  # (units, units)
        #####################################################################################

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate(
            [kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([
            recurrent_kernel_z_grad, recurrent_kernel_r_grad,
            recurrent_kernel_h_grad
        ],
                                       axis=-1)

        return in_grad, kernel_grad, r_kernel_grad
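
Example #8 writes the weight gradients as ((grad.T).dot(x)).T where the other examples write x.T.dot(grad); the two are equal by the transpose identity (A^T B)^T = B^T A. A quick numerical confirmation of that equivalence, with made-up shapes:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))     # (batch, in_features)
grad = rng.standard_normal((2, 3))  # (batch, units)

assert np.allclose((grad.T.dot(x)).T, x.T.dot(grad))
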
Example #9
	def backward(self, out_grad, input, kernel, recurrent_kernel):
		"""
		# Arguments
			out_grad: numpy array with shape (batch, units), gradients to the outputs
			input: [input numpy array with shape (batch, in_features),
					state numpy array with shape (batch, units)], same as the forward inputs

		# Returns
			in_grad: [gradients to input numpy array with shape (batch, in_features),
					  gradients to state numpy array with shape (batch, units)]
		"""
		x, prev_h = input
		_, all_units = kernel.shape
		units = all_units // 3

		kernel_z = kernel[:, :units]
		kernel_r = kernel[:, units:2 * units]
		kernel_h = kernel[:, 2 * units:all_units]

		recurrent_kernel_z = recurrent_kernel[:, :units]
		recurrent_kernel_r = recurrent_kernel[:, units:2*units]
		recurrent_kernel_h = recurrent_kernel[:, 2*units:all_units]

		# code here
		#####################################################################################
		# Initialize
		x_grad, prev_h_grad = np.zeros_like(x), np.zeros_like(prev_h)
		kernel_z_grad, recurrent_kernel_z_grad = np.zeros_like(kernel_z), np.zeros_like(recurrent_kernel_z)
		kernel_r_grad, recurrent_kernel_r_grad = np.zeros_like(kernel_r), np.zeros_like(recurrent_kernel_r)
		kernel_h_grad, recurrent_kernel_h_grad = np.zeros_like(kernel_h), np.zeros_like(recurrent_kernel_h)

		# Compute basic information
		x_z = sigmoid(np.matmul(x, kernel_z) + np.matmul(prev_h, recurrent_kernel_z))
		x_r = sigmoid(np.matmul(x, kernel_r) + np.matmul(prev_h, recurrent_kernel_r))
		x_h = np.tanh(np.matmul(x, kernel_h) + np.matmul(prev_h * x_r, recurrent_kernel_h))

		# Compute for new gate
		tmp_h = out_grad * (1 - x_h**2) * (1 - x_z)
		matrix1_h = np.matmul(tmp_h, np.transpose(kernel_h))
		matrix2_h = np.matmul(tmp_h, np.transpose(recurrent_kernel_h))

		# Compute for update gate
		tmp_z = out_grad * (prev_h - x_h) * x_z * (1 - x_z)
		matrix1_z = np.matmul(tmp_z, np.transpose(kernel_z))
		matrix2_z = np.matmul(tmp_z, np.transpose(recurrent_kernel_z))

		# Compute for reset gate
		tmp_r = matrix2_h * prev_h * (x_r * (1 - x_r))
		matrix1_r = np.matmul(tmp_r, np.transpose(kernel_r))
		matrix2_r = np.matmul(tmp_r, np.transpose(recurrent_kernel_r))
		
		# Compute the gradient of input
		x_grad      = matrix1_z + matrix1_r + matrix1_h
		prev_h_grad = matrix2_z + matrix2_r + matrix2_h * x_r + out_grad * x_z
		
		# Compute the gradient of kernel
		kernel_r_grad = np.matmul(np.transpose(x), tmp_r)
		kernel_z_grad = np.matmul(np.transpose(x), tmp_z)
		kernel_h_grad = np.matmul(np.transpose(x), tmp_h)

		# Compute the gradient of recurrent kernel
		recurrent_kernel_r_grad = np.matmul(np.transpose(prev_h), tmp_r)
		recurrent_kernel_z_grad = np.matmul(np.transpose(prev_h), tmp_z)
		recurrent_kernel_h_grad = np.matmul(np.transpose(prev_h * x_r), tmp_h)
		#####################################################################################

		in_grad = [x_grad, prev_h_grad]
		kernel_grad           = np.concatenate([kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
		recurrent_kernel_grad = np.concatenate([recurrent_kernel_z_grad, recurrent_kernel_r_grad, recurrent_kernel_h_grad], axis=-1)

		return in_grad, kernel_grad, recurrent_kernel_grad
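
All of these backwards lean on the elementwise derivative identities sigmoid'(a) = sigmoid(a) * (1 - sigmoid(a)) and tanh'(a) = 1 - tanh(a)**2, which is where the recurring x_z * (1 - x_z) and 1 - np.square(x_h) factors come from. A small numerical spot check of both identities:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

a = np.linspace(-3.0, 3.0, 7)
eps = 1e-6

# Central finite differences of sigmoid and tanh.
num_dsig = (sigmoid(a + eps) - sigmoid(a - eps)) / (2 * eps)
num_dtanh = (np.tanh(a + eps) - np.tanh(a - eps)) / (2 * eps)

assert np.allclose(num_dsig, sigmoid(a) * (1 - sigmoid(a)), atol=1e-6)
assert np.allclose(num_dtanh, 1 - np.tanh(a) ** 2, atol=1e-6)
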
Example #10
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2 * units], kernel[:, 2 * units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2 * units]
        recurrent_kernel_h = recurrent_kernel[:, 2 * units:all_units]
        # reset gate
        x_r = sigmoid(x.dot(kernel_r) + prev_h.dot(recurrent_kernel_r))
        # update gate
        x_z = sigmoid(x.dot(kernel_z) + prev_h.dot(recurrent_kernel_z))
        # new gate
        x_h = np.tanh(
            x.dot(kernel_h) + np.dot(x_r * prev_h, recurrent_kernel_h))
        output = (1 - x_z) * x_h + x_z * prev_h

        sig_grad_xr = x_r * (1 - x_r)
        sig_grad_xz = x_z * (1 - x_z)
        tanh_grad_xh = (1 - np.square(x_h))

        x_grad = (out_grad * (prev_h - x_h) * sig_grad_xz).dot(kernel_z.T) + \
            (out_grad * (1 - x_z) * tanh_grad_xh).dot(kernel_h.T) + \
            ((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr).dot(kernel_r.T)

        prev_h_grad = out_grad * x_z + (out_grad * (prev_h - x_h) * sig_grad_xz).dot(recurrent_kernel_z.T) + \
            (out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * x_r + \
            ((out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) * prev_h * sig_grad_xr).dot(recurrent_kernel_r.T)

        kernel_r_grad = x.T.dot(
            (out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) *
            prev_h * sig_grad_xr)
        kernel_z_grad = x.T.dot(out_grad * (prev_h - x_h) * sig_grad_xz)
        kernel_h_grad = x.T.dot(out_grad * (1 - x_z) * tanh_grad_xh)

        recurrent_kernel_r_grad = prev_h.T.dot(
            (out_grad * (1 - x_z) * tanh_grad_xh).dot(recurrent_kernel_h.T) *
            prev_h * sig_grad_xr)
        recurrent_kernel_z_grad = prev_h.T.dot(out_grad * (prev_h - x_h) *
                                               sig_grad_xz)
        recurrent_kernel_h_grad = (x_r * prev_h).T.dot(out_grad * (1 - x_z) *
                                                       tanh_grad_xh)

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate(
            [kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([
            recurrent_kernel_z_grad, recurrent_kernel_r_grad,
            recurrent_kernel_h_grad
        ],
                                       axis=-1)

        return in_grad, kernel_grad, r_kernel_grad
Example #11
    def sigmoid(self):
        # `F` is an external functional module (not defined in these snippets).
        return F.sigmoid(self)
Example #12
    def backward(self, out_grad, input, kernel, recurrent_kernel):
        """
        # Arguments
            out_grad: numpy array with shape (batch, units), gradients to the outputs
            input: [input numpy array with shape (batch, in_features),
                    state numpy array with shape (batch, units)], same as the forward inputs

        # Returns
            in_grad: [gradients to input numpy array with shape (batch, in_features),
                      gradients to state numpy array with shape (batch, units)]
        """
        x, prev_h = input
        _, all_units = kernel.shape
        units = all_units // 3
        kernel_z, kernel_r, kernel_h = kernel[:, :units], kernel[:, units:2 * units], kernel[:, 2 * units:all_units]
        recurrent_kernel_z = recurrent_kernel[:, :units]
        recurrent_kernel_r = recurrent_kernel[:, units:2 * units]
        recurrent_kernel_h = recurrent_kernel[:, 2 * units:all_units]

        #####################################################################################
        # code here
        x_z = sigmoid(prev_h.dot(recurrent_kernel_z) + x.dot(kernel_z))
        x_r = sigmoid(prev_h.dot(recurrent_kernel_r) + x.dot(kernel_r))
        prev_h_in = x_r * prev_h
        x_h = np.tanh(prev_h_in.dot(recurrent_kernel_h) + x.dot(kernel_h))

        d1 = x_z * out_grad
        d2 = prev_h * out_grad
        d3 = x_h * out_grad
        d4 = -1 * d3
        d5 = d2 + d4
        d6 = (1 - x_z) * out_grad
        d7 = d5 * (x_z * (1 - x_z))
        d8 = d6 * (1 - np.square(x_h))
        d9 = d8.dot(kernel_h.T)
        d10 = d8.dot(recurrent_kernel_h.T)
        d11 = d7.dot(kernel_z.T)
        d12 = d7.dot(recurrent_kernel_z.T)
        d14 = d10 * x_r
        d15 = d10 * prev_h

        d16 = d15 * (x_r * (1 - x_r))
        d13 = d16.dot(kernel_r.T)
        d17 = d16.dot(recurrent_kernel_r.T)

        x_grad = d9 + d11 + d13

        prev_h_grad = d12 + d14 + d1 + d17

        kernel_r_grad = np.dot(x.T, d16)
        kernel_z_grad = np.dot(x.T, d7)
        kernel_h_grad = np.dot(x.T, d8)

        recurrent_kernel_r_grad = np.dot(prev_h.T, d16)
        recurrent_kernel_z_grad = np.dot(prev_h.T, d7)
        recurrent_kernel_h_grad = np.dot((prev_h * x_r).T, d8)
        #####################################################################################

        in_grad = [x_grad, prev_h_grad]
        kernel_grad = np.concatenate(
            [kernel_z_grad, kernel_r_grad, kernel_h_grad], axis=-1)
        r_kernel_grad = np.concatenate([
            recurrent_kernel_z_grad, recurrent_kernel_r_grad,
            recurrent_kernel_h_grad
        ],
                                       axis=-1)

        return in_grad, kernel_grad, r_kernel_grad