Example #1
    def backpropagation(self, input, output):

        '''
        Compute gradients with the back-propagation method.
        inputs:
            input: vector with the (embedding) indices of the words of a
                sentence
            output: vector with the indices of the tags for each word of
                the sentence
        outputs:
            gradient_parameters: list with the gradients of each parameter
        '''

        # Get parameters and sizes
        W_e, W_x, W_h, W_y = self.parameters
        nr_steps = input.shape[0]

        log_p_y, y, h, z_e, x = self.log_forward(input)
        p_y = np.exp(log_p_y)

        # Initialize gradients with zero entries
        gradient_W_e = np.zeros(W_e.shape)
        gradient_W_x = np.zeros(W_x.shape)
        gradient_W_h = np.zeros(W_h.shape)
        gradient_W_y = np.zeros(W_y.shape)

        # ----------
        # Solution to Exercise 6.1

        # Gradient of the cost with respect to the last linear model
        I = index2onehot(output, W_y.shape[0])
        error = (p_y - I) / nr_steps

        # backward pass, with gradient computation
        error_h_next = np.zeros_like(h[0, :])
        for t in reversed(range(nr_steps)):

            # Output linear
            error_h = np.dot(W_y.T, error[t, :]) + error_h_next

            # Non-linear
            error_raw = h[t+1, :] * (1. - h[t+1, :]) * error_h

            # Hidden-linear
            error_h_next = np.dot(W_h.T, error_raw)

            # Weight gradients
            gradient_W_y += np.outer(error[t, :], h[t+1, :])
            gradient_W_h += np.outer(error_raw, h[t, :])
            gradient_W_x += np.outer(error_raw, z_e[t, :])
            gradient_W_e[x[t], :] += W_x.T.dot(error_raw)

        # End of Solution to Exercise 6.1
        # ----------

        # Pack gradients (already normalized over sentence length above)
        gradient_parameters = [
            gradient_W_e, gradient_W_x, gradient_W_h, gradient_W_y
        ]

        return gradient_parameters
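All of the examples on this page call an index2onehot helper that is not shown here. A minimal sketch of what such a helper presumably does, assuming it maps a vector of integer class/tag indices to a (num_examples, num_classes) one-hot matrix (the exact toolkit implementation may differ):

import numpy as np

def index2onehot(index, num_classes):
    """One-hot encode a vector of integer indices.

    index: 1-D array of length N with values in [0, num_classes)
    returns: (N, num_classes) matrix with a single 1 per row
    """
    index = np.asarray(index)
    num_examples = index.shape[0]
    onehot = np.zeros((num_examples, num_classes))
    onehot[np.arange(num_examples), index] = 1.0
    return onehot

With this shape, (p_y - I) is the derivative of the cross-entropy cost with respect to the pre-softmax activations, which is what each example starts its backward pass from.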
Example #2
    def backpropagation(self, input, output):
        """Gradients for sigmoid hidden layers and output softmax"""

        # Run forward and store activations for each layer
        log_prob_y, layer_inputs = self.log_forward(input)
        prob_y = np.exp(log_prob_y)

        num_examples, num_classes = prob_y.shape
        num_hidden_layers = len(self.parameters) - 1

        # For each layer in reverse store the backpropagated error, then compute
        # the gradients from the errors and the layer inputs
        errors = []

        # ----------
        # Solution to Exercise 3.2

        # Initial error is the cost derivative at the last layer (for cross
        # entropy cost)
        I = index2onehot(output, num_classes)
        error = (prob_y - I) / num_examples
        errors.append(error)

        # Backpropagate through each layer
        for n in reversed(range(num_hidden_layers)):

            # Backpropagate through linear layer
            error = np.dot(error, self.parameters[n+1][0])

            # Backpropagate through sigmoid layer
            error *= layer_inputs[n+1] * (1-layer_inputs[n+1])

            # Collect error
            errors.append(error)

        # Reverse errors
        errors = errors[::-1]

        # Compute gradients from errors
        gradients = []
        for n in range(num_hidden_layers + 1):

            # Weight gradient
            weight_gradient = np.zeros(self.parameters[n][0].shape)
            for l in range(num_examples):
                weight_gradient += np.outer(
                    errors[n][l, :],
                    layer_inputs[n][l, :]
                )

            # Bias gradient
            bias_gradient = np.sum(errors[n], axis=0, keepdims=True)

            # Store gradients
            gradients.append([weight_gradient, bias_gradient])

        # End of solution to Exercise 3.2
        # ----------

        return gradients
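The backward pass above assumes a log_forward that stores the input of every layer: layer_inputs[0] is the raw feature matrix and layer_inputs[n+1] holds the sigmoid activations of hidden layer n. A minimal sketch of such a forward pass, assuming each entry of self.parameters is a [weight, bias] pair with the weight stored as (output_dim, input_dim); this is a hypothetical reconstruction, not the toolkit's actual code:

    def log_forward(self, input):
        """Forward pass that keeps the input of every layer for backpropagation."""
        num_hidden_layers = len(self.parameters) - 1

        # layer_inputs[n] is the matrix fed into layer n
        layer_inputs = [input]
        tilde_z = input
        for n in range(num_hidden_layers):
            weight, bias = self.parameters[n]
            z = np.dot(tilde_z, weight.T) + bias        # linear transformation
            tilde_z = 1.0 / (1.0 + np.exp(-z))          # sigmoid non-linearity
            layer_inputs.append(tilde_z)

        # Output layer: linear transformation followed by a log-softmax
        weight, bias = self.parameters[num_hidden_layers]
        z = np.dot(tilde_z, weight.T) + bias
        z = z - np.max(z, axis=1, keepdims=True)        # numerical stability
        log_prob_y = z - np.log(np.sum(np.exp(z), axis=1, keepdims=True))

        return log_prob_y, layer_inputs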
Example #3
    def backpropagation(self, input, output):
        """Gradients for sigmoid hidden layers and output softmax"""

        # Run forward and store activations for each layer
        log_prob_y, layer_inputs = self.log_forward(input)
        prob_y = np.exp(log_prob_y)

        num_examples, num_classes = prob_y.shape
        num_hidden_layers = len(self.parameters) - 1

        # For each layer in reverse store the backpropagated error, then compute
        # the gradients from the errors and the layer inputs
        errors = []

        # ----------
        # Solution to Exercise 2

        # Initial error is the cost derivative at the last layer (for cross
        # entropy cost)
        I = index2onehot(output, num_classes)
        error = (prob_y - I) / num_examples
        errors.append(error)

        # Backpropagate through each layer
        for n in reversed(range(num_hidden_layers)):

            # Backpropagate through linear layer
            error = np.dot(error, self.parameters[n+1][0])

            # Backpropagate through sigmoid layer
            error *= layer_inputs[n+1] * (1-layer_inputs[n+1])

            # Collect error
            errors.append(error)

        # Reverse errors
        errors = errors[::-1]

        # Compute gradients from errors
        gradients = []
        for n in range(num_hidden_layers + 1):

            # Weight gradient
            weight_gradient = np.zeros(self.parameters[n][0].shape)
            for l in range(num_examples):
                weight_gradient += np.outer(
                    errors[n][l, :],
                    layer_inputs[n][l, :]
                )

            # Bias gradient
            bias_gradient = np.sum(errors[n], axis=0, keepdims=True)

            # Store gradients
            gradients.append([weight_gradient, bias_gradient])

        # End of solution to Exercise 2
        # ----------

        return gradients
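A convenient way to sanity-check gradients like the ones above is a finite-difference comparison: perturb one weight, recompute the cross-entropy loss, and compare the numerical slope with the analytical gradient. A rough sketch, assuming a model with the parameters, log_forward and backpropagation interfaces of Examples #2 and #3 (gradient_check and its arguments are illustrative names, not part of the original code):

def gradient_check(model, input, output, epsilon=1e-6):
    """Compare backpropagation gradients against finite differences."""

    def cross_entropy_loss():
        log_prob_y, _ = model.log_forward(input)
        num_examples = log_prob_y.shape[0]
        # Average negative log-likelihood of the gold classes
        return -np.mean(log_prob_y[np.arange(num_examples), output])

    gradients = model.backpropagation(input, output)
    for n, layer in enumerate(model.parameters):
        weight = layer[0]
        # Check a single weight entry per layer to keep the check cheap
        original = weight[0, 0]
        weight[0, 0] = original + epsilon
        loss_plus = cross_entropy_loss()
        weight[0, 0] = original - epsilon
        loss_minus = cross_entropy_loss()
        weight[0, 0] = original
        numerical = (loss_plus - loss_minus) / (2 * epsilon)
        analytical = gradients[n][0][0, 0]
        print("layer %d: numerical %.6f analytical %.6f" % (n, numerical, analytical))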
Example #4
    def update(self, input=None, output=None):
        """Stochastic Gradient Descent update"""

        # Probabilities of each class
        class_probabilities = np.exp(self.log_forward(input))
        batch_size, num_classes = class_probabilities.shape

        # Error derivative at softmax layer
        I = index2onehot(output, num_classes)
        error = (class_probabilities - I) / batch_size

        # Weight gradient
        gradient_weight = np.zeros(self.weight.shape)
        for l in range(batch_size):
            gradient_weight += np.outer(error[l, :], input[l, :])

        # Bias gradient
        gradient_bias = np.sum(error, axis=0, keepdims=True)

        # SGD update
        self.weight = self.weight - self.learning_rate * gradient_weight
        self.bias = self.bias - self.learning_rate * gradient_bias
Example #5
    def update(self, input=None, output=None):
        """Stochastic Gradient Descent update"""

        # Probabilities of each class
        class_probabilities = np.exp(self.log_forward(input))
        batch_size, num_classes = class_probabilities.shape

        # Error derivative at softmax layer
        I = index2onehot(output, num_classes)
        error = (class_probabilities - I) / batch_size

        # Weight gradient
        gradient_weight = np.zeros(self.weight.shape)
        for l in np.arange(batch_size):
            gradient_weight += np.outer(error[l, :], input[l, :])

        # Bias gradient
        gradient_bias = np.sum(error, axis=0, keepdims=True)

        # SGD update
        self.weight = self.weight - self.learning_rate * gradient_weight
        self.bias = self.bias - self.learning_rate * gradient_bias
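An update method like the two above is typically driven by a mini-batch SGD loop over several epochs. A hedged usage sketch, assuming model exposes the update and log_forward methods shown here and that train_x, train_y are NumPy arrays of features and integer labels (all of these names are placeholders):

num_epochs = 10
batch_size = 30

for epoch in range(num_epochs):
    # Visit the training set in mini-batches and take one SGD step per batch
    for start in range(0, train_x.shape[0], batch_size):
        batch_x = train_x[start:start + batch_size]
        batch_y = train_y[start:start + batch_size]
        model.update(input=batch_x, output=batch_y)

    # Track training accuracy after each epoch
    predictions = np.argmax(model.log_forward(train_x), axis=1)
    accuracy = np.mean(predictions == train_y)
    print("Epoch %d: train accuracy %.4f" % (epoch + 1, accuracy))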
Example #6
    def backpropagation(self, input, output):
        #print('my backprop...')
        '''
        Network definition:
        ++++++++++++++++++
        [] => node
        
               z1 ->         a1 ->       z2 ->     a2 ->                                     
        [x]       
        [x]                   [o]
        [x]                   [o]                  [OP0]
        [.]       w1          [.]         w2      
        [.]  (13989, 20)      [.]       (20,2)     [OP1]
        [.]                   [o]
        [x]

      Input(X)           Hidden Layer1        Output nodes
        13989                20                     2

        Definitions:
        z1 = np.dot(self.x, self.w1) + self.b1
        self.a1 = sigmoid(z1)
        z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = softmax(z2)

        Derivatives for back-propagation:
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        Let C = cross-entropy loss. We back-propagate this loss to adjust
        the weights, i.e. the parameters of our model.

        1. You need to start from the last layer. Hence start with w2, b2

            Using chain rule,
            dC     dC    da2   dz2
            ---  = --- . --- . ---
            dw2    da2   dz2   dw2

            z2 = a1.w2 + b2
            a2 = softmax(z2)
            dz2/dw2 = a1
            da2/dz2 = my_softmax_derivative(z2)
            dC/da2  = cost function derivative(a2) => the model code already calculates this for us.

            Let a2_delta be the product of the two terms below:
                       dC    da2
            a2_delta = --- . --- 
                       da2   dz2

            Note, a2_delta = error in the current code (the error-derivative to be propagated)

            dC     
            ---  = a2_delta . a1            (1)
            dw2


            For changes in biases,
            dC     dC    da2   dz2
            ---  = --- . --- . ---
            db2    da2   dz2   db2

            dz2/db2 = 1. The first two terms are the same as in the equation above.
            Hence,
            dC
            ---  = a2_delta                 (2)
            db2    


            +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

            (copied from the flow above; it helps with the chain rule)

                   z1 ->         a1 ->       z2 ->     a2 ->                                     
              w1 ->                     w2->  


          2. Now do the same for w1, b1.

            z1 = x.w1 + b1
            a1 = sigmoid(z1)

            dC     dC    da1   dz1
            ---  = --- . --- . ---
            dw1    da1   dz1   dw1

            dz1/dw1 = x
            da1/dz1 = sigmoid_derv(z1)              -- (A)

            dC     dC    da2   dz2
            ---  = --- . --- . --- => dC/da1 = a2_delta.w2 -- (B)
            da1    da2   dz2   da1

            Thus,
            dC     dC    da1   dz1
            ---  = --- . --- . ---
            dw1    da1   dz1   dw1

            and set a1_delta = dC/da1 . da1/dz1

            dC/dw1 = a1_delta * x                       -----------------   (3)

            where
            a1_delta = (a2_delta.w2) * sigmoid_derv(z1) (from A & B) --- equation (4)


            dC     dC    da1   dz1
            ---  = --- . --- . ---
            db1    da1   dz1   db1
                  = a1_delta                            -- (5)


        In this model class, self.parameters contains the weights and biases.
        These are the shapes:
        self.parameters[0][0].shape == (20, 13989)
        self.parameters[0][1].shape == (1, 20)
        13989 => input dimension.
        20 => hidden layer dimension.
        self.parameters[0][1].shape[0] == 1 is the single bias row; there is one bias entry per hidden node.

        self.parameters[1][0].shape == (2, 20)
        self.parameters[1][1].shape == (1, 2)

        layer_inputs contains the inputs to the hidden and output nodes;
        30 is the batch size.
        layer_inputs[0].shape == (30, 13989)
        layer_inputs[1].shape == (30, 20)
        '''
        # Run forward and store activations for each layer
        log_prob_y, layer_inputs = self.log_forward(input)
        prob_y = np.exp(log_prob_y)

        num_examples, num_classes = prob_y.shape
        num_hidden_layers = len(self.parameters) - 1

        # Initial error is the cost derivative at the last layer (for cross entropy cost)
        I = index2onehot(output, num_classes)
        error = (prob_y - I) / num_examples  # cross-entropy derivative

        a2 = prob_y  #output from last layer
        a1 = layer_inputs[1]
        x = layer_inputs[0]

        # Take .T because the weights are stored as (output_dim, input_dim), the opposite of the orientation used in the derivation above
        w2 = self.parameters[1][0].T
        w1 = self.parameters[0][0].T

        a2_delta = error  #details for CE http://neuralnetworksanddeeplearning.com/chap3.html#introducing_the_cross-entropy_cost_function
        a1_delta = np.dot(a2_delta, w2.T) * self.my_sigmoid_derivative(
            a1)  # eq(4)

        gradient_w2 = np.dot(a1.T, a2_delta)  #eq (1)
        gradient_b2 = np.sum(a2_delta, axis=0, keepdims=True)  #eq (2)
        gradient_w1 = np.dot(x.T, a1_delta)  #eq (3)
        gradient_b1 = np.sum(a1_delta, axis=0)  #eq (5)
        gradients = []
        gradients.append([gradient_w1.T, np.asmatrix(gradient_b1)])
        gradients.append([gradient_w2.T, gradient_b2])
        return gradients
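Example #6 calls a my_sigmoid_derivative helper on the stored activations a1. Since a1 is already sigmoid(z1), the derivative da1/dz1 reduces to a1 * (1 - a1), so a plausible sketch of that helper (an assumption; the method is not shown in the original) is:

    def my_sigmoid_derivative(self, a):
        """Derivative of the sigmoid written in terms of its output a = sigmoid(z)."""
        return a * (1.0 - a)

Note also that because the output layer is a softmax trained with a cross-entropy cost, the combined derivative dC/dz2 collapses to (a2 - I) / num_examples, which is exactly the error term the code starts from.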