def backpropagation(self, input, output):
    '''
    Compute gradients with the back-propagation method

    inputs:
        input: vector with the (embedding) indices of the words of a sentence
        output: vector with the indices of the tags for each word of the sentence

    outputs:
        gradient_parameters: list with the parameter gradients
    '''

    # Get parameters and sizes
    W_e, W_x, W_h, W_y = self.parameters
    nr_steps = input.shape[0]

    log_p_y, y, h, z_e, x = self.log_forward(input)
    p_y = np.exp(log_p_y)

    # Initialize gradients with zero entries
    gradient_W_e = np.zeros(W_e.shape)
    gradient_W_x = np.zeros(W_x.shape)
    gradient_W_h = np.zeros(W_h.shape)
    gradient_W_y = np.zeros(W_y.shape)

    # ----------
    # Solution to Exercise 6.1

    # Gradient of the cost with respect to the last linear model,
    # normalized over the sentence length
    I = index2onehot(output, W_y.shape[0])
    error = (p_y - I) / nr_steps

    # Backward pass, with gradient computation
    error_h_next = np.zeros_like(h[0, :])
    for t in reversed(range(nr_steps)):

        # Output linear
        error_h = np.dot(W_y.T, error[t, :]) + error_h_next

        # Non-linear
        error_raw = h[t+1, :] * (1. - h[t+1, :]) * error_h

        # Hidden linear
        error_h_next = np.dot(W_h.T, error_raw)

        # Weight gradients
        gradient_W_y += np.outer(error[t, :], h[t+1, :])
        gradient_W_h += np.outer(error_raw, h[t, :])
        gradient_W_x += np.outer(error_raw, z_e[t, :])
        gradient_W_e[x[t], :] += W_x.T.dot(error_raw)

    # End of Solution to Exercise 6.1
    # ----------

    gradient_parameters = [
        gradient_W_e, gradient_W_x, gradient_W_h, gradient_W_y
    ]

    return gradient_parameters
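# The functions in this file rely on `index2onehot`, which is not defined in
# this snippet. A minimal sketch of what it presumably does, assuming it maps
# a vector of class indices to a (num_examples, num_classes) one-hot matrix
# (np is assumed imported as in the rest of the module):
def index2onehot(index, num_classes):
    """Transform a vector of indices into a one-hot matrix."""
    num_examples = index.shape[0]
    onehot = np.zeros((num_examples, num_classes))
    for n, idx in enumerate(index):
        onehot[n, idx] = 1.
    return onehot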
def backpropagation(self, input, output):
    """Gradients for sigmoid hidden layers and output softmax"""

    # Run forward and store activations for each layer
    log_prob_y, layer_inputs = self.log_forward(input)
    prob_y = np.exp(log_prob_y)

    num_examples, num_classes = prob_y.shape
    num_hidden_layers = len(self.parameters) - 1

    # For each layer in reverse store the backpropagated error, then compute
    # the gradients from the errors and the layer inputs
    errors = []

    # ----------
    # Solution to Exercise 3.2

    # Initial error is the cost derivative at the last layer (for cross
    # entropy cost)
    I = index2onehot(output, num_classes)
    error = (prob_y - I) / num_examples
    errors.append(error)

    # Backpropagate through each layer
    for n in reversed(range(num_hidden_layers)):

        # Backpropagate through linear layer
        error = np.dot(error, self.parameters[n+1][0])

        # Backpropagate through sigmoid layer
        error *= layer_inputs[n+1] * (1 - layer_inputs[n+1])

        # Collect error
        errors.append(error)

    # Reverse errors
    errors = errors[::-1]

    # Compute gradients from errors
    gradients = []
    for n in range(num_hidden_layers + 1):

        # Weight gradient
        weight_gradient = np.zeros(self.parameters[n][0].shape)
        for l in range(num_examples):
            weight_gradient += np.outer(
                errors[n][l, :],
                layer_inputs[n][l, :]
            )

        # Bias gradient
        bias_gradient = np.sum(errors[n], axis=0, keepdims=True)

        # Store gradients
        gradients.append([weight_gradient, bias_gradient])

    # End of solution to Exercise 3.2
    # ----------

    return gradients
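# A quick way to sanity-check the gradients returned by `backpropagation` is
# to compare them against finite differences of the average cross-entropy
# cost. This is a minimal sketch, assuming `model.parameters` is a list of
# [weight, bias] pairs and `model.log_forward(input)` returns
# (log_prob_y, layer_inputs) as in the function above; the function name
# `gradient_check` is ours, not part of the original class.
import numpy as np

def gradient_check(model, input, output, epsilon=1e-5):

    def cost():
        log_prob_y, _ = model.log_forward(input)
        # Average negative log-likelihood of the correct classes
        return -np.mean(log_prob_y[np.arange(output.shape[0]), output])

    analytic = model.backpropagation(input, output)
    for n, (weight, bias) in enumerate(model.parameters):
        # Check a single weight entry per layer to keep the test cheap
        i, j = 0, 0
        original = weight[i, j]
        weight[i, j] = original + epsilon
        cost_plus = cost()
        weight[i, j] = original - epsilon
        cost_minus = cost()
        weight[i, j] = original
        numeric = (cost_plus - cost_minus) / (2 * epsilon)
        print("layer %d: analytic %e vs numeric %e" %
              (n, analytic[n][0][i, j], numeric))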
def update(self, input=None, output=None):
    """Stochastic Gradient Descent update"""

    # Probabilities of each class
    class_probabilities = np.exp(self.log_forward(input))
    batch_size, num_classes = class_probabilities.shape

    # Error derivative at softmax layer
    I = index2onehot(output, num_classes)
    error = (class_probabilities - I) / batch_size

    # Weight gradient
    gradient_weight = np.zeros(self.weight.shape)
    for l in range(batch_size):
        gradient_weight += np.outer(error[l, :], input[l, :])

    # Bias gradient
    gradient_bias = np.sum(error, axis=0, keepdims=True)

    # SGD update
    self.weight = self.weight - self.learning_rate * gradient_weight
    self.bias = self.bias - self.learning_rate * gradient_bias
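# The `update` method above performs one SGD step on a single mini-batch. A
# sketch of how it would typically be driven over several epochs; the
# function name `train_sgd` and its arguments are illustrative assumptions,
# not part of the original class.
import numpy as np

def train_sgd(model, train_x, train_y, batch_size=30, num_epochs=10):
    num_examples = train_x.shape[0]
    for epoch in range(num_epochs):
        # Visit the examples in a new random order each epoch
        order = np.random.permutation(num_examples)
        for start in range(0, num_examples, batch_size):
            batch = order[start:start + batch_size]
            model.update(input=train_x[batch, :], output=train_y[batch])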
def backpropagation(self, input, output):
    '''
    Network definition:
    +++++++++++++++++++

        [] => node            z1 -> a1 -> z2 -> a2 ->

        [x]
        [x]              [o]
        [x]              [o]               [OP0]
        [.]      w1      [.]      w2
        [.] (13989, 20)  [.]    (20, 2)    [OP1]
        [.]              [o]
        [o]
        [x]

        Input (x)        Hidden layer 1    Output nodes
        13989            20                2

    Definitions:

        z1 = np.dot(self.x, self.w1) + self.b1
        self.a1 = sigmoid(z1)
        z2 = np.dot(self.a1, self.w2) + self.b2
        self.a2 = softmax(z2)

    Derivatives for back-propagation:
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Let C be the cross-entropy loss. We back-propagate this loss to adjust
    the weights (a.k.a. parameters) of our model.

    1. Start from the last layer, hence with w2, b2.

       Using the chain rule,

           dC     dC   da2   dz2
           ---  = --- . --- . ---
           dw2    da2   dz2   dw2

       z2 = a1 . w2 + b2
       a2 = softmax(z2)

       dz2/dw2 = a1
       da2/dz2 = softmax derivative at z2
       dC/da2  = cost function derivative at a2

       Let a2_delta be the product of the first two terms:

                      dC   da2
           a2_delta = --- . ---
                      da2   dz2

       Note: a2_delta equals `error` in the code below (the error derivative
       to be propagated); the model code already computes it for us.

           dC
           ---  = a1^T . a2_delta                                       (1)
           dw2

       For changes in the biases,

           dC     dC   da2   dz2
           ---  = --- . --- . ---
           db2    da2   dz2   db2

       dz2/db2 = 1, and the first two terms are the same as above. Hence,

           dC
           ---  = a2_delta, summed over the batch                       (2)
           db2

    +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    (copied from the flow above, helps with the chain rule)

        z1 -> a1 -> z2 -> a2 ->
        w1 ->       w2 ->

    2. Now do the same for w1, b1.

       z1 = x . w1 + b1
       a1 = sigmoid(z1)

           dC     dC   da1   dz1
           ---  = --- . --- . ---
           dw1    da1   dz1   dw1

       dz1/dw1 = x
       da1/dz1 = sigmoid_derv(z1)                                       (A)

           dC     dC   da2   dz2
           ---  = --- . --- . ---   =>   dC/da1 = a2_delta . w2^T       (B)
           da1    da2   dz2   da1

       Set a1_delta = dC/da1 . da1/dz1. Then

           dC/dw1 = x^T . a1_delta                                      (3)

       where, from (A) and (B),

           a1_delta = (a2_delta . w2^T) * sigmoid_derv(z1)              (4)

           dC     dC   da1   dz1
           ---  = --- . --- . ---  = a1_delta, summed over the batch    (5)
           db1    da1   dz1   db1

    In this model the attribute `parameters` contains the weights and
    biases, with the shapes:

        self.parameters[0][0].shape == (20, 13989)
        self.parameters[0][1].shape == (1, 20)

    13989 is the input dimension and 20 the hidden-layer dimension.
    self.parameters[0][1].shape[0] == 1 is the bias term; there is one bias
    per hidden node.

        self.parameters[1][0].shape == (2, 20)
        self.parameters[1][1].shape == (1, 2)

    `layer_inputs` contains the inputs to the hidden and the output nodes,
    where 30 is the batch size:

        layer_inputs[0].shape == (30, 13989)
        layer_inputs[1].shape == (30, 20)
    '''

    # Run forward and store activations for each layer
    log_prob_y, layer_inputs = self.log_forward(input)
    prob_y = np.exp(log_prob_y)

    num_examples, num_classes = prob_y.shape
    num_hidden_layers = len(self.parameters) - 1

    # Initial error is the cost derivative at the last layer
    # (for cross-entropy cost)
    I = index2onehot(output, num_classes)
    error = (prob_y - I) / num_examples  # cross-entropy derivative

    a2 = prob_y           # output of the last layer
    a1 = layer_inputs[1]  # output of the hidden layer
    x = layer_inputs[0]   # input features

    # Transpose the stored weights: they are stored as (output, input),
    # opposite to the conceptual (input, output) orientation used above
    w2 = self.parameters[1][0].T
    w1 = self.parameters[0][0].T

    # Details on the cross-entropy derivative:
    # http://neuralnetworksanddeeplearning.com/chap3.html#introducing_the_cross-entropy_cost_function
    a2_delta = error
    a1_delta = np.dot(a2_delta, w2.T) * self.my_sigmoid_derivative(a1)  # eq (4)

    gradient_w2 = np.dot(a1.T, a2_delta)                    # eq (1)
    gradient_b2 = np.sum(a2_delta, axis=0, keepdims=True)   # eq (2)
    gradient_w1 = np.dot(x.T, a1_delta)                     # eq (3)
    gradient_b1 = np.sum(a1_delta, axis=0, keepdims=True)   # eq (5)

    gradients = []
    gradients.append([gradient_w1.T, gradient_b1])
    gradients.append([gradient_w2.T, gradient_b2])

    return gradients
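# `my_sigmoid_derivative` is referenced above but not shown. Since it is
# called on the activation a1 (already a sigmoid output), it presumably
# implements sigma'(z) expressed in terms of the activation a = sigmoid(z),
# i.e. a * (1 - a). A minimal sketch under that assumption, written as a
# method of the same class:
def my_sigmoid_derivative(self, activation):
    """Derivative of the sigmoid in terms of its output a = sigmoid(z)."""
    return activation * (1. - activation)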