def backprop3(self, x, y):
    """Mini-batch (matrix-based) backpropagation where the bias vector is folded
    into the weight matrix as an extra first column."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    # feedforward
    activation = x
    activations = [x]  # list to store all the activation matrices, layer by layer
    zs = []  # list to store all the "sum of weighted inputs z" matrices, layer by layer
    for b, w in zip(self.biases, self.weights):
        # insert the vector of biases as the first column of the weight matrix
        w = np.insert(w, 0, b.transpose(), axis=1)
        # insert ones as the first row of the matrix of activations
        activation = np.insert(activation, 0, np.ones(activation[0].shape), 0)
        z = np.dot(w, activation)
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    # backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = np.expand_dims(np.sum(delta, axis=1), axis=1)
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())
    for l in range(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
        nabla_b[-l] = np.expand_dims(np.sum(delta, axis=1), axis=1)
        nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
    return (nabla_b, nabla_w)
def backprop(self, x, y): """Single sample based. Return a tuple ``(nabla_b, nabla_w)`` representing the gradient for the cost function C_x. ``nabla_b`` and ``nabla_w`` are layer-by-layer lists of numpy arrays, similar to ``self.biases`` and ``self.weights``. """ nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] # feedforward activation = x activations = [x] # list to store all the activations, layer by layer zs = [] # list to store all the z vectors, layer by layer for b, w in zip(self.biases, self.weights): z = np.dot(w, activation)+b zs.append(z) activation = sigmoid(z) activations.append(activation) # backward pass delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1]) # IMPORTANT nabla_b[-1] = delta nabla_w[-1] = np.dot(delta, activations[-2].transpose()) for l in range(2, self.num_layers): z = zs[-l] sp = sigmoid_prime(z) delta = np.dot(self.weights[-l+1].transpose(), delta) * sp nabla_b[-l] = delta nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) return (nabla_b, nabla_w)
def backprop_matrix(self, x, y):
    """Full-batch method."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    # feedforward
    activation = x
    activations = [x]  # list to store all the activations, layer by layer
    zs = []  # list to store all the z vectors, layer by layer
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation) + np.repeat(b, activation.shape[1], axis=1)  # IMPORTANT
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    # backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = np.sum(delta, axis=1).reshape([-1, 1])
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())
    for l in range(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
        nabla_b[-l] = np.sum(delta, axis=1).reshape([-1, 1])
        nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
    return (nabla_b, nabla_w)
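The three Network-style methods above call module-level helpers (sigmoid, sigmoid_prime) and a cost_derivative method that are not shown here. A minimal sketch of what they are assumed to look like (the names come from the snippets; the exact bodies are an assumption, following the usual quadratic-cost/sigmoid definitions):

import numpy as np

def sigmoid(z):
    # Logistic function, applied elementwise to the weighted input z.
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(z):
    # Derivative of the sigmoid with respect to its input z.
    return sigmoid(z) * (1.0 - sigmoid(z))

def cost_derivative(output_activations, y):
    # Gradient of the quadratic cost with respect to the output activations
    # (in the snippets above this is a method on the network class).
    return output_activations - y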
def compute_grads(X: np.ndarray, Y: np.ndarray, cache: dict, params: dict) -> dict:
    """
    Compute gradients using the backpropagation algorithm.

    Parameters
    ----------
    X: [n,m] matrix of training examples
    Y: [1,m] matrix of output labels/values
    cache: Dictionary of intermediate values from the forward-propagation step
    params: Dictionary of weights & biases

    Returns
    -------
    a dictionary containing the gradients
    """
    m = X.shape[1]                             # number of training examples (X is [n,m])
    A1 = cache["A1"]                           # [n_h,m]
    A2 = cache["A2"]                           # [n_y,m]
    W2 = params["W2"]                          # [n_y,n_h]

    dA2 = -(Y / A2) + (1 - Y) / (1 - A2)       # [n_y,m]
    dZ2 = dA2 * sigmoid_prime(A2)              # [n_y,m] (sigmoid_prime here takes the activation)
    dW2 = np.dot(dZ2, A1.T) / m                # [n_y,n_h] = [n_y,m] . [m,n_h]
    db2 = np.mean(dZ2, axis=1, keepdims=True)  # [n_y,1]

    dA1 = np.dot(W2.T, dZ2)                    # [n_h,m] = [n_h,n_y] . [n_y,m]
    dZ1 = dA1 * sigmoid_prime(A1)              # [n_h,m]
    dW1 = np.dot(dZ1, X.T) / m                 # [n_h,n] = [n_h,m] . [m,n]
    db1 = np.mean(dZ1, axis=1, keepdims=True)  # [n_h,1]

    return dict(dW1=dW1, db1=db1, dW2=dW2, db2=db2)
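A minimal usage sketch for this two-layer compute_grads. The forward pass, the parameter names W1/b1/W2/b2, the random data, and a sigmoid_prime that takes the activation (a * (1 - a), since the function passes A1/A2 rather than Z1/Z2) are all assumptions made for illustration, not part of the original snippet:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def sigmoid_prime(a):
    # Assumed to take the activation, matching how compute_grads calls it.
    return a * (1.0 - a)

rng = np.random.default_rng(0)
n, n_h, n_y, m = 4, 3, 1, 5                      # feature, hidden, output sizes; batch size
X = rng.standard_normal((n, m))
Y = (rng.random((n_y, m)) > 0.5).astype(float)

params = {"W1": rng.standard_normal((n_h, n)) * 0.01, "b1": np.zeros((n_h, 1)),
          "W2": rng.standard_normal((n_y, n_h)) * 0.01, "b2": np.zeros((n_y, 1))}

# Forward pass to build the cache that compute_grads expects.
A1 = sigmoid(np.dot(params["W1"], X) + params["b1"])
A2 = sigmoid(np.dot(params["W2"], A1) + params["b2"])
cache = {"A1": A1, "A2": A2}

grads = compute_grads(X, Y, cache, params)
print(grads["dW1"].shape, grads["db1"].shape)    # (3, 4) (3, 1)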
def backprop(self, x, y):
    nabla_b = [np.zeros(b.shape) for b in self.b_]
    nabla_w = [np.zeros(w.shape) for w in self.w_]
    activation = x
    activations = [x]
    zs = []
    for b, w in zip(self.b_, self.w_):
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())
    for l in range(2, self.num_layers_):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.w_[-l + 1].transpose(), delta) * sp
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
    return (nabla_b, nabla_w)
def backward_pass(self):
    dCdZ = ((self.Xs[-1] - self.Y) * sigmoid_prime(self.Zs[-1])).T
    dCdWs = [dCdZ * self.Xs[-2] + self.Ws[-1].T * self.regularization]
    dCdBs = [dCdZ]
    for i in range(len(self.Ws) - 1, 0, -1):
        dZdZ = self.Ws[i] * sigmoid_prime(self.Zs[i - 1]).T  # dZdX * dXdZ
        dCdZ = np.dot(dCdZ, dZdZ)
        dCdW = dCdZ * self.Xs[i - 1] + self.Ws[i - 1].T * self.regularization  # dCdZ * dZdW
        dCdWs.append(dCdW)
        dCdBs.append(dCdZ)
    dCdWs = [dCdW.T for dCdW in dCdWs[::-1]]
    dCdBs = [dCdB.T for dCdB in dCdBs[::-1]]
    return dCdWs, dCdBs
def backward_pass(self):
    dCdZ = (self.Xs[-1] - self.Y) * sigmoid_prime(self.Zs[-1])  # 1x10 | 10x1
    dCdWs = [np.dot(dCdZ, self.Xs[-2].T)]  # dCdZ * dZdWs = 1x10 · 10x10x30 = 1x10x30 | 10x1 · 1x30 = 10x30
    for i in range(len(self.sizes) - 2, 0, -1):
        sp = sigmoid_prime(self.Zs[i - 1])
        dCdZ = np.dot(self.Ws[i][:, :-1].T, dCdZ) * sp  # dCdZ * dZdZ = 1x10 · 10x30 = 1x30 | (30x10 · 10x1) * 30x1 = 30x1
        dCdWs.append(np.dot(dCdZ, self.Xs[i - 1].T))  # dCdZ * dZdW = 1x30 · 30x30x100 = 1x30x100 | 30x1 · 1x100 = 30x100
    return dCdWs[::-1]
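backprop3 above folds the bias in as the first column of the weight matrix and augments the activations with a row of ones, while this backward_pass strips a bias column with Ws[i][:, :-1] before backpropagating. A tiny standalone illustration of the augmentation trick (sizes, names, and random data are made up for this example):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
w = rng.standard_normal((5, 3))            # 5 neurons, 3 inputs
b = rng.standard_normal((5, 1))            # bias column vector
a = rng.standard_normal((3, 10))           # batch of 10 input activations

# Fold the bias in as an extra first column and add a matching row of ones,
# so a single matrix product computes w.a + b for the whole batch.
w_aug = np.insert(w, 0, b.transpose(), axis=1)          # shape (5, 4)
a_aug = np.insert(a, 0, np.ones(a[0].shape), axis=0)    # shape (4, 10)

z_aug = np.dot(w_aug, a_aug)
z_ref = np.dot(w, a) + b
print(np.allclose(z_aug, z_ref))           # True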
def prime_activate(self, activation):
    # Derivative of the layer's activation function, evaluated on `activation`.
    if self.hidden_type == "SIGMOID":
        prime = utils.sigmoid_prime(activation)
    elif self.hidden_type == "RELU":
        prime = utils.relu_prime(activation)
    elif self.hidden_type == "LEAKY_RELU":
        prime = utils.leaky_relu_prime(activation)
    elif self.hidden_type == "LINEAR":
        prime = activation
    else:
        raise NotImplementedError("Unrecognised hidden type")
    return prime
def compute_grads(X: np.ndarray, Y: np.ndarray, cache: dict, params: dict) -> dict:
    """
    Compute gradients using the backpropagation algorithm.

    Parameters
    ----------
    X: [n,m] matrix of training examples
    Y: [1,m] matrix of output labels/values
    cache: Dictionary of intermediate values from the forward-propagation step
    params: Dictionary of weights & biases from each layer

    Returns
    -------
    a dictionary containing the gradients computed for each layer
    """
    m = X.shape[1]  # number of training examples (X is [n,m])
    grads = {}
    layers = len(params)

    # Output layer: derivative of the cross-entropy cost w.r.t. the activations.
    A = cache[layers]["A"]
    dA = -(Y / A) + (1 - Y) / (1 - A)
    dZ = dA * sigmoid_prime(A)
    grads[layers] = {"dA": dA, "dZ": dZ}

    # Propagate the error backwards through the hidden layers.
    for l in range(layers - 1, 0, -1):
        next_layer = l + 1
        dA = np.dot(params[next_layer]["W"].T, grads[next_layer]["dZ"])
        dZ = dA * sigmoid_prime(cache[l]["A"])
        grads[l] = {"dA": dA, "dZ": dZ}

    # Weight and bias gradients for every layer.
    for l in range(1, layers + 1):
        grads[l]["dW"] = np.dot(grads[l]["dZ"], cache[l - 1]["A"].T) / m
        grads[l]["db"] = np.mean(grads[l]["dZ"], axis=1, keepdims=True)

    return grads
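A quick way to sanity-check any of these backprop routines is a finite-difference gradient check. The self-contained sketch below is an assumption-heavy illustration (its forward pass, cost, parameter names, and numerical_grad helper are mine, not from the original code); it compares the analytic dW2 of a one-hidden-layer sigmoid network against a numerical estimate of the mean cross-entropy cost:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def numerical_grad(f, x, eps=1e-6):
    # Central finite differences of the scalar function f with respect to x, elementwise.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        plus = f()
        x[idx] = orig - eps
        minus = f()
        x[idx] = orig
        grad[idx] = (plus - minus) / (2 * eps)
        it.iternext()
    return grad

rng = np.random.default_rng(1)
n, n_h, n_y, m = 3, 4, 1, 6
X = rng.standard_normal((n, m))
Y = (rng.random((n_y, m)) > 0.5).astype(float)
params = {"W1": rng.standard_normal((n_h, n)), "b1": np.zeros((n_h, 1)),
          "W2": rng.standard_normal((n_y, n_h)), "b2": np.zeros((n_y, 1))}

def forward():
    A1 = sigmoid(np.dot(params["W1"], X) + params["b1"])
    A2 = sigmoid(np.dot(params["W2"], A1) + params["b2"])
    return A1, A2

def cost():
    # Mean cross-entropy, matching the dA = -(Y/A) + (1-Y)/(1-A) derivative above.
    _, A2 = forward()
    return -np.mean(np.sum(Y * np.log(A2) + (1 - Y) * np.log(1 - A2), axis=0))

A1, A2 = forward()
dZ2 = A2 - Y                        # analytic: dA2 * sigmoid'(Z2) simplifies to A2 - Y
dW2 = np.dot(dZ2, A1.T) / m
# Difference should be on the order of 1e-8 or smaller.
print(np.max(np.abs(dW2 - numerical_grad(cost, params["W2"]))))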
def backProp(self, X, y, lmbda):
    a1, z2, a2, z3, h = self.feedForward(X)
    J = self.Cost(h, y, lmbda)
    m = X.shape[0]
    delta1 = np.zeros(self.weights[0].shape)  # (3, 6)
    delta2 = np.zeros(self.weights[1].shape)  # (3, 4)
    ones = np.ones((m, 1))
    diff = h - y
    z2 = np.hstack((ones, z2))  # (5,4)
    d2 = np.multiply(np.dot(self.weights[1].T, diff.T).T,
                     utils.sigmoid_prime(z2))  # (5000, 26)
    delta1 += np.dot((d2[:, 1:]).T, a1)
    delta2 += np.dot(diff.T, a2)
    delta1 = delta1 / m
    delta2 = delta2 / m
    # Add the regularisation, but not to the bias terms
    delta1[:, 1:] = delta1[:, 1:] + (self.weights[0][:, 1:] * lmbda) / m
    delta2[:, 1:] = delta2[:, 1:] + (self.weights[1][:, 1:] * lmbda) / m
    return J, [delta1, delta2]
def cost(self, theta, indices, weights_shape, biases_shape, lambda_, sparsity, beta,
         data, cost_fct, log_cost=True):
    if cost_fct == 'cross-entropy':
        if beta != 0 or sparsity != 0:
            beta = 0
            sparsity = 0
            print('WARNING: Cross-entropy does not support sparsity')

    # Unrolling the weights and biases
    for jj in range(self.mid * 2):
        w, b = self._unroll(theta, jj, indices, weights_shape, biases_shape)
        self.layers[jj].weights = w
        self.layers[jj].hidden_biases = b

    # Number of training examples
    m = data.shape[1]

    # Forward pass
    h = self.feedforward(data.T).T

    # Sparsity
    sparsity_cost = 0

    wgrad = []
    bgrad = []

    ###########################################################################
    # Cost function
    if cost_fct == 'L2':
        # Back-propagation
        delta = -(data - h)

        # Compute the gradient:
        for jj in range(self.mid * 2 - 1, -1, -1):
            if jj < self.mid * 2 - 1:
                # TODO: Sparsity: do we want it at every (hidden) layer ??
                hn = self.layers[jj].output.T.shape[0]
                rho_hat = np.mean(self.layers[jj].output.T, axis=1)
                rho = np.tile(sparsity, hn)

                if beta == 0:
                    sparsity_delta = 0
                    sparsity_cost = 0
                else:
                    sparsity_delta = np.tile(
                        -rho / rho_hat + (1 - rho) / (1 - rho_hat), (m, 1)).transpose()
                    sparsity_cost += beta * np.sum(utils.KL_divergence(rho, rho_hat))

                delta = self.layers[jj + 1].weights.dot(delta) + beta * sparsity_delta

                if self.layers[jj].hidden_type == 'SIGMOID':
                    delta *= utils.sigmoid_prime(self.layers[jj].activation.T)
                elif self.layers[jj].hidden_type == 'RELU':
                    delta *= utils.relu_prime(self.layers[jj].activation.T)
                elif self.layers[jj].hidden_type == 'LINEAR':
                    pass  # Nothing more to do
                else:
                    raise NotImplementedError(
                        "Hidden type %s not implemented" % self.layers[jj].hidden_type)

            grad_w = delta.dot(self.layers[jj].input) / m + lambda_ * self.layers[jj].weights.T / m
            grad_b = np.mean(delta, axis=1)

            wgrad.append(grad_w.T)
            bgrad.append(grad_b)

        # Reverse the order since back-propagation goes backwards
        wgrad = wgrad[::-1]
        bgrad = bgrad[::-1]

        # Computes the L2 norm + regularisation
        # TODO: COST MISSES THE COMPLETE SPARSITY
        cost = np.sum((h - data) ** 2) / (2 * m) + (lambda_ / 2) * \
            (sum([((self.layers[jj].weights) ** 2).sum() for jj in range(self.mid * 2)])) + \
            sparsity_cost

    elif cost_fct == 'cross-entropy':
        # Compute the gradients:
        # http://neuralnetworksanddeeplearning.com/chap3.html for details
        dEda = None
        for jj in range(self.mid * 2 - 1, -1, -1):
            # The output of the layer right before is
            if jj - 1 < 0:
                hn = data.T
            else:
                hn = self.layers[jj - 1].output

            # If last layer, we compute the delta = output - expectation
            if dEda is None:
                dEda = h - data
            else:
                wp1 = self.layers[jj + 1].weights
                a = self.layers[jj].output
                dEda = wp1.dot(dEda) * (a * (1. - a)).T

            dEdb = np.mean(dEda, axis=1)
            dEdw = dEda.dot(hn) / m
            dEdw = dEdw.T

            wgrad.append(dEdw)
            bgrad.append(dEdb)

        # Reverse the order since back-propagation goes backwards
        wgrad = wgrad[::-1]
        bgrad = bgrad[::-1]

        # Computes the cross-entropy
        cost = -np.sum(data * np.log(h) + (1. - data) * np.log(1. - h), axis=0)
        cost = np.mean(cost)
    else:
        raise NotImplementedError()

    if log_cost:
        self.train_history.append(cost)

    # Returns the gradient as a vector.
    grad = self._roll(wgrad, bgrad, return_info=False)

    return cost, grad
def delta(z, a, y):
    """Return the error delta from the output layer."""
    return (a - y) * sigmoid_prime(z)
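This is the output-layer delta for a quadratic cost. For comparison, with a cross-entropy cost and a sigmoid output layer the sigmoid_prime factor cancels against the cost derivative, so the corresponding delta (the form used by Nielsen-style CrossEntropyCost classes) reduces to:

def delta(z, a, y):
    # Cross-entropy output-layer error: the sigmoid_prime(z) factor cancels,
    # leaving just the difference between output activation and target.
    return (a - y)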
def cost(self, theta, indices, weights_shape, biases_shape, lambda_, sparsity, beta,
         data, corruption, cost_fct, dropout, log_cost=True):
    if cost_fct == 'cross-entropy':
        if beta != 0 or sparsity != 0:
            beta = 0
            sparsity = 0
            # print 'WARNING: Cross-entropy does not support sparsity'

    # Unrolling the weights and biases
    for jj in range(self.mid * 2):
        w, b = self._unroll(theta, jj, indices, weights_shape, biases_shape)
        self.layers[jj].weights = w
        self.layers[jj].hidden_biases = b

    # Number of training examples
    m = data.shape[1]

    # Forward pass
    if corruption is not None:
        cdata = self._corrupt(data, corruption)
    else:
        cdata = data
    ch = self.feedforward(cdata.T, dropout=dropout).T
    h = self.feedforward(data.T, dropout=dropout).T

    # Sparsity
    sparsity_cost = 0

    wgrad = []
    bgrad = []

    ###########################################################################
    # Cost function
    if cost_fct == 'L2':
        # Back-propagation
        delta = -(data - ch)

        # Compute the gradient:
        for jj in range(self.mid * 2 - 1, -1, -1):
            if jj < self.mid * 2 - 1:
                hn = self.layers[jj].output.T.shape[0]
                rho_hat = np.mean(self.layers[jj].output.T, axis=1)

                if beta == 0:
                    sparsity_grad = 0
                    sparsity_cost = 0
                else:
                    rho = sparsity
                    sparsity_cost += beta * np.sum(u.KL_divergence(rho, rho_hat))
                    sparsity_grad = beta * u.KL_prime(rho, rho_hat)
                    sparsity_grad = np.matrix(sparsity_grad).T

                delta = self.layers[jj + 1].weights.dot(delta) + beta * sparsity_grad
                delta = np.array(delta)

                if self.layers[jj].hidden_type == 'SIGMOID':
                    delta *= u.sigmoid_prime(self.layers[jj].activation.T)
                elif self.layers[jj].hidden_type == 'RELU':
                    delta *= u.relu_prime(self.layers[jj].activation.T)
                elif self.layers[jj].hidden_type == 'LEAKY_RELU':
                    delta *= u.leaky_relu_prime(self.layers[jj].activation.T)
                elif self.layers[jj].hidden_type == 'LINEAR':
                    pass
                else:
                    raise ValueError("Unknown activation function %s" % self.layers[jj].hidden_type)

            grad_w = delta.dot(self.layers[jj].input) / m + lambda_ * self.layers[jj].weights.T
            grad_b = np.mean(delta, axis=1)

            wgrad.append(grad_w.T)
            bgrad.append(grad_b)

        # Reverse the order since back-propagation goes backwards
        wgrad = wgrad[::-1]
        bgrad = bgrad[::-1]

        # Computes the L2 norm + regularisation
        # TODO: COST MISSES THE COMPLETE SPARSITY
        cost = np.sum((h - data) ** 2) / (2 * m) + (lambda_ / 2) * \
            (sum([((self.layers[jj].weights) ** 2).sum() for jj in range(self.mid * 2)])) + \
            sparsity_cost

    elif cost_fct == 'cross-entropy':
        # Compute the gradients:
        # http://neuralnetworksanddeeplearning.com/chap3.html for details
        dEda = None
        for jj in range(self.mid * 2 - 1, -1, -1):
            # The output of the layer right before is
            if jj - 1 < 0:
                hn = data.T
            else:
                hn = self.layers[jj - 1].output

            # If last layer, we compute the delta = output - expectation
            if dEda is None:
                dEda = ch - data
            else:
                wp1 = self.layers[jj + 1].weights
                if corruption is None:
                    a = self.layers[jj].output
                else:
                    a = self.feedforward_to_layer(cdata.T, jj)
                dEda = wp1.dot(dEda) * (a * (1. - a)).T

            dEdb = np.mean(dEda, axis=1)
            dEdw = dEda.dot(hn) / m + lambda_ * self.layers[jj].weights.T
            dEdw = dEdw.T

            wgrad.append(dEdw)
            bgrad.append(dEdb)

        # Reverse the order since back-propagation goes backwards
        wgrad = wgrad[::-1]
        bgrad = bgrad[::-1]

        # Computes the cross-entropy
        cost = -np.sum(data * np.log(ch) + (1. - data) * np.log(1. - ch), axis=0)
        cost = np.mean(cost)

    if log_cost:
        self.train_history.append(cost)

    # Returns the gradient as a vector.
    grad = self._roll(wgrad, bgrad, return_info=False)

    return cost, grad
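Both autoencoder cost functions above lean on KL-divergence helpers (utils.KL_divergence, u.KL_prime) for the sparsity penalty, whose implementations are not shown. A plausible sketch, assuming the standard Bernoulli KL divergence from the sparse-autoencoder literature (it also matches the inline sparsity_delta expression in the first cost function), would be:

import numpy as np

def KL_divergence(rho, rho_hat):
    # KL divergence between two Bernoulli distributions with means rho and rho_hat.
    return rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat))

def KL_prime(rho, rho_hat):
    # Derivative of the KL divergence with respect to rho_hat.
    return -rho / rho_hat + (1 - rho) / (1 - rho_hat)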