Example #1
    def backprop3(self, x, y):

        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # list to store all the activation matrices, layer by layer
        zs = []  # list to store all the "sum of weighted inputs z" matrices, layer by layer
        for b, w in zip(self.biases, self.weights):
            # insert the vector of biases as the first column of the weight matrix
            w = np.insert(w, 0, b.transpose(), axis=1)
            # insert a row of ones as the first row of the activation matrix,
            # so that np.dot(w, activation) computes w.a + b in a single product
            activation = np.insert(activation, 0, np.ones(activation[0].shape), 0)
            z = np.dot(w, activation)
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = np.expand_dims(np.sum(delta, axis=1), axis=1)
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = np.expand_dims(np.sum(delta, axis=1), axis=1)
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())

        return (nabla_b, nabla_w)
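A quick sanity check of the augmented-matrix trick used above, as a standalone sketch (not part of the original snippet): folding the biases into the first column of the weight matrix and prepending a row of ones to the activations yields the same pre-activation as computing np.dot(w, a) + b directly.

import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((3, 4))                  # weights of one layer
b = rng.standard_normal((3, 1))                  # biases of that layer
a = rng.standard_normal((4, 5))                  # 5 activation column vectors

w_aug = np.insert(w, 0, b.transpose(), axis=1)   # bias folded in as column 0
a_aug = np.insert(a, 0, np.ones(a[0].shape), 0)  # row of ones prepended

assert np.allclose(np.dot(w_aug, a_aug), np.dot(w, a) + b)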
Example #2
    def backprop(self, x, y):
        """Single sample based.

        Return a tuple ``(nabla_b, nabla_w)`` representing the
        gradient for the cost function C_x.  ``nabla_b`` and
        ``nabla_w`` are layer-by-layer lists of numpy arrays, similar
        to ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])   # IMPORTANT
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)
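A minimal sketch (not from the source) of how the returned (nabla_b, nabla_w) pair is typically consumed in a gradient-descent update; net is an assumed object exposing biases, weights and the backprop method above, and eta is an assumed learning rate.

eta = 3.0                                        # learning rate (assumed value)
nabla_b, nabla_w = net.backprop(x, y)            # x, y: a single training example
net.weights = [w - eta * nw for w, nw in zip(net.weights, nabla_w)]
net.biases = [b - eta * nb for b, nb in zip(net.biases, nabla_b)]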
Example #3
    def backprop_matrix(self, x, y):
        """Full-batch method.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # feedforward
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + np.repeat(b, activation.shape[1], axis=1)   # IMPORTANT
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = np.sum(delta, axis=1).reshape([-1, 1])
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = np.sum(delta, axis=1).reshape([-1, 1])
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)
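In contrast to Example #2, x and y here are matrices whose columns are individual examples, and nabla_b sums delta over the batch axis. A hedged usage sketch (net, eta, and the lists of column vectors batch_xs/batch_ys are assumptions):

X = np.hstack(batch_xs)                          # (n_in, batch): one column per example
Y = np.hstack(batch_ys)                          # (n_out, batch): one column per example
nabla_b, nabla_w = net.backprop_matrix(X, Y)     # gradients summed over the whole batch
net.weights = [w - (eta / X.shape[1]) * nw for w, nw in zip(net.weights, nabla_w)]
net.biases = [b - (eta / X.shape[1]) * nb for b, nb in zip(net.biases, nabla_b)]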
Example #4
def compute_grads(X: np.ndarray, Y: np.ndarray, cache: dict, params: dict) -> dict:
    """
    Compute gradients using backpropagation algorithm

    Parameters
    ----------
    X: [n,m] matrix of training examples
    Y: [1,m] matrix of output labels/values
    cache: Dictionary of intermediate values from forward-propagation step
    params: Dictionary of weights & biases

    Returns
    ---------
    a dictionary containing the gradients
    """

    m = X.shape[1]  # number of training examples (X is [n,m], one column per example)
    A1 = cache["A1"]  # [n_h,m]
    A2 = cache["A2"]  # [n_y,m]
    W2 = params["W2"]  # [n_y,n_h]

    dA2 = -(Y / A2) + (1 - Y) / (1 - A2)  # [n_y,m]
    dZ2 = dA2 * sigmoid_prime(A2)  # [n_y,m]

    dW2 = np.dot(dZ2, A1.T) / m  #  [n_y,n_h] = [n_y,m] . [m,n_h]
    db2 = np.mean(dZ2, axis=1, keepdims=True)  # [n_y,1]

    dA1 = np.dot(W2.T, dZ2)  # [n_h,m] =  [n_h,n_y] . [n_y,m]
    dZ1 = dA1 * sigmoid_prime(A1)  # [n_h,m]

    dW1 = np.dot(dZ1, X.T) / m  # [n_h,n] = [n_h,m] . [ m,n]
    db1 = np.mean(dZ1, axis=1, keepdims=True)  # [n_h,1]

    return dict(dW1=dW1, db1=db1, dW2=dW2, db2=db2)
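A sketch of the dictionary layout this helper expects, inferred from the keys it reads (A1, A2, W2) and the docstring shapes; the forward pass that fills cache is not shown in the source, so the logistic activation is inlined here, and compute_grads still needs the module's own sigmoid_prime helper.

import numpy as np

n, n_h, n_y, m = 4, 3, 1, 5
rng = np.random.default_rng(0)
X = rng.standard_normal((n, m))                                        # [n,m] inputs
Y = rng.integers(0, 2, size=(n_y, m)).astype(float)                    # [1,m] labels
params = {"W1": rng.standard_normal((n_h, n)), "b1": np.zeros((n_h, 1)),
          "W2": rng.standard_normal((n_y, n_h)), "b2": np.zeros((n_y, 1))}
A1 = 1.0 / (1.0 + np.exp(-(np.dot(params["W1"], X) + params["b1"])))   # [n_h,m] hidden activations
A2 = 1.0 / (1.0 + np.exp(-(np.dot(params["W2"], A1) + params["b2"])))  # [n_y,m] output activations
cache = {"A1": A1, "A2": A2}
grads = compute_grads(X, Y, cache, params)                             # {"dW1","db1","dW2","db2"}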
Example #5
    def backprop(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.b_]
        nabla_w = [np.zeros(w.shape) for w in self.w_]

        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.b_, self.w_):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        delta = self.cost_derivative(activations[-1], y) * \
                sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        for l in range(2, self.num_layers_):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.w_[-l + 1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (nabla_b, nabla_w)
Example #6
    def backward_pass(self):
        dCdZ = ((self.Xs[-1] - self.Y) * sigmoid_prime(self.Zs[-1])).T
        dCdWs = [dCdZ * self.Xs[-2] + self.Ws[-1].T * self.regularization]
        dCdBs = [dCdZ]

        for i in range(len(self.Ws) - 1, 0, -1):
            dZdZ = self.Ws[i] * sigmoid_prime(self.Zs[i - 1]).T  #dZdX * dXdZ
            dCdZ = np.dot(dCdZ, dZdZ)
            dCdW = dCdZ * self.Xs[i - 1] + self.Ws[
                i - 1].T * self.regularization  #dCdZ * dZdW
            dCdWs.append(dCdW)
            dCdBs.append(dCdZ)

        dCdWs = [dCdW.T for dCdW in dCdWs[::-1]]
        dCdBs = [dCdB.T for dCdB in dCdBs[::-1]]
        return dCdWs, dCdBs
Example #7
    def backward_pass(self):
        dCdZ = (
            (self.Xs[-1] - self.Y) * sigmoid_prime(self.Zs[-1]))  # 1x10 | 10x1
        dCdWs = [
            np.dot(dCdZ, self.Xs[-2].T)
        ]  # dCdZ * dZdWs = 1x10 · 10x10x30 = 1x10x30 | 10x1 · 1x30 = 10x30

        for i in range(len(self.sizes) - 2, 0, -1):
            sp = sigmoid_prime(self.Zs[i - 1])
            dCdZ = np.dot(
                self.Ws[i][:, :-1].T, dCdZ
            ) * sp  # dCdZ * dZdZ = 1x10 · 10·30 = 1x30 | (30x10 · 10x1) * 30x1 = 30x1
            dCdWs.append(
                np.dot(dCdZ, self.Xs[i - 1].T)
            )  # dCdZ * dZdW = 1x30 · 30x30x100 = 1x30x100 | 30x1 · 1x100 = 30x100

        return dCdWs[::-1]
Example #8
	def prime_activate(self, activation):
		if self.hidden_type == "SIGMOID":
			prime = utils.sigmoid_prime(activation)
		elif self.hidden_type == "RELU":
			prime = utils.relu_prime(activation)
		elif self.hidden_type == "LEAKY_RELU":
			prime = utils.leaky_relu_prime(activation)
		elif self.hidden_type == "LINEAR":
			# the derivative of the identity activation is 1
			prime = np.ones_like(activation)
		else:
			raise NotImplementedError("Unrecognised hidden type: %s" % self.hidden_type)

		return prime
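For reference, standard formulations of the derivatives this dispatcher relies on; the actual utils module is not shown in the source, so treat these as assumptions (note the dispatcher is handed the activation, not the pre-activation):

import numpy as np

def sigmoid_prime(a):
	# assuming `a` is already the sigmoid output: sigma'(z) = a * (1 - a)
	return a * (1.0 - a)

def relu_prime(a):
	# for ReLU outputs, a > 0 exactly where z > 0
	return (a > 0).astype(a.dtype)

def leaky_relu_prime(a, alpha=0.01):
	# `alpha` is an assumed slope for the negative part
	return np.where(a > 0, 1.0, alpha)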
Example #9
def compute_grads(X: np.ndarray, Y: np.ndarray, cache: dict,
                  params: dict) -> dict:
    """
    Compute gradients using backpropagation algorithm

    Parameters
    ----------
    X: [n,m] matrix of training examples
    Y: [1,m] matrix of output labels/values
    cache: Dictionary of intermediate values from forward-propagation step
    params: Dictionary of weights & biases from each layer

    Returns
    ---------
    a dictionary containing the gradients computed for each layer
    """

    m = X.shape[1]  # number of training examples (X is [n,m], one column per example)
    grads = {}
    layers = len(params)

    A = cache[layers]["A"]
    dA = -(Y / A) + (1 - Y) / (1 - A)
    dZ = dA * sigmoid_prime(A)
    grads[layers] = {"dA": dA, "dZ": dZ}

    for l in range(layers - 1, 0, -1):
        next_layer = l + 1
        dA = np.dot(params[next_layer]["W"].T, grads[next_layer]["dZ"])
        dZ = dA * sigmoid_prime(cache[l]["A"])
        grads[l] = {"dA": dA, "dZ": dZ}

    for l in range(1, layers + 1):
        grads[l]["dW"] = np.dot(grads[l]["dZ"], cache[l - 1]["A"].T) / m
        grads[l]["db"] = np.mean(grads[l]["dZ"], axis=1, keepdims=True)

    return grads
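A sketch of the layer-indexed layout assumed here, inferred from the look-ups above; in particular cache[0]["A"] has to hold the input X for the l == 1 weight gradient, and compute_grads still needs the module's own sigmoid_prime helper.

import numpy as np

def sigmoid(z):
    # assumed logistic helper for the forward pass
    return 1.0 / (1.0 + np.exp(-z))

n, n_h, n_y, m = 4, 3, 1, 5
rng = np.random.default_rng(0)
X = rng.standard_normal((n, m))
Y = rng.integers(0, 2, size=(n_y, m)).astype(float)
params = {1: {"W": rng.standard_normal((n_h, n)), "b": np.zeros((n_h, 1))},
          2: {"W": rng.standard_normal((n_y, n_h)), "b": np.zeros((n_y, 1))}}
cache = {0: {"A": X}}                                # layer 0 holds the raw input
A = X
for l in range(1, len(params) + 1):                  # forward pass filling the cache
    A = sigmoid(np.dot(params[l]["W"], A) + params[l]["b"])
    cache[l] = {"A": A}
grads = compute_grads(X, Y, cache, params)           # grads[l]: {"dA", "dZ", "dW", "db"}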
Example #10
  def backProp(self, X, y, lmbda):
    a1, z2, a2, z3, h = self.feedForward(X)
    J = self.Cost(h,y,lmbda)
    m = X.shape[0]
    delta1 = np.zeros(self.weights[0].shape)  # (3, 6)
    delta2 = np.zeros(self.weights[1].shape) # (3, 4)
    ones = np.ones((m,1))
    diff = h - y
    z2 = np.hstack((ones, z2)) # (5,4)
    d2 = np.multiply(np.dot(self.weights[1].T, diff.T).T, utils.sigmoid_prime(z2))  # (5000, 26)
    delta1 += np.dot((d2[:, 1:]).T, a1)
    delta2 += np.dot(diff.T, a2)

    delta1 = delta1 / m
    delta2 = delta2 / m
    
    # Add the regularization, but not to the bias column
    delta1[:, 1:] = delta1[:, 1:] + (self.weights[0][:, 1:] * lmbda) / m
    delta2[:, 1:] = delta2[:, 1:] + (self.weights[1][:, 1:] * lmbda) / m

    return J, [delta1, delta2]
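For reference, the last three statements implement the standard regularized gradient, with the bias column (index 0) excluded from the penalty:

\frac{\partial J}{\partial \Theta^{(l)}_{ij}} = \frac{1}{m}\,\Delta^{(l)}_{ij} + \frac{\lambda}{m}\,\Theta^{(l)}_{ij} \quad (j \ge 1),
\qquad
\frac{\partial J}{\partial \Theta^{(l)}_{i0}} = \frac{1}{m}\,\Delta^{(l)}_{i0}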
Example #11
	def cost(self, theta, indices, weights_shape, biases_shape, lambda_, sparsity, beta,\
			 data, cost_fct, log_cost=True):

		if cost_fct == 'cross-entropy':
			if beta != 0 or sparsity != 0:
				beta = 0
				sparsity = 0
				print('WARNING: Cross-entropy does not support sparsity')

		# Unrolling the weights and biases
		for jj in range(self.mid * 2):
			w, b = self._unroll(theta, jj, indices, weights_shape, biases_shape)

			self.layers[jj].weights = w
			self.layers[jj].hidden_biases = b
	
		# Number of training examples
		m = data.shape[1]

		# Forward pass
		h = self.feedforward(data.T).T

		# Sparsity
		sparsity_cost = 0
		
		wgrad = []
		bgrad = []
		
		############################################################################################
		# Cost function
		if cost_fct == 'L2':
			
			# Back-propagation
			delta = -(data - h)
		
			# Compute the gradient:
			for jj in range(self.mid * 2 - 1, -1, -1):

				if jj < self.mid * 2 - 1:
					# TODO: Sparsity: do we want it at every (hidden) layer ??
					hn = self.layers[jj].output.T.shape[0]
					rho_hat = np.mean(self.layers[jj].output.T, axis=1)
					rho = np.tile(sparsity, hn)
					if beta == 0:
						sparsity_delta = 0
						sparsity_cost = 0
					else:
						sparsity_delta = np.tile(- rho / rho_hat + (1 - rho) / (1 - rho_hat), (m, 1)).transpose()
						sparsity_cost += beta * np.sum(utils.KL_divergence(rho, rho_hat))
	
					delta = self.layers[jj+1].weights.dot(delta) + beta * sparsity_delta

				if self.layers[jj].hidden_type == 'SIGMOID':
					delta *= utils.sigmoid_prime(self.layers[jj].activation.T)
				elif self.layers[jj].hidden_type == 'RELU':
					delta *= utils.relu_prime(self.layers[jj].activation.T)
				elif self.layers[jj].hidden_type == 'LINEAR':
					pass # Nothing more to do
				else:
					raise NotImplementedError("Hidden type %s not implemented" % self.layers[jj].hidden_type)
				
				grad_w = delta.dot(self.layers[jj].input) / m + lambda_ * self.layers[jj].weights.T / m
				grad_b = np.mean(delta, axis=1)
				wgrad.append(grad_w.T)
				bgrad.append(grad_b)
					
			# Reverse the order since back-propagation goes backwards 
			wgrad = wgrad[::-1]
			bgrad = bgrad[::-1]
			
			# Computes the L2 norm + regularisation
			#TODO: COST MISSES THE COMPLETE SPARSITY !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			cost = np.sum((h - data) ** 2) / (2 * m) + (lambda_ / 2) * \
				(sum([((self.layers[jj].weights)**2).sum() for jj in range(self.mid * 2)])) + \
				sparsity_cost
		elif cost_fct == 'cross-entropy':
			# Compute the gradients:
			# http://neuralnetworksanddeeplearning.com/chap3.html for details
			dEda = None
			
			for jj in range(self.mid * 2 - 1, -1, -1):
				#print jj, '-------' * 6
				# The output of the layer right before is
				if jj - 1 < 0:
					hn = data.T
				else:
					hn = self.layers[jj-1].output
				
				# If last layer, we compute the delta = output - expectation
				if dEda is None: 
					dEda = h - data
				else:
					wp1 = self.layers[jj+1].weights
					a = self.layers[jj].output
					dEda = wp1.dot(dEda) * (a * (1. - a)).T
					
				dEdb = np.mean(dEda, axis=1)

				dEdw = dEda.dot(hn) / m
				dEdw = dEdw.T
				
				wgrad.append(dEdw)
				bgrad.append(dEdb)
	
			# Reverse the order since back-propagation goes backwards 
			wgrad = wgrad[::-1]
			bgrad = bgrad[::-1]

			# Computes the cross-entropy
			cost = - np.sum(data * np.log(h) + (1. - data) * np.log(1. - h), axis=0) 
			cost = np.mean(cost)
		else: 
			raise NotImplementedError("Unknown cost function: %s" % cost_fct)
		if log_cost:
			self.train_history.append(cost)
		
		#exit()
		# Returns the gradient as a vector.
		grad = self._roll(wgrad, bgrad, return_info=False)
		return cost, grad
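For reference, the sparsity penalty accumulated in the L2 branch above is the usual KL term over hidden units (rho is the target sparsity, rho_hat_j the mean activation of hidden unit j), and sparsity_delta is its derivative with respect to rho_hat_j:

\beta \sum_j \mathrm{KL}(\rho\,\|\,\hat\rho_j) = \beta \sum_j \Big[\rho\log\frac{\rho}{\hat\rho_j} + (1-\rho)\log\frac{1-\rho}{1-\hat\rho_j}\Big],
\qquad
\frac{\partial}{\partial\hat\rho_j} = \beta\Big(-\frac{\rho}{\hat\rho_j} + \frac{1-\rho}{1-\hat\rho_j}\Big)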
Example #12
 def delta(z, a, y):
     """Return the error delta from the output layer."""
     return (a - y) * sigmoid_prime(z)
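This is the output-layer error for the quadratic cost: with C = \tfrac{1}{2}\lVert a - y\rVert^2, the gradient with respect to the output activation is \nabla_a C = (a - y), so

\delta^L = \nabla_a C \odot \sigma'(z^L) = (a^L - y)\,\sigma'(z^L)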
Example #13
	def cost(self, theta, indices, weights_shape, biases_shape, lambda_, sparsity, beta,\
			 data, corruption, cost_fct, dropout, log_cost=True):

		if cost_fct == 'cross-entropy':
			if beta != 0 or sparsity != 0:
				beta = 0
				sparsity = 0
				#print 'WARNING: Cross-entropy does not support sparsity'

		# Unrolling the weights and biases
		for jj in range(self.mid * 2):
			w, b = self._unroll(theta, jj, indices, weights_shape, biases_shape)

			self.layers[jj].weights = w
			self.layers[jj].hidden_biases = b
	
		# Number of training examples
		m = data.shape[1]

		# Forward pass
			
		if corruption is not None:
			cdata = self._corrupt(data, corruption)
		else:
			cdata = data
		ch = self.feedforward(cdata.T, dropout=dropout).T
		h = self.feedforward(data.T, dropout=dropout).T

		# Sparsity
		sparsity_cost = 0
		
		wgrad = []
		bgrad = []
		
		############################################################################################
		# Cost function

		if cost_fct == 'L2':
		
			# Back-propagation
			delta = -(data - ch)
		
			# Compute the gradient:
			for jj in range(self.mid * 2 - 1, -1, -1):
				if jj < self.mid * 2 - 1:

					hn = self.layers[jj].output.T.shape[0]
					rho_hat = np.mean(self.layers[jj].output.T, axis=1)

					if beta == 0:
						sparsity_grad = 0
						sparsity_cost = 0
					else:
						rho = sparsity
						
						sparsity_cost += beta * np.sum(u.KL_divergence(rho, rho_hat))
						sparsity_grad = beta * u.KL_prime(rho, rho_hat)
						sparsity_grad = np.matrix(sparsity_grad).T
	
					delta = self.layers[jj+1].weights.dot(delta) + beta * sparsity_grad
					delta = np.array(delta)
				
				if self.layers[jj].hidden_type == 'SIGMOID':
					delta *= u.sigmoid_prime(self.layers[jj].activation.T)
				elif self.layers[jj].hidden_type == 'RELU':
					delta *= u.relu_prime(self.layers[jj].activation.T)
				elif self.layers[jj].hidden_type == 'LEAKY_RELU':
					delta *= u.leaky_relu_prime(self.layers[jj].activation.T)
				elif self.layers[jj].hidden_type == 'LINEAR':
					pass 
				else:
					raise ValueError("Unknown activation function %s" % self.layers[jj].hidden_type)
				
				grad_w = delta.dot(self.layers[jj].input) / m + lambda_ * self.layers[jj].weights.T
				grad_b = np.mean(delta, axis=1)
				wgrad.append(grad_w.T)
				bgrad.append(grad_b)
					
			# Reverse the order since back-propagation goes backwards 
			wgrad = wgrad[::-1]
			bgrad = bgrad[::-1]
			
			# Computes the L2 norm + regularisation
			#TODO: COST MISSES THE COMPLETE SPARSITY !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			cost = np.sum((h - data) ** 2) / (2 * m) + (lambda_ / 2) * \
				(sum([((self.layers[jj].weights)**2).sum() for jj in range(self.mid * 2)])) + \
				sparsity_cost
			#print 'tot cost', cost
		elif cost_fct == 'cross-entropy':
			# Compute the gradients:
			# http://neuralnetworksanddeeplearning.com/chap3.html for details
			dEda = None
			
			for jj in range(self.mid * 2 - 1, -1, -1):
				#print jj, '-------' * 6
				# The output of the layer right before is
				if jj - 1 < 0:
					hn = data.T
				else:
					hn = self.layers[jj-1].output
				
				# If last layer, we compute the delta = output - expectation
				if dEda is None: 
					dEda = ch - data
				else:
					wp1 = self.layers[jj+1].weights
					if corruption is None:
						a = self.layers[jj].output
					else:
						a = self.feedforward_to_layer(cdata.T, jj)
					dEda = wp1.dot(dEda) * (a * (1. - a)).T
					
				dEdb = np.mean(dEda, axis=1)

				dEdw = dEda.dot(hn) / m + lambda_ * self.layers[jj].weights.T
				dEdw = dEdw.T
				
				wgrad.append(dEdw)
				bgrad.append(dEdb)
	
			# Reverse the order since back-propagation goes backwards 
			wgrad = wgrad[::-1]
			bgrad = bgrad[::-1]

			# Computes the cross-entropy
			cost = - np.sum(data * np.log(ch) + (1. - data) * np.log(1. - ch), axis=0) 
			cost = np.mean(cost)
			
		if log_cost:
			self.train_history.append(cost)
		
		#exit()
		# Returns the gradient as a vector.
		grad = self._roll(wgrad, bgrad, return_info=False)
		return cost, grad