def gradients(self, x, loss_function, y_target):
    '''
    Inputs:
        x: network input
        loss_function: Since the gradients of the loss function need to be computed,
            this has to be provided.
        y_target: Target values for the network output.
    Return value:
        gradients: Gradients of the loss function w.r.t. all weights and biases of
            the network. Gradients have a weights and a biases member; the indexing
            starts with 0 for the first hidden layer (W_1, b_1) and ends with the
            output layer (W_out, b_out).
    '''
    gradients = sup.Variables()

    # Outputs of each layer (layer_evaluations[0] is the input x)
    layer_evaluations = []
    for layer_idx, layer in enumerate(self.layers):
        layer_evaluations.append(self.evaluateLayer(layer_idx, x))

    # The network output equals the evaluation of the last layer
    network_output = self.output(x)

    # Derivative of the cost w.r.t. the network output
    dCost_dy = None  # TODO: implement cost function derivative w.r.t. the network output

    # Element-wise multiplication with the sigmoid derivative (sigmoid is applied element-wise)
    delta_fused = None  # TODO: start backpropagating the error

    # Gradient backpropagation:
    # start from the last layer and propagate the error gradient through to the first layer.
    # Attention: layer_evaluations[0] is the network input, while self.layers[0] is the
    # first hidden layer.
    for layer_idx in np.arange(len(self.layers) - 1, -1, -1):
        logger.debug('Computing the gradient for layer {}'.format(layer_idx))
        # If the layer is not the last layer, update delta_fused
        # (which accumulates the back-propagated gradient).
        # TODO: implement backpropagation of the gradient for an arbitrary number of layers
        pass

    return gradients
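# The TODOs above amount to the standard backprop recursion for sigmoid layers.
# As a sketch (a_l stands for the entries of layer_evaluations at layer l, '*'
# is element-wise, '@' is the matrix product; this is notation, not identifiers
# from the code):
#
#   delta_out = dC/dy * a_out * (1 - a_out)                     # output layer
#   delta_l   = (delta_{l+1} @ W_{l+1}.T) * a_l * (1 - a_l)     # hidden layer l
#   dC/dW_l   = a_{l-1}.T @ delta_l                             # weight gradient
#   dC/db_l   = delta_l                                         # bias gradient
#
# where the sigmoid derivative sigma'(z) = sigma(z) * (1 - sigma(z)) is
# evaluated directly at the stored layer outputs.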
def testBatchGradient(self):
    # Manually build a gradient list
    gradient_list = []
    n_layers = 2
    batch_size = 3
    w = np.ones([3, 5])
    b = np.ones([1, 5])
    grad = sup.Variables()
    for i in range(n_layers):
        grad.weights.append(w)
        grad.biases.append(b)
    for j in range(batch_size):
        gradient_list.append(grad)

    # Manually compute the batch gradient
    batch_gradient_manual = grad * batch_size

    batch_gradient = self.optimizer.computeBatchGradient(gradient_list)
    self.assertTrue(batch_gradient_manual == batch_gradient)
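# The test above pins down the contract of computeBatchGradient: summing
# batch_size copies of one gradient must equal grad * batch_size. A minimal
# sketch of an implementation satisfying that contract, relying only on the
# Variables addition exercised in testVariables (an assumption, not
# necessarily the optimizer's actual code):
def computeBatchGradient(self, gradient_list):
    # Accumulate the per-sample gradients; Variables addition is element-wise
    # over all weight and bias arrays, so the result is the batch sum.
    batch_gradient = gradient_list[0]
    for gradient in gradient_list[1:]:
        batch_gradient = batch_gradient + gradient
    return batch_gradient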
def testVariables(self):
    var = sup.Variables()
    var.weights.append(np.ones([2, 2]))
    var.weights.append(np.ones([2, 2]))
    var.biases.append(np.ones([1, 2]))
    var.biases.append(np.ones([1, 2]))

    # Multiplication
    var_neg = var * (-1)
    for i in range(len(var)):
        self.assertTrue(
            np.all(var.weights[i] + var_neg.weights[i] == np.zeros_like(var.weights[i])))
        self.assertTrue(
            np.all(var.biases[i] + var_neg.biases[i] == np.zeros_like(var.biases[i])))

    # Addition
    var_add = var + var_neg
    for i in range(len(var_add)):
        self.assertTrue(np.all(var_add.weights[i] == np.zeros_like(var.weights[i])))
        self.assertTrue(np.all(var_add.biases[i] == np.zeros_like(var.biases[i])))

    # Subtraction
    var_sub = var - var
    for i in range(len(var_sub)):
        self.assertTrue(np.all(var_sub.weights[i] == np.zeros_like(var.weights[i])))
        self.assertTrue(np.all(var_sub.biases[i] == np.zeros_like(var.biases[i])))

    # Equality
    self.assertTrue(var == var)
    self.assertFalse(var == var_sub)

    # Inequality
    self.assertFalse(var != var)
    self.assertTrue(var != var_sub)
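# testVariables exercises scalar multiplication, addition, subtraction,
# (in)equality and len() on sup.Variables. A minimal sketch of how such a
# container could implement those operators with NumPy (hypothetical; the
# real sup.Variables may differ):
import numpy as np

class Variables:
    """Pairs per-layer weight and bias arrays and supports element-wise arithmetic."""

    def __init__(self):
        self.weights = []
        self.biases = []

    def __len__(self):
        return len(self.weights)

    def __mul__(self, scalar):
        result = Variables()
        result.weights = [w * scalar for w in self.weights]
        result.biases = [b * scalar for b in self.biases]
        return result

    def __add__(self, other):
        result = Variables()
        result.weights = [w + v for w, v in zip(self.weights, other.weights)]
        result.biases = [b + c for b, c in zip(self.biases, other.biases)]
        return result

    def __sub__(self, other):
        return self + other * (-1)

    def __eq__(self, other):
        return (len(self) == len(other)
                and all(np.array_equal(w, v) for w, v in zip(self.weights, other.weights))
                and all(np.array_equal(b, c) for b, c in zip(self.biases, other.biases)))

    def __ne__(self, other):
        return not self == other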
hidden_layer_specs = []
hidden_layer_specs.append({
    'activation': activation.SigmoidActivation(),
    'dim': w_1.shape[1]
})
output_dim = y_target.shape[1]

fc_net = network.FCNetwork(input_dim, output_dim, hidden_layer_specs)
fc_net.layers[0].setWeights(w_1)
fc_net.layers[0].setBiases(b_1)
fc_net.layers[1].setWeights(w_out)
fc_net.layers[1].setBiases(b_out)

correct_gradients = sup.Variables()


class TestActivationFunctions(unittest.TestCase):
    """Test activation functions for value and gradient."""

    def testUnit(self):
        unitActivation = activation.UnitActivation()
        input = np.random.rand(10)
        self.assertTrue(np.all(input == unitActivation.evaluate(input)))
        self.assertTrue(
            np.all(np.ones_like(input) == unitActivation.derivative(input)))

    def testSigmoid(self):
        sigmoidActivation = activation.SigmoidActivation()
        input = np.random.rand(10)
        # Assumed check: compare against the sigmoid definition 1 / (1 + exp(-x))
        self.assertTrue(
            np.allclose(1.0 / (1.0 + np.exp(-input)),
                        sigmoidActivation.evaluate(input)))
def getParameters(self):
    nn_params = sup.Variables()
    for l in self.layers:
        nn_params.weights.append(l.getWeights())
        nn_params.biases.append(l.getBiases())
    return nn_params
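# Since getParameters returns a sup.Variables, and Variables supports addition
# and scalar multiplication (see testVariables), a gradient-descent step can be
# written directly on the container. A sketch, assuming a learning rate of 0.1;
# the write-back loop via the per-layer setWeights/setBiases setters is an
# assumed usage, not confirmed training code:
learning_rate = 0.1
params = fc_net.getParameters()
grads = fc_net.gradients(x, loss_function, y_target)
params = params + grads * (-learning_rate)  # step against the gradient direction
for layer, w, b in zip(fc_net.layers, params.weights, params.biases):
    layer.setWeights(w)
    layer.setBiases(b)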
def gradients(self, x, loss_function, y_target):
    '''
    Inputs:
        x: network input
        loss_function: Since the gradients of the loss function need to be computed,
            this has to be provided.
        y_target: Target values for the network output.
    Return value:
        gradients: Gradients of the loss function w.r.t. all weights and biases of
            the network. Gradients have a weights and a biases member; the indexing
            starts with 0 for the first hidden layer (W_1, b_1) and ends with the
            output layer (W_out, b_out).
    '''
    gradients = sup.Variables()
    # Start with None so that an indexing problem later fails loudly
    gradients.weights = [None] * self.numberOfLayers()
    gradients.biases = [None] * self.numberOfLayers()

    # Outputs of each layer (layer_evaluations[0] is the input x)
    layer_evaluations = []
    for layer_idx, layer in enumerate(self.layers):
        layer_evaluations.append(self.evaluateLayer(layer_idx, x))

    # The network output equals the evaluation of the last layer
    network_output = self.output(x)
    logger.debug('Network output shape: {}'.format(network_output.shape))

    ## Output layer: derivative of the cost w.r.t. the network output
    dCost_dy = loss_function.derivative(network_output, y_target)

    # Element-wise multiplication with the sigmoid derivative, which can be
    # evaluated from the stored output: sigma'(z) = sigma(z) * (1 - sigma(z)).
    # The more general form would be
    #   delta_fused = dCost_dy * self.layers[-1].derivativeActivation(network_output)
    # but that does not work right now, so the sigmoid derivative is hard-coded:
    delta_fused = dCost_dy * network_output * (1 - network_output)

    # Gradients of the output layer; layer_evaluations[-1] is the output layer's input
    gradients.weights[self.numberOfLayers() - 1] = np.dot(layer_evaluations[-1].T, delta_fused)
    gradients.biases[self.numberOfLayers() - 1] = delta_fused

    # Gradient backpropagation:
    # start from the second-to-last layer and propagate the error gradient through
    # to the first layer.
    # Attention: layer_evaluations[0] is the network input, while self.layers[0]
    # is the first hidden layer.
    for layer_idx in np.arange(len(self.layers) - 2, -1, -1):
        logger.debug('Computing the gradient for layer {}'.format(layer_idx))

        # 'prev' w.r.t. the backward pass, i.e. the next layer in forward direction
        L_w_prev = self.layers[layer_idx + 1].getWeights()
        logger.debug('\tPrevious layer weights (index {}) shape: {}'.format(
            layer_idx + 1, L_w_prev.shape))
        logger.debug('\tCurrent delta_fused shape: {}'.format(delta_fused.shape))

        # Backpropagate the error through the weights and apply the sigmoid
        # derivative at this layer's output (again hard-coded; the general form
        # would use self.layers[layer_idx].derivativeActivation):
        delta_fused = np.dot(delta_fused, L_w_prev.T) \
            * layer_evaluations[layer_idx + 1] * (1 - layer_evaluations[layer_idx + 1])

        # layer_evaluations[layer_idx] is this layer's input
        gradients.weights[layer_idx] = np.dot(layer_evaluations[layer_idx].T, delta_fused)
        gradients.biases[layer_idx] = delta_fused
        logger.debug('\tLayer index {} has weight shape {} and bias shape {}'.format(
            layer_idx, gradients.weights[layer_idx].shape, gradients.biases[layer_idx].shape))

    return gradients
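# A standard way to validate an analytic backprop implementation like the one
# above is a central finite-difference check: perturb one parameter at a time
# and compare the loss difference quotient against the computed gradient. A
# minimal sketch, assuming fc_net and loss_function are set up as in the test
# fixture above and that loss_function.evaluate(y, y_target) returns a scalar
# loss (the 'evaluate' name is an assumption, not confirmed API):
import numpy as np

def numerical_gradient_check(fc_net, loss_function, x, y_target, eps=1e-6, tol=1e-4):
    """Compare backprop weight gradients against central finite differences."""
    analytic = fc_net.gradients(x, loss_function, y_target)
    for layer_idx, layer in enumerate(fc_net.layers):
        W = layer.getWeights().copy()
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                W_pert = W.copy()
                W_pert[i, j] = W[i, j] + eps  # perturb a single weight upwards
                layer.setWeights(W_pert)
                loss_plus = loss_function.evaluate(fc_net.output(x), y_target)
                W_pert[i, j] = W[i, j] - eps  # and downwards
                layer.setWeights(W_pert)
                loss_minus = loss_function.evaluate(fc_net.output(x), y_target)
                layer.setWeights(W)  # restore the original weights
                numeric = (loss_plus - loss_minus) / (2.0 * eps)
                assert abs(numeric - analytic.weights[layer_idx][i, j]) < tol, \
                    'Gradient mismatch at layer {}, weight ({}, {})'.format(layer_idx, i, j)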