def backward(self, dy): """ Performs the backward pass of the model. :param dy: N x 1 array. The gradient wrt the output of the network. :return: Gradients of the model output wrt the model weights """ # Note that last layer has no activation cache_affine = self.cache['affine' + str(self.num_layer)] dh, dW, db = affine_backward(dy, cache_affine) self.grads['W' + str(self.num_layer)] = \ dW + 2 * self.reg_strength * self.params['W' + str(self.num_layer)] self.grads['b' + str(self.num_layer)] = db # The rest sandwich layers for i in range(self.num_layer - 2, -1, -1): # Unpack cache cache_sigmoid = self.cache['sigmoid' + str(i + 1)] cache_affine = self.cache['affine' + str(i + 1)] # Activation backward dh = self.activation.backward(dh, cache_sigmoid) # Affine backward dh, dW, db = affine_backward(dh, cache_affine) # Refresh the gradients self.grads['W' + str(i + 1)] = dW + 2 * self.reg_strength * \ self.params['W' + str(i + 1)] self.grads['b' + str(i + 1)] = db return self.grads
def backward(self, dy):
    ########################################################################
    # TODO: Your backward here                                             #
    ########################################################################
    cache_affine = self.cache['affine' + str(self.num_layer)]
    dh, dW, db = affine_backward(dy, cache_affine)
    self.grads['W' + str(self.num_layer)] = \
        dW + 2 * self.reg_strength * self.params['W' + str(self.num_layer)]
    self.grads['b' + str(self.num_layer)] = db

    for i in range(self.num_layer - 2, -1, -1):
        cache_Relu = self.cache['Relu' + str(i + 1)]
        # cache_Tanh = self.cache['Tanh' + str(i + 1)]
        # cache_LRelu = self.cache['LeakyRelu' + str(i + 1)]
        cache_affine = self.cache['affine' + str(i + 1)]

        dh = self.activation.backward(dh, cache_Relu)
        # dh = self.activation.backward(dh, cache_Tanh)
        # dh = self.activation.backward(dh, cache_LRelu)

        dh, dW, db = affine_backward(dh, cache_affine)

        self.grads['W' + str(i + 1)] = \
            dW + 2 * self.reg_strength * self.params['W' + str(i + 1)]
        self.grads['b' + str(i + 1)] = db
    ########################################################################
    # END OF YOUR CODE                                                     #
    ########################################################################
    return self.grads
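# The variant above routes the activation backward through self.activation with
# a ReLU cache. For reference, the functional form of the ReLU backward pass is
# sketched below, assuming the forward pass cached the layer input x; the name
# and cache contents are assumptions, not taken from the original code.
import numpy as np


def relu_backward(dout, cache):
    """Backward pass of ReLU: the gradient flows only where the input was > 0."""
    x = cache
    return dout * (x > 0)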
def backward(self, dy):
    ########################################################################
    # TODO: Your backward here                                             #
    ########################################################################
    # Note that the last layer has no activation
    cache_affine = self.cache['affine' + str(self.num_layer)]
    dh, dW, db = affine_backward(dy, cache_affine)
    self.grads['W' + str(self.num_layer)] = \
        dW + 2 * self.reg_strength * self.params['W' + str(self.num_layer)]
    self.grads['b' + str(self.num_layer)] = db

    # The remaining affine/activation sandwich layers
    for i in range(self.num_layer - 2, -1, -1):
        # Unpack cache
        cache_activation = self.cache[self.str_activation + str(i + 1)]
        cache_affine = self.cache['affine' + str(i + 1)]

        # Activation backward
        dh = self.activation.backward(dh, cache_activation)

        # Affine backward
        dh, dW, db = affine_backward(dh, cache_affine)

        # Refresh the gradients
        self.grads['W' + str(i + 1)] = dW + 2 * self.reg_strength * \
            self.params['W' + str(i + 1)]
        self.grads['b' + str(i + 1)] = db

    return self.grads
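# Every variant adds 2 * self.reg_strength * W to the weight gradient. The
# snippet below is a small standalone check of where that term comes from: if
# the loss includes an L2 penalty reg_strength * sum(W ** 2) per weight matrix,
# its derivative wrt W is 2 * reg_strength * W. All names here are local to the
# example and not part of the original code.
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((4, 3))
reg_strength = 0.1

analytic = 2 * reg_strength * W           # d(reg_strength * ||W||^2) / dW

# Central-difference check on a single entry of W.
h = 1e-6
W_plus, W_minus = W.copy(), W.copy()
W_plus[0, 0] += h
W_minus[0, 0] -= h
numeric = (reg_strength * np.sum(W_plus ** 2)
           - reg_strength * np.sum(W_minus ** 2)) / (2 * h)
print(np.isclose(numeric, analytic[0, 0]))  # True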
def backward(self, dy): """ Performs the backward pass of the model. :param dy: N x 1 array. The gradient wrt the output of the network. :return: Gradients of the model output wrt the model weights """ # Unpack cache cache_affine1 = self.cache['affine1'] cache_sigmoid = self.cache['sigmoid'] cache_affine2 = self.cache['affine2'] dW1 = None db1 = None dW2 = None db2 = None ######################################################################## # TODO # # Implement the backward pass using the layers you implemented. # # Like the forward pass, it consists of 3 steps: # # 1. Backward the second affine layer # # 2. Backward the sigmoid layer # # 3. Backward the first affine layer # # You should now have the gradients wrt all model parameters # ######################################################################## # Backward second layer dh_, dW2, db2 = affine_backward(dy, cache_affine2) # Backward Activation function dh = sigmoid_backward(dh_, cache_sigmoid) # Backward first layer dx, dW1, db1 = affine_backward(dh, cache_affine1) ######################################################################## # END OF YOUR CODE # ######################################################################## self.grads['W1'] = dW1 self.grads['b1'] = db1 self.grads['W2'] = dW2 self.grads['b2'] = db2 # calculate the number of operation and memory batch_size = dy.shape[0] self.num_operation = 2 * batch_size * self.input_size * self.hidden_size + \ batch_size * self.hidden_size + 2 * batch_size * self.hidden_size * 1 self.memory_backward = sys.getsizeof(dW1) + sys.getsizeof( db1) + sys.getsizeof(dW2) + sys.getsizeof(db2) self.memory = self.memory_forward + self.memory_backward return self.grads
def backward(self, dy):
    ########################################################################
    # TODO: Your backward here                                             #
    ########################################################################
    cache_affine = self.cache['affine' + str(self.num_layer)]
    dh, dW, db = affine_backward(dy, cache_affine)
    self.grads['W' + str(self.num_layer)] = \
        dW + 2 * self.reg_strength * self.params['W' + str(self.num_layer)]
    self.grads['b' + str(self.num_layer)] = db

    # The remaining affine/activation sandwich layers
    for i in range(self.num_layer - 2, -1, -1):
        # Unpack cache
        cache_sigmoid = self.cache['sigmoid' + str(i + 1)]
        cache_affine = self.cache['affine' + str(i + 1)]

        # Activation backward
        dh = self.activation.backward(dh, cache_sigmoid)

        # Affine backward
        dh, dW, db = affine_backward(dh, cache_affine)

        # Refresh the gradients
        self.grads['W' + str(i + 1)] = dW + 2 * self.reg_strength * \
            self.params['W' + str(i + 1)]
        self.grads['b' + str(i + 1)] = db
    ########################################################################
    # END OF YOUR CODE                                                     #
    ########################################################################
    return self.grads
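# A common way to validate any of the backward implementations above is a
# numerical gradient check. The helper below is a generic sketch and is not
# part of the original code: wrap the model's forward pass and loss in a
# closure over one parameter array, call this helper, and compare the result
# against the corresponding entry in self.grads.
import numpy as np


def numerical_gradient(f, x, h=1e-5):
    """Central-difference estimate of df/dx for a scalar-valued function f."""
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + h
        loss_plus = f(x)
        x[idx] = orig - h
        loss_minus = f(x)
        x[idx] = orig                       # restore the original value
        grad[idx] = (loss_plus - loss_minus) / (2.0 * h)
        it.iternext()
    return grad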