def test_affine_layer_backward(self):
    print("\n======== TestLayers.test_affine_layer_backward:")
    x = np.random.randn(10, 2, 3)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    dx_num = check_gradient.eval_numerical_gradient_array(
        lambda x: layers.affine_forward(x, w, b)[0], x, dout)
    dw_num = check_gradient.eval_numerical_gradient_array(
        lambda w: layers.affine_forward(x, w, b)[0], w, dout)
    db_num = check_gradient.eval_numerical_gradient_array(
        lambda b: layers.affine_forward(x, w, b)[0], b, dout)

    _, cache = layers.affine_forward(x, w, b)
    dx, dw, db = layers.affine_backward(dout, cache)

    dx_diff = error.rel_error(dx_num, dx)
    dw_diff = error.rel_error(dw_num, dw)
    db_diff = error.rel_error(db_num, db)

    print("dx error : %.9f" % dx_diff)
    print("dw error : %.9f" % dw_diff)
    print("db error : %.9f" % db_diff)

    # NOTE : since the inputs are random, we may occasionally get a value
    # greater than self.eps here. I don't think it's worth re-writing this
    # test so that it passes every time; re-running it is usually enough.
    self.assertLessEqual(dx_diff, self.eps)
    self.assertLessEqual(dw_diff, self.eps)
    self.assertLessEqual(db_diff, self.eps)

    print("======== TestLayers.test_affine_layer_backward: <END> ")
def test_gradient(self):
    x = np.random.randn(10, 2, 3)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    dx_num = check_gradient.eval_numerical_gradient_array(
        lambda x: layers.affine_forward(x, w, b)[0], x, dout)
    dw_num = check_gradient.eval_numerical_gradient_array(
        lambda w: layers.affine_forward(x, w, b)[0], w, dout)
    db_num = check_gradient.eval_numerical_gradient_array(
        lambda b: layers.affine_forward(x, w, b)[0], b, dout)

    _, cache = layers.affine_forward(x, w, b)
    dx, dw, db = layers.affine_backward(dout, cache)

    print("dx error : %.6f" % error.rel_error(dx_num, dx))
    print("dw error : %.6f" % error.rel_error(dw_num, dw))
    print("db error : %.6f" % error.rel_error(db_num, db))
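# The two helpers used in the tests above (error.rel_error and
# check_gradient.eval_numerical_gradient_array) are defined elsewhere in the
# repo. A minimal sketch of the usual centered-difference implementations they
# appear to follow (the function bodies and the h=1e-5 step size below are
# assumptions, not taken from this file):

import numpy as np


def rel_error(x, y):
    """Maximum relative error between x and y, guarded against division by zero."""
    return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))


def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """Numerical gradient of f at x, contracted against the upstream gradient df.

    f maps an array to an array; df has the same shape as f(x). Each element
    of x is perturbed in place by +/- h (which is why the test lambdas close
    over the same arrays) and the centered difference is accumulated.
    """
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old_val = x[ix]
        x[ix] = old_val + h
        pos = f(x).copy()        # f evaluated with this coordinate at +h
        x[ix] = old_val - h
        neg = f(x).copy()        # f evaluated with this coordinate at -h
        x[ix] = old_val          # restore the original value
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad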
def backward(self,
             dout: np.ndarray,
             cache: Tuple[Any, Any, Any]) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
    """
    Backward pass over layers start..end using the caches collected during
    the forward pass. Returns the gradient on the input and a dict of
    parameter gradients keyed by (1-indexed) layer number.
    """
    start, end, layer_caches = cache
    dnext_a = dout
    grads: Dict[str, np.ndarray] = {}

    # Layer indexing: conv layers occupy 0..len(conv_params)-1, the affine
    # hidden layer sits at len(conv_params), and the output affine layer at
    # len(conv_params) + 1.
    for i in reversed(range(start, end + 1)):
        i1 = i + 1
        if i == len(self.conv_params) + 1:
            # Last (output) affine layer
            dprev_a, dw, db = layers.affine_backward(
                dnext_a, layer_caches.pop())
            grads['W%d' % i1] = dw
            grads['b%d' % i1] = db
        elif i == len(self.conv_params):
            # Affine hidden layer
            dprev_a, dw, db, dgamma, dbeta = layers.affine_norm_relu_backward(
                dnext_a, layer_caches.pop())
            grads['W%d' % i1] = dw
            grads['b%d' % i1] = db
            grads['gamma%d' % i1] = dgamma
            grads['beta%d' % i1] = dbeta
        elif 0 <= i < len(self.conv_params):
            # Convolutional layer
            dprev_a, dw, db, dgamma, dbeta = layers.conv_bn_relu_backward(
                dnext_a, layer_caches.pop())
            grads['W%d' % i1] = dw
            grads['b%d' % i1] = db
            grads['gamma%d' % i1] = dgamma
            grads['beta%d' % i1] = dbeta
        else:
            raise ValueError('Invalid layer index %d' % i)
        dnext_a = dprev_a

    dX = dnext_a

    return dX, grads
def loss(self, X, y=None):
    """
    Evaluate loss and gradient for the three-layer convnet
    """
    X = X.astype(self.dtype)    # convert datatype
    if y is None:
        mode = 'test'
    else:
        mode = 'train'
    # TODO : Batchnorm here

    N = X.shape[0]
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']
    # TODO : more batchnorm stuff here

    fsize = W1.shape[2]
    conv_param = {'stride': 1, 'pad': int((fsize - 1) / 2)}
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
    scores = None

    # ===============================
    # FORWARD PASS
    # ===============================

    # Forward into the conv layer
    # TODO : batchnorm
    conv_layer, cache_conv_layer = conv_layers.conv_relu_pool_forward(
        X, W1, b1, conv_param, pool_param)
    N, F, Hp, Wp = conv_layer.shape     # shape of conv output

    # Forward into the hidden layer
    x = conv_layer.reshape((N, F, Hp * Wp))
    hidden_layer, cache_hidden_layer = layers.affine_relu_forward(x, W2, b2)
    N, Hh = hidden_layer.shape

    # Forward into the linear output layer
    scores, cache_scores = layers.affine_forward(hidden_layer, W3, b3)

    if mode == 'test':
        return scores

    loss = 0
    grads = {}

    # ===============================
    # BACKWARD PASS
    # ===============================
    data_loss, dscores = layers.softmax_loss(scores, y)
    reg_loss = 0.5 * self.reg * np.sum(W1 ** 2)
    reg_loss += 0.5 * self.reg * np.sum(W2 ** 2)
    reg_loss += 0.5 * self.reg * np.sum(W3 ** 2)
    loss = data_loss + reg_loss

    # Backprop into the output layer
    dx3, dW3, db3 = layers.affine_backward(dscores, cache_scores)
    dW3 += self.reg * W3

    # Backprop into the first fc layer
    dx2, dW2, db2 = layers.affine_relu_backward(dx3, cache_hidden_layer)
    dW2 += self.reg * W2

    # Backprop into the conv layer
    dx2 = dx2.reshape(N, F, Hp, Wp)     # NOTE : don't forget to reshape here
    dx, dW1, db1 = conv_layers.conv_relu_pool_backward(dx2, cache_conv_layer)
    dW1 += self.reg * W1

    grads.update({
        'W1': dW1, 'W2': dW2, 'W3': dW3,
        'b1': db1, 'b2': db2, 'b3': db3
    })

    return loss, grads
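# A quick sanity check of the layer geometry assumed by the loss() above: with
# stride 1 and pad = (fsize - 1) // 2 the conv layer preserves the spatial
# size, and the 2x2 / stride-2 max pool halves it. The 32x32 input and the
# filter size below are illustrative assumptions only.
H, W = 32, 32
fsize, stride, pad = 7, 1, (7 - 1) // 2

H_conv = 1 + (H + 2 * pad - fsize) // stride    # 32 : 'same' convolution
W_conv = 1 + (W + 2 * pad - fsize) // stride    # 32

H_pool = 1 + (H_conv - 2) // 2                  # 16 : halved by the pool
W_pool = 1 + (W_conv - 2) // 2                  # 16

print(H_conv, W_conv, H_pool, W_pool)           # 32 32 16 16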
def loss(self, X: np.ndarray, y: Union[None, np.ndarray] = None) -> Any:
    """
    Evaluate loss and gradient for the convnet
    """
    X = X.astype(self.dtype)
    N = X.shape[0]
    if y is None:
        mode = 'test'
    else:
        mode = 'train'

    # Layer parameters
    conv_param = {'stride': 1, 'pad': int((self.filter_size - 1) / 2)}
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    if self.use_batchnorm:
        for bn_param in self.bn_params.values():
            bn_param['mode'] = mode

    scores = None
    blocks = {}
    blocks['h0'] = X

    # ===============================
    # FORWARD PASS
    # ===============================

    # Forward into the conv blocks
    for l in range(self.L):
        idx = l + 1
        W = self.params['W' + str(idx)]
        b = self.params['b' + str(idx)]
        h = blocks['h' + str(idx - 1)]

        if self.use_batchnorm:
            beta = self.params['beta' + str(idx)]
            gamma = self.params['gamma' + str(idx)]
            bn_param = self.bn_params['bn_param' + str(idx)]
            h, cache_h = conv_layers.conv_norm_relu_pool_forward(
                h, W, b, conv_param, pool_param, gamma, beta, bn_param)
        else:
            h, cache_h = conv_layers.conv_relu_pool_forward(
                h, W, b, conv_param, pool_param)
        blocks['h' + str(idx)] = h
        blocks['cache_h' + str(idx)] = cache_h

    # Forward into the linear blocks
    for l in range(self.M):
        idx = self.L + l + 1
        h = blocks['h' + str(idx - 1)]
        if l == 0:
            # Flatten the conv output before the first linear block
            h = h.reshape(N, np.prod(h.shape[1:]))
        W = self.params['W' + str(idx)]
        b = self.params['b' + str(idx)]

        if self.use_batchnorm:
            beta = self.params['beta' + str(idx)]
            gamma = self.params['gamma' + str(idx)]
            bn_param = self.bn_params['bn_param' + str(idx)]
            h, cache_h = layers.affine_norm_relu_forward(
                h, W, b, gamma, beta, bn_param)
        else:
            h, cache_h = layers.affine_relu_forward(h, W, b)
        blocks['h' + str(idx)] = h
        blocks['cache_h' + str(idx)] = cache_h

    # Forward into the score layer
    idx = self.L + self.M + 1
    W = self.params['W' + str(idx)]
    b = self.params['b' + str(idx)]
    h = blocks['h' + str(idx - 1)]
    h, cache_h = layers.affine_forward(h, W, b)
    blocks['h' + str(idx)] = h
    blocks['cache_h' + str(idx)] = cache_h

    scores = blocks['h' + str(idx)]
    if y is None:
        return scores

    loss = 0.0
    grads: Dict[str, Any] = {}

    # Compute the loss
    data_loss, dscores = layers.softmax_loss(scores, y)
    reg_loss = 0.0
    for k in self.params.keys():
        if k[0] == 'W':
            reg_loss += 0.5 * self.reg * np.sum(self.params[k] ** 2)
    loss = data_loss + reg_loss

    # ===============================
    # BACKWARD PASS
    # ===============================
    idx = self.L + self.M + 1
    dh = dscores
    h_cache = blocks['cache_h' + str(idx)]
    dh, dW, db = layers.affine_backward(dh, h_cache)
    blocks['dh' + str(idx - 1)] = dh
    blocks['dW' + str(idx)] = dW
    blocks['db' + str(idx)] = db

    # Backprop into the linear blocks
    for l in range(self.M)[::-1]:
        idx = self.L + l + 1
        dh = blocks['dh' + str(idx)]
        h_cache = blocks['cache_h' + str(idx)]
        if self.use_batchnorm:
            dh, dW, db, dgamma, dbeta = layers.affine_norm_relu_backward(
                dh, h_cache)
            blocks['dgamma' + str(idx)] = dgamma
            blocks['dbeta' + str(idx)] = dbeta
        else:
            dh, dW, db = layers.affine_relu_backward(dh, h_cache)
        blocks['dh' + str(idx - 1)] = dh
        blocks['dW' + str(idx)] = dW
        blocks['db' + str(idx)] = db

    # Backprop into the conv blocks
    for l in range(self.L)[::-1]:
        idx = l + 1
        dh = blocks['dh' + str(idx)]
        h_cache = blocks['cache_h' + str(idx)]
        if l == max(range(self.L)[::-1]):
            # Undo the flattening done before the first linear block
            dh = dh.reshape(*blocks['h' + str(idx)].shape)
        if self.use_batchnorm:
            dh, dW, db, dgamma, dbeta = conv_layers.conv_norm_relu_pool_backward(
                dh, h_cache)
            blocks['dgamma' + str(idx)] = dgamma
            blocks['dbeta' + str(idx)] = dbeta
        else:
            dh, dW, db = conv_layers.conv_relu_pool_backward(dh, h_cache)
        blocks['dh' + str(idx - 1)] = dh
        blocks['dW' + str(idx)] = dW
        blocks['db' + str(idx)] = db

    # TODO : This is a bit of a hack - gather the gradients by key prefix.
    # Add the regularization term to the W gradients.
    dw_list = {}
    for key, val in blocks.items():
        if key[:2] == 'dW':
            dw_list[key[1:]] = val + self.reg * self.params[key[1:]]

    db_list = {}
    for key, val in blocks.items():
        if key[:2] == 'db' and key[:5] != 'dbeta':
            db_list[key[1:]] = val

    dgamma_list = {}
    for key, val in blocks.items():
        if key[:6] == 'dgamma':
            dgamma_list[key[1:]] = val

    dbeta_list = {}
    for key, val in blocks.items():
        if key[:5] == 'dbeta':
            dbeta_list[key[1:]] = val

    grads.update(dw_list)
    grads.update(db_list)
    grads.update(dgamma_list)
    grads.update(dbeta_list)

    return loss, grads
def loss(self, X: np.ndarray, y: Union[np.ndarray, None] = None) -> Union[np.ndarray, Any]:
    """
    LOSS
    Compute loss and gradients for the fully-connected network
    """
    X = X.astype(self.dtype)
    if y is None:
        mode = 'test'
    else:
        mode = 'train'

    # Set dropout and batchnorm params based on whether this is a training
    # or a test run
    self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params.values():
            bn_param['mode'] = mode

    # ===============================
    # FORWARD PASS
    # ===============================
    hidden = {}
    hidden['h0'] = X.reshape(X.shape[0], np.prod(X.shape[1:]))

    if self.use_dropout:
        hdrop, cache_hdrop = layers.dropout_forward(
            hidden['h0'], self.dropout_param)
        hidden['hdrop0'] = hdrop
        hidden['cache_hdrop0'] = cache_hdrop

    # Iterate over the layers
    for l in range(self.num_layers):
        idx = l + 1
        w = self.params['W' + str(idx)]
        b = self.params['b' + str(idx)]

        if self.use_dropout:
            h = hidden['hdrop' + str(idx - 1)]
        else:
            h = hidden['h' + str(idx - 1)]

        if self.use_batchnorm and idx != self.num_layers:
            gamma = self.params['gamma' + str(idx)]
            beta = self.params['beta' + str(idx)]
            bn_param = self.bn_params['bn_param' + str(idx)]

        # Compute the forward pass. The output layer is a special case
        if idx == self.num_layers:
            h, cache_h = layers.affine_forward(h, w, b)
            hidden['h' + str(idx)] = h
            hidden['cache_h' + str(idx)] = cache_h
        else:
            if self.use_batchnorm:
                h, cache_h = layers.affine_norm_relu_forward(
                    h, w, b, gamma, beta, bn_param)
            else:
                h, cache_h = layers.affine_relu_forward(h, w, b)
            hidden['h' + str(idx)] = h
            hidden['cache_h' + str(idx)] = cache_h

            if self.use_dropout:
                h = hidden['h' + str(idx)]
                hdrop, cache_hdrop = layers.dropout_forward(
                    h, self.dropout_param)
                hidden['hdrop' + str(idx)] = hdrop
                hidden['cache_hdrop' + str(idx)] = cache_hdrop

    scores = hidden['h' + str(self.num_layers)]

    if mode == 'test':
        return scores

    loss = 0.0
    grads: Dict[str, Any] = {}

    # Compute the loss
    data_loss, dscores = layers.softmax_loss(scores, y)
    reg_loss = 0.0
    for k in self.params.keys():
        if k[0] == 'W':
            reg_loss += 0.5 * self.reg * np.sum(self.params[k] ** 2)
    loss = data_loss + reg_loss

    # ===============================
    # BACKWARD PASS
    # ===============================
    hidden['dh' + str(self.num_layers)] = dscores
    for l in range(self.num_layers)[::-1]:
        idx = l + 1
        dh = hidden['dh' + str(idx)]
        h_cache = hidden['cache_h' + str(idx)]
        if idx == self.num_layers:
            dh, dw, db = layers.affine_backward(dh, h_cache)
            hidden['dh' + str(idx - 1)] = dh
            hidden['dW' + str(idx)] = dw
            hidden['db' + str(idx)] = db
        else:
            if self.use_dropout:
                cache_hdrop = hidden['cache_hdrop' + str(idx)]
                dh = layers.dropout_backward(dh, cache_hdrop)
            if self.use_batchnorm:
                dh, dw, db, dgamma, dbeta = layers.affine_norm_relu_backward(
                    dh, h_cache)
                hidden['dgamma' + str(idx)] = dgamma
                hidden['dbeta' + str(idx)] = dbeta
            else:
                dh, dw, db = layers.affine_relu_backward(dh, h_cache)
            hidden['dh' + str(idx - 1)] = dh
            hidden['dW' + str(idx)] = dw
            hidden['db' + str(idx)] = db

    # TODO : Tidy this up - gather the gradients by key prefix.
    # W gradients, with the regularization term added.
    dw_list = {}
    for key, val in hidden.items():
        if key[:2] == 'dW':
            dw_list[key[1:]] = val + self.reg * self.params[key[1:]]

    db_list = {}
    for key, val in hidden.items():
        if key[:2] == 'db' and key[:5] != 'dbeta':
            db_list[key[1:]] = val

    dgamma_list = {}
    for key, val in hidden.items():
        if key[:6] == 'dgamma':
            dgamma_list[key[1:]] = val

    dbeta_list = {}
    for key, val in hidden.items():
        if key[:5] == 'dbeta':
            dbeta_list[key[1:]] = val

    grads.update(dw_list)
    grads.update(db_list)
    grads.update(dgamma_list)
    grads.update(dbeta_list)

    return loss, grads
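# Both loss() methods above share the same calling convention. A minimal usage
# sketch follows; the FCNet name, its constructor arguments, and the data
# shapes are hypothetical and only meant to show the two call modes.
import numpy as np

model = FCNet(hidden_dims=[100, 100], input_dim=3 * 32 * 32,
              num_classes=10, use_batchnorm=True, reg=1e-2)   # hypothetical

X_batch = np.random.randn(64, 3, 32, 32)
y_batch = np.random.randint(10, size=64)

# Training mode: y is given, so loss() returns (loss, grads)
loss, grads = model.loss(X_batch, y_batch)

# Test mode: y is None, so loss() returns only the class scores
scores = model.loss(X_batch)                 # shape (64, num_classes)
y_pred = np.argmax(scores, axis=1)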