def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer."""
    fc_cache, relu_cache = cache
    da = layers.relu_backward(dout, relu_cache)
    dx, dw, db = layers.affine_backward(da, fc_cache)
    return dx, dw, db
def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer."""
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db
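# A standalone sketch (not part of the assignment code) of the two local backward
# rules that affine_relu_backward chains together, assuming the usual cs231n cache
# conventions: the affine cache is (x, w, b) and the ReLU cache is the input x.
import numpy as np

def _affine_backward_sketch(dout, cache):
    x, w, b = cache
    x_flat = x.reshape(x.shape[0], -1)       # (N, D)
    dx = dout.dot(w.T).reshape(x.shape)      # gradient w.r.t. the (possibly multi-dim) input
    dw = x_flat.T.dot(dout)                  # (D, M)
    db = dout.sum(axis=0)                    # (M,)
    return dx, dw, db

def _relu_backward_sketch(dout, cache):
    x = cache
    return dout * (x > 0)                    # pass gradients only where the input was positive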
def affine_batchnorm_relu_backward(dout, cache):
    """Backward pass for the Affine->BatchNorm->ReLU convenience layer."""
    fc_cache, bn_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dan, dgamma, dbeta = batchnorm_backward(da, bn_cache)
    dx, dw, db = affine_backward(dan, fc_cache)
    return dx, dw, db, dgamma, dbeta
def affine_bn_relu_backward(dout, cache):
    """Backward pass for the affine-bn-relu convenience layer."""
    fc_cache, bn_cache, relu_cache = cache
    dx = layers.relu_backward(dout, relu_cache)
    dx, dgamma, dbeta = layers.batchnorm_backward_alt(dx, bn_cache)
    dx, dw, db = layers.affine_backward(dx, fc_cache)
    return dx, dw, db, dgamma, dbeta
def combo_backward(dout, cache):
    """
    Backward pass for the affine-(batchnorm)-relu convenience layer.
    The batchnorm step is skipped when bn_cache is None.
    """
    dgamma, dbeta = 0, 0
    fc_cache, bn_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    if bn_cache is not None:
        da, dgamma, dbeta = batchnorm_backward(da, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta
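# Hedged sketch of a forward counterpart that would produce the cache layout
# combo_backward expects. The real combo_forward used by the fully-connected net
# further down is not shown here; this assumes affine_forward, batchnorm_forward
# and relu_forward from layers.py are in scope, and that bn_param is None when
# batch normalization is disabled.
def _combo_forward_sketch(x, w, b, gamma, beta, bn_param):
    out, fc_cache = affine_forward(x, w, b)
    bn_cache = None
    if bn_param is not None:
        out, bn_cache = batchnorm_forward(out, gamma, beta, bn_param)
    out, relu_cache = relu_forward(out)
    return out, (fc_cache, bn_cache, relu_cache)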
def rnn_step_full_backward(dcurrent_h, cache):
    """
    Backward pass for a single vanilla RNN step followed by an affine score head.
    Combines the upstream gradient on the hidden state with the gradient coming
    back through the score affine layer, then backpropagates through tanh.
    """
    (x, prev_h, Wx, Wh, bh, Ws, bs, current_h, cache_affine, dscore) = cache
    dcurrent_h_, dWs, dbs = affine_backward(dscore, cache_affine)
    dcurrent_h = dcurrent_h + dcurrent_h_
    # backprop through h = tanh(.): local derivative is 1 - h^2
    dcurrent_state = dcurrent_h * (1 - np.square(current_h))
    dx = dcurrent_state.dot(Wx.T)
    dWx = x.T.dot(dcurrent_state)
    dprev_h = dcurrent_state.dot(Wh.T)
    dWh = prev_h.T.dot(dcurrent_state)
    dbh = np.sum(dcurrent_state, axis=0)
    return dx, dprev_h, dWx, dWh, dbh, dWs, dbs
def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Backward pass for a single timestep of an LSTM.

    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - dnext_c: Gradients of next cell state, of shape (N, H)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dprev_c: Gradient of previous cell state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    #############################################################################
    # TODO: Implement the backward pass for a single timestep of an LSTM.
    #
    # HINT: For sigmoid and tanh you can compute local derivatives in terms of
    # the output value from the nonlinearity.
    #############################################################################
    N, H = dnext_h.shape
    # The output gate o appears twice in the cache; unpack it once.
    _, tanh_next_c, prev_c, cache_gates, i, f, o, g, D = cache

    dtanh_next_c = dnext_h * o
    dnext_c_sum = dtanh_next_c * (1 - tanh_next_c * tanh_next_c) + dnext_c
    dprev_c = dnext_c_sum * f
    di = dnext_c_sum * g
    df = dnext_c_sum * prev_c
    do = dnext_h * tanh_next_c
    dg = dnext_c_sum * i

    # local derivatives expressed in terms of the cached gate outputs
    di_before_sigmoid = di * (i * (1 - i))
    df_before_sigmoid = df * (f * (1 - f))
    do_before_sigmoid = do * (o * (1 - o))
    dg_before_tanh = dg * (1 - g * g)

    d_gates = np.concatenate(
        (di_before_sigmoid, df_before_sigmoid, do_before_sigmoid, dg_before_tanh),
        axis=1)
    dinputs, dW, db = affine_backward(d_gates, cache_gates)
    dWx = dW[:D, :]
    dWh = dW[D:, :]
    dx = dinputs[:, :D]
    dprev_h = dinputs[:, D:]
    ##############################################################################
    #                               END OF YOUR CODE                             #
    ##############################################################################
    return dx, dprev_h, dprev_c, dWx, dWh, db
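# Quick standalone check (a sketch, not part of the solution) of the HINT above:
# for s = sigmoid(z) the local derivative is s * (1 - s), and for t = tanh(z) it
# is 1 - t**2, so both can be written purely in terms of the cached forward outputs.
import numpy as np

_z = np.linspace(-3.0, 3.0, 7)
_s = 1.0 / (1.0 + np.exp(-_z))
_t = np.tanh(_z)
_h = 1e-5
_ds_num = (1.0 / (1.0 + np.exp(-(_z + _h))) - 1.0 / (1.0 + np.exp(-(_z - _h)))) / (2 * _h)
_dt_num = (np.tanh(_z + _h) - np.tanh(_z - _h)) / (2 * _h)
assert np.allclose(_ds_num, _s * (1 - _s), atol=1e-6)
assert np.allclose(_dt_num, 1 - _t ** 2, atol=1e-6)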
def test_affine_backward():
    x = np.random.randn(10, 2, 3)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
    dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
    db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

    _, cache = affine_forward(x, w, b)
    dx, dw, db = affine_backward(dout, cache)

    # compare gradient shapes against the corresponding inputs
    assert dx.shape == x.shape
    assert dw.shape == w.shape
    assert db.shape == b.shape
    assert rel_error(dx_num, dx) < 5e-7
    assert rel_error(dw_num, dw) < 5e-7
    assert rel_error(db_num, db) < 5e-7
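# For reference, a minimal sketch of roughly what the two helpers used above do,
# written the way the cs231n gradient-checking utilities are usually implemented
# (centered finite differences); the real helpers ship with the assignment code.
import numpy as np

def _eval_numerical_gradient_array_sketch(f, x, df, h=1e-5):
    # numerically estimate d(sum(f(x) * df)) / dx via centered differences
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h
        pos = f(x).copy()
        x[ix] = old - h
        neg = f(x).copy()
        x[ix] = old
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

def _rel_error_sketch(a, b):
    # maximum relative error between two arrays
    return np.max(np.abs(a - b) / np.maximum(1e-8, np.abs(a) + np.abs(b)))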
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the
    # class scores for X and storing them in the scores variable.
    ############################################################################
    hidden1_out, h1_cache = affine_forward(X, self.params['W1'], self.params['b1'])
    relu_out, relu_cache = relu_forward(hidden1_out)
    scores, h2_cache = affine_forward(relu_out, self.params['W2'], self.params['b2'])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss
    # in the loss variable and gradients in the grads dictionary. Compute data
    # loss using softmax, and make sure that grads[k] holds the gradients for
    # self.params[k]. Don't forget to add L2 regularization!
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    # Earlier manual implementation, kept disabled for reference:
    """
    X_reshape = np.reshape(X, (X.shape[0], -1))
    num_trains = X.shape[0]
    loss, _ = softmax_loss(scores, y)
    loss = loss + self.reg * 0.5 * (np.sum(self.params['W2'] * self.params['W2'])
                                    + np.sum(self.params['W1'] * self.params['W1']))
    softmax_output = np.exp(scores) / np.sum(np.exp(scores), axis=1).reshape(-1, 1)
    softmax_output[range(num_trains), list(y)] = softmax_output[range(num_trains), list(y)] - 1
    grads['b2'] = np.zeros_like(self.params['b2'])
    grads['W2'] = np.zeros_like(self.params['W2'])
    grads['b1'] = np.zeros_like(self.params['b1'])
    grads['W1'] = np.zeros_like(self.params['W1'])
    grads['b2'] = np.sum(softmax_output, axis=0)
    grads['W2'] = np.dot(relu_out.T, softmax_output)
    grads_b1_tmp = np.dot(softmax_output, self.params['W2'].T)
    tmp = (relu_out > 0) * grads_b1_tmp
    grads['b1'] = np.sum(tmp, axis=0)
    grads['W1'] = np.dot(X_reshape.T, grads_b1_tmp)
    grads['W1'] = grads['W1'] / num_trains + self.reg * self.params['W1']
    grads['b1'] = grads['b1'] / num_trains
    grads['W2'] = grads['W2'] / num_trains + self.reg * self.params['W2']
    grads['b2'] = grads['b2'] / num_trains
    """
    num_trains = X.shape[0]
    loss, dscore = softmax_loss(scores, y)
    loss = loss + self.reg * 0.5 * (np.sum(self.params['W2'] * self.params['W2'])
                                    + np.sum(self.params['W1'] * self.params['W1']))

    grads_h2, grads_w2, grads_b2 = affine_backward(dout=dscore, cache=h2_cache)
    grads_relu = relu_backward(grads_h2, relu_cache)
    grads_h1, grads_w1, grads_b1 = affine_backward(grads_relu, h1_cache)

    grads['W1'] = grads_w1 + self.reg * self.params['W1']
    grads['W2'] = grads_w2 + self.reg * self.params['W2']
    grads['b1'] = grads_b1
    grads['b2'] = grads_b2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
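# Standalone sketch of the softmax_loss call used above, assuming the usual
# cs231n signature (scores of shape (N, C), integer labels y of shape (N,)),
# written with the log-sum-exp shift for numerical stability.
import numpy as np

def _softmax_loss_sketch(scores, y):
    N = scores.shape[0]
    shifted = scores - scores.max(axis=1, keepdims=True)      # stability shift
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    loss = -np.mean(log_probs[np.arange(N), y])                # average data loss
    dscores = np.exp(log_probs)                                # softmax probabilities
    dscores[np.arange(N), y] -= 1
    dscores /= N                                               # gradient of the mean loss
    return loss, dscores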
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    #######################################################################
    # TODO: Implement the forward pass for the fully-connected net,
    # computing the class scores for X and storing them in the scores
    # variable.
    #
    # When using dropout, you'll need to pass self.dropout_param to each
    # dropout forward pass.
    #
    # When using batch normalization, you'll need to pass self.bn_params[0]
    # to the forward pass for the first batch normalization layer,
    # pass self.bn_params[1] to the forward pass for the second batch
    # normalization layer, etc.
    #######################################################################
    IN = X
    caches = {}
    if self.use_dropout:
        dropout_caches = {}
    for l in range(self.num_layers - 1):
        W = self.params["W{}".format(l + 1)]
        b = self.params["b{}".format(l + 1)]
        if self.use_batchnorm:
            gamma = self.params["gamma{}".format(l + 1)]
            beta = self.params["beta{}".format(l + 1)]
            IN, cache = affine_batchnorm_relu_forward(
                IN, W, b, gamma, beta, self.bn_params[l])
        else:
            IN, cache = affine_relu_forward(IN, W, b)
        caches[l] = cache
        if self.use_dropout:
            IN, d_cache = dropout_forward(IN, self.dropout_param)
            dropout_caches[l] = d_cache

    # forward pass: last affine layer
    num_last = self.num_layers
    name_W_last = "W{}".format(num_last)
    name_b_last = "b{}".format(num_last)
    W_last = self.params[name_W_last]
    b_last = self.params[name_b_last]
    scores, cache_last = affine_forward(IN, W_last, b_last)
    #######################################################################
    #                            END OF YOUR CODE                         #
    #######################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    #######################################################################
    # TODO: Implement the backward pass for the fully-connected net.
    # Store the loss in the loss variable and gradients in the grads
    # dictionary. Compute data loss using softmax, and make sure that
    # grads[k] holds the gradients for self.params[k]. Don't forget to add
    # L2 regularization!
    #
    # When using batch normalization, you don't need to regularize the
    # scale and shift parameters.
    #
    # NOTE: To ensure that your implementation matches ours and you pass
    # the automated tests, make sure that your L2 regularization includes a
    # factor of 0.5 to simplify the expression for the gradient.
    #######################################################################
    # data loss
    loss, dscores = softmax_loss(scores, y)

    # regularization loss
    for l in range(self.num_layers):
        W = self.params["W{}".format(l + 1)]
        loss += 0.5 * self.reg * np.sum(W * W)

    # backprop through the last affine layer
    dx, dw, db = affine_backward(dscores, cache_last)
    grads[name_W_last] = dw + self.reg * W_last
    grads[name_b_last] = db

    # backprop through the affine-batchnorm-relu layers
    for l in reversed(range(self.num_layers - 1)):
        name_W = "W{}".format(l + 1)
        name_b = "b{}".format(l + 1)
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_caches[l])
        if self.use_batchnorm:
            dx, dw, db, dgamma, dbeta = affine_batchnorm_relu_backward(
                dx, caches[l])
            grads["gamma{}".format(l + 1)] = dgamma
            grads["beta{}".format(l + 1)] = dbeta
        else:
            dx, dw, db = affine_relu_backward(dx, caches[l])
        grads[name_W] = dw + self.reg * self.params[name_W]
        grads[name_b] = db
    #######################################################################
    #                            END OF YOUR CODE                         #
    #######################################################################

    return loss, grads
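# Why the 0.5 factor in the L2 term simplifies the gradient (a small numeric
# sketch, not part of the model code): for L_reg = 0.5 * reg * sum(W * W) the
# derivative is d L_reg / dW = reg * W, so the backward passes above only need
# to add reg * W to each weight gradient.
import numpy as np

_W = np.random.randn(3, 4)
_reg, _h = 0.1, 1e-6
_i, _j = 1, 2
_Wp, _Wm = _W.copy(), _W.copy()
_Wp[_i, _j] += _h
_Wm[_i, _j] -= _h
_num = (0.5 * _reg * np.sum(_Wp * _Wp) - 0.5 * _reg * np.sum(_Wm * _Wm)) / (2 * _h)
assert np.isclose(_num, _reg * _W[_i, _j], atol=1e-5)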
def two_layer_net(X, model, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two-layer fully connected neural
    network. The net has an input dimension of D, a hidden layer dimension of
    H, and performs classification over C classes. We use a softmax loss
    function and L2 regularization on the weight matrices. The two-layer net
    should use a ReLU nonlinearity after the first affine layer.

    The two-layer net has the following architecture:

    input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the scores for each
    class.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - model: Dictionary mapping parameter names to arrays of parameter values.
      It should contain the following:
      - W1: First layer weights; has shape (D, H)
      - b1: First layer biases; has shape (H,)
      - W2: Second layer weights; has shape (H, C)
      - b2: Second layer biases; has shape (C,)
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional; if
      it is not passed then we only return scores, and if it is passed then we
      instead return the loss and gradients.
    - reg: Regularization strength.

    Returns:
    If y is not passed, return a matrix scores of shape (N, C) where
    scores[i, c] is the score for class c on input X[i].

    If y is passed, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function. This should have the same
      keys as model.
    """
    # unpack variables from the model dictionary
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    N, D = X.shape

    # compute the forward pass
    scores = None
    #############################################################################
    # TODO: Perform the forward pass, computing the class scores for the input.
    # Store the result in the scores variable, which should be an array of
    # shape (N, C).
    #############################################################################
    # relu = lambda x: np.maximum(x, 0)
    # H, C = W2.shape
    # scores = np.zeros((N, C))
    # layer1 = np.maximum(np.dot(X, W1) + b1, 0)
    # scores = np.dot(layer1, W2) + b2
    ## above is the test implementation
    ## NOW, using cs231n/layers.py
    ## NOTICE: define layer0 = X; then the behaviour is 'functional':
    ## layer(n+1) = f(layer(n) | parameters)
    from cs231n.layers import affine_forward, relu_forward, softmax_loss
    from cs231n.layers import affine_backward, relu_backward

    layer1, cache1 = affine_forward(X, W1, b1)
    layer2, cache2 = relu_forward(layer1)
    layer3, cache3 = affine_forward(layer2, W2, b2)
    scores = layer3
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # If the targets are not given then jump out, we're done
    if y is None:
        return scores

    # compute the loss
    loss = None
    #############################################################################
    # TODO: Finish the forward pass, and compute the loss. This should include
    # both the data loss and L2 regularization for W1 and W2. Store the result
    # in the variable loss, which should be a scalar. Use the Softmax
    # classifier loss. So that your results match ours, multiply the
    # regularization loss by 0.5
    #############################################################################
    # rows = np.sum(np.exp(scores), axis=1)
    # layer4 = np.mean(-layer3[range(N), y] + np.log(rows))
    # loss = layer4 + 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    loss, dx = softmax_loss(scores, y)
    loss += 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    # compute the gradients
    grads = {}
    #############################################################################
    # TODO: Compute the backward pass, computing the derivatives of the weights
    # and biases. Store the results in the grads dictionary. For example,
    # grads['W1'] should store the gradient on W1, and be a matrix of same size.
    #############################################################################
    dlayer2, grads['W2'], grads['b2'] = affine_backward(dx, cache3)
    dlayer1 = relu_backward(dlayer2, cache2)
    dlayer0, grads['W1'], grads['b1'] = affine_backward(dlayer1, cache1)

    # gradients need to include the regularization term
    grads['W2'] += reg * W2
    grads['W1'] += reg * W1
    #############################################################################
    #                              END OF YOUR CODE                             #
    #############################################################################

    return loss, grads
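# Example usage of two_layer_net above (a sketch; it assumes the cs231n package
# that the function imports from is importable). Builds a tiny random model and
# checks that the test-time call returns scores of shape (N, C) and that the
# training-time call returns gradients for every parameter.
import numpy as np

_N, _D, _H, _C = 5, 4, 10, 3
_model = {
    'W1': 1e-2 * np.random.randn(_D, _H), 'b1': np.zeros(_H),
    'W2': 1e-2 * np.random.randn(_H, _C), 'b2': np.zeros(_C),
}
_X_toy = np.random.randn(_N, _D)
_y_toy = np.random.randint(_C, size=_N)

_scores = two_layer_net(_X_toy, _model)                    # test-time: scores only
_loss, _grads = two_layer_net(_X_toy, _model, _y_toy)      # training: loss and grads
assert _scores.shape == (_N, _C) and set(_grads) == set(_model)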
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the
    # class scores for X and storing them in the scores variable.
    ############################################################################
    out_affine1, cache_affine1 = layers.affine_forward(
        X, self.params["W1"], self.params["b1"])
    out_relu1, cache_relu1 = layers.relu_forward(out_affine1)
    out_affine2, cache_affine2 = layers.affine_forward(
        out_relu1, self.params["W2"], self.params["b2"])
    # No need to compute the SVM/softmax loss at prediction time; the raw
    # scores (and their argmax) are enough.
    scores = out_affine2
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss
    # in the loss variable and gradients in the grads dictionary. Compute data
    # loss using softmax, and make sure that grads[k] holds the gradients for
    # self.params[k]. Don't forget to add L2 regularization!
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    # In training, compute the loss and do backprop.
    loss, dloss = layers.softmax_loss(scores, y)
    # Add the L2 regularization term.
    loss += 0.5 * self.reg * (np.sum(self.params["W1"] ** 2) +
                              np.sum(self.params["W2"] ** 2))

    dout_affine2 = layers.affine_backward(dloss, cache_affine2)
    grads["W2"] = dout_affine2[1] + self.reg * self.params["W2"]
    grads["b2"] = dout_affine2[2]

    dout_relu1 = layers.relu_backward(dout_affine2[0], cache_relu1)
    dout_affine1 = layers.affine_backward(dout_relu1, cache_affine1)
    grads["W1"] = dout_affine1[1] + self.reg * self.params["W1"]
    grads["b1"] = dout_affine1[2]
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing
    # the class scores for X and storing them in the scores variable.
    #
    # When using dropout, you'll need to pass self.dropout_param to each
    # dropout forward pass.
    #
    # When using batch normalization, you'll need to pass self.bn_params[0] to
    # the forward pass for the first batch normalization layer, pass
    # self.bn_params[1] to the forward pass for the second batch normalization
    # layer, etc.
    ############################################################################
    caches = collections.defaultdict(list)
    out_layer = X
    for i in range(self.num_layers - 1):
        n = str(i + 1)
        # (zy) The learned parameters are for the BN affine transformation used
        # in training, while the running averages are used for prediction.
        if self.use_batchnorm:
            out_layer, cache = affine_bn_relu_forward(
                out_layer, self.params["W" + n], self.params["b" + n],
                self.params["gamma" + n], self.params["beta" + n],
                self.bn_params[i])
            caches["affine_bn_relu"].append(cache)
        else:
            out_layer, cache = layers.affine_forward(
                out_layer, self.params["W" + n], self.params["b" + n])
            caches["affine"].append(cache)
            out_layer, cache = layers.relu_forward(out_layer)
            caches["relu"].append(cache)
        if self.use_dropout:
            out_layer, cache = layers.dropout_forward(
                out_layer, self.dropout_param)
            caches["drop"].append(cache)

    nn = str(self.num_layers)
    scores, cache = layers.affine_forward(out_layer, self.params["W" + nn],
                                          self.params["b" + nn])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the
    # loss in the loss variable and gradients in the grads dictionary. Compute
    # data loss using softmax, and make sure that grads[k] holds the gradients
    # for self.params[k]. Don't forget to add L2 regularization!
    #
    # When using batch normalization, you don't need to regularize the scale
    # and shift parameters.
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    loss, dloss = layers.softmax_loss(scores, y)
    # regularization loss
    if self.reg != 0:
        for k, v in self.params.items():
            # only include the W parameters, excluding gamma, beta and b
            if k.startswith("W"):
                loss += 0.5 * self.reg * np.sum(v ** 2)

    # gradient of the last affine layer
    out = layers.affine_backward(dloss, cache)
    dout, grads["W" + nn], grads["b" + nn] = out
    grads["W" + nn] += self.reg * cache[1]

    for i in range(self.num_layers - 2, -1, -1):
        n = str(i + 1)
        if self.use_dropout:
            dout = layers.dropout_backward(dout, caches["drop"][i])
        if self.use_batchnorm:
            out = affine_bn_relu_backward(dout, caches["affine_bn_relu"][i])
            dout, grads["W" + n], grads["b" + n], \
                grads["gamma" + n], grads["beta" + n] = out
            grads["W" + n] += self.reg * self.params["W" + n] if self.reg else 0
        else:
            dout = layers.relu_backward(dout, caches["relu"][i])
            out = layers.affine_backward(dout, caches["affine"][i])
            dout, grads["W" + n], grads["b" + n] = out
            # need to include regularization
            grads["W" + n] += self.reg * caches["affine"][i][1]
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing
    # the class scores for X and storing them in the scores variable.
    #
    # When using dropout, you'll need to pass self.dropout_param to each
    # dropout forward pass.
    #
    # When using batch normalization, you'll need to pass self.bn_params[0] to
    # the forward pass for the first batch normalization layer, pass
    # self.bn_params[1] to the forward pass for the second batch normalization
    # layer, etc.
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    arg, caches = X, []
    for i in range(1, self.num_layers + 1):
        cache = {}
        W = self.params[f"W{i}"]
        b = self.params[f"b{i}"]
        arg, cache['fc_cache'] = affine_forward(arg, W, b)
        # The last layer is affine only: no normalization, ReLU or dropout on
        # the scores.
        if i != self.num_layers:
            if self.normalization:
                gamma = self.params[f"gamma{i}"]
                beta = self.params[f"beta{i}"]
                normalize_forward = (batchnorm_forward
                                     if self.normalization == 'batchnorm'
                                     else layernorm_forward)
                arg, cache['bn_cache'] = normalize_forward(
                    arg, gamma, beta, self.bn_params[i - 1])
            arg, cache['relu_cache'] = relu_forward(arg)
            if self.use_dropout:
                arg, cache['dropout_cache'] = dropout_forward(
                    arg, self.dropout_param)
        caches.append(cache)
    scores = arg

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the
    # loss in the loss variable and gradients in the grads dictionary. Compute
    # data loss using softmax, and make sure that grads[k] holds the gradients
    # for self.params[k]. Don't forget to add L2 regularization!
    #
    # When using batch/layer normalization, you don't need to regularize the
    # scale and shift parameters.
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    loss, dout = softmax_loss(scores, y)
    for i in range(self.num_layers, 0, -1):
        W = self.params[f"W{i}"]
        cache = caches[i - 1]
        da = dout
        # Mirror the forward pass: only hidden layers have dropout, ReLU and
        # normalization to backprop through.
        if i != self.num_layers:
            if self.use_dropout:
                da = dropout_backward(da, cache['dropout_cache'])
            da = relu_backward(da, cache['relu_cache'])
            if self.normalization:
                normalize_backward = (batchnorm_backward
                                      if self.normalization == 'batchnorm'
                                      else layernorm_backward)
                da, dgamma, dbeta = normalize_backward(da, cache['bn_cache'])
                grads[f"gamma{i}"] = dgamma
                grads[f"beta{i}"] = dbeta
        dout, dw, db = affine_backward(da, cache['fc_cache'])
        grads[f"W{i}"] = dw + self.reg * W
        grads[f"b{i}"] = db
        loss += 0.5 * self.reg * np.sum(W * W)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
# Test the affine_backward function
np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)

dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)

# The error should be around 1e-10
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))

# Test the relu_forward function
x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = relu_forward(x)
correct_out = np.array([[0.,         0.,         0.,         0.        ],
                        [0.,         0.,         0.04545455, 0.13636364],
                        [0.22727273, 0.31818182, 0.40909091, 0.5       ]])
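# Sanity check (a sketch, not from the original notebook): the expected output
# above is just an elementwise np.maximum(x, 0) applied to the reshaped linspace
# input, up to the rounding shown in correct_out.
assert np.allclose(correct_out, np.maximum(x, 0))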
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing
    # the class scores for X and storing them in the scores variable.
    #
    # When using dropout, you'll need to pass self.dropout_param to each
    # dropout forward pass.
    #
    # When using batch normalization, you'll need to pass self.bn_params[0] to
    # the forward pass for the first batch normalization layer, pass
    # self.bn_params[1] to the forward pass for the second batch normalization
    # layer, etc.
    ############################################################################
    combo_caches = []
    fc_cache = None
    N = X.shape[0]
    D = np.prod(X.shape[1:])
    x_ = X.reshape(N, D)

    # middle combo layers
    for layer in range(1, self.num_layers):  # [1, 2, ..., L-1]
        w = self.params['W' + str(layer)]
        b = self.params['b' + str(layer)]
        # prepare for batch normalization
        gamma, beta, bn_param = 1., 0, None
        if self.normalization == 'batchnorm':
            gamma = self.params['gamma' + str(layer)]
            beta = self.params['beta' + str(layer)]
            bn_param = self.bn_params[layer - 1]  # zero based
        x_, cache = combo_forward(x_, w, b, gamma, beta, bn_param)
        combo_caches.append(cache)

    # final fully connected layer
    w = self.params['W' + str(self.num_layers)]
    b = self.params['b' + str(self.num_layers)]
    scores, fc_cache = affine_forward(x_, w, b)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the
    # loss in the loss variable and gradients in the grads dictionary. Compute
    # data loss using softmax, and make sure that grads[k] holds the gradients
    # for self.params[k]. Don't forget to add L2 regularization!
    #
    # When using batch/layer normalization, you don't need to regularize the
    # scale and shift parameters.
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    loss, dout = softmax_loss(scores, y)

    # final fully connected layer
    dout, dw, db = affine_backward(dout, fc_cache)
    grads['W' + str(self.num_layers)] = dw + self.reg * self.params['W' + str(self.num_layers)]
    grads['b' + str(self.num_layers)] = db
    # adjust loss with the regularization term of W_L
    loss += 0.5 * self.reg * np.sum(self.params['W' + str(self.num_layers)] ** 2)

    # middle combo layers
    for layer in range(self.num_layers - 1, 0, -1):  # [L-1, L-2, ..., 1]
        dout, dw, db, dgamma, dbeta = combo_backward(
            dout, combo_caches[layer - 1])
        grads['W' + str(layer)] = dw + self.reg * self.params['W' + str(layer)]
        grads['b' + str(layer)] = db
        if self.normalization == 'batchnorm':
            grads['gamma' + str(layer)] = dgamma
            grads['beta' + str(layer)] = dbeta
        # adjust loss with the regularization term of W_l
        loss += 0.5 * self.reg * np.sum(self.params['W' + str(layer)] ** 2)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    #######################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the
    # class scores for X and storing them in the scores variable.
    #######################################################################
    W1 = self.params["W1"]
    b1 = self.params["b1"]
    W2 = self.params["W2"]
    b2 = self.params["b2"]

    N = X.shape[0]
    C = W2.shape[1]
    scores = np.zeros((N, C))

    X_hidden, cache1 = affine_relu_forward(X, W1, b1)
    scores, cache2 = affine_forward(X_hidden, W2, b2)
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    #######################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the
    # loss in the loss variable and gradients in the grads dictionary.
    # Compute data loss using softmax, and make sure that grads[k]
    # holds the gradients for self.params[k]. Don't forget to add L2
    # regularization!
    #
    # NOTE: To ensure that your implementation matches ours and you pass
    # the automated tests, make sure that your L2 regularization includes a
    # factor of 0.5 to simplify the expression for the gradient.
    #######################################################################
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

    dx_hidden, dw2, db2 = affine_backward(dscores, cache2)
    grads["W2"] = dw2 + self.reg * W2
    grads["b2"] = db2

    dx, dw1, db1 = affine_relu_backward(dx_hidden, cache1)
    grads["W1"] = dw1 + self.reg * W1
    grads["b1"] = db1
    #######################################################################
    #                           END OF YOUR CODE                          #
    #######################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the
    # class scores for X and storing them in the scores variable.
    ############################################################################
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    N = X.shape[0]
    D = np.prod(X.shape[1:])
    X_ = X.reshape(N, D)

    A, fc1_cache = affine_forward(X_, W1, b1)
    R, relu_cache = relu_forward(A)
    scores, fc2_cache = affine_forward(R, W2, b2)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ############################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss
    # in the loss variable and gradients in the grads dictionary. Compute data
    # loss using softmax, and make sure that grads[k] holds the gradients for
    # self.params[k]. Don't forget to add L2 regularization!
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    loss, dscores = softmax_loss(scores, y)

    dR, dW2, db2 = affine_backward(dscores, fc2_cache)
    dA = relu_backward(dR, relu_cache)
    dX, dW1, db1 = affine_backward(dA, fc1_cache)

    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
    dW2 += self.reg * W2
    dW1 += self.reg * W1

    grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing
    # the class scores for X and storing them in the scores variable.
    #
    # When using dropout, you'll need to pass self.dropout_param to each
    # dropout forward pass.
    #
    # When using batch normalization, you'll need to pass self.bn_params[0] to
    # the forward pass for the first batch normalization layer, pass
    # self.bn_params[1] to the forward pass for the second batch normalization
    # layer, etc.
    ############################################################################
    X_temp = X
    affine_Input = list()
    relu_input = list()
    batchnorm_input = list()
    dropout_input = list()
    score_tmp = None

    for i in range(self.num_layers - 1):
        tmp, affine_input_tmp = affine_forward(
            X_temp, self.params['W' + str(i + 1)], self.params['b' + str(i + 1)])
        if self.use_batchnorm:
            tmp, batchnorm_cache = batchnorm_forward(
                tmp, self.params['gamma' + str(i + 1)],
                self.params['beta' + str(i + 1)], self.bn_params[i])
            batchnorm_input.append(batchnorm_cache)
        score_tmp, relu_input_tmp = relu_forward(tmp)
        if self.use_dropout:
            score_tmp, dropout_cache = dropout_forward(
                score_tmp, self.dropout_param)
            dropout_input.append(dropout_cache)
        affine_Input.append(affine_input_tmp)
        relu_input.append(relu_input_tmp)
        X_temp = score_tmp

    scores, last_input_tmp = affine_forward(
        score_tmp, self.params['W' + str(self.num_layers)],
        self.params['b' + str(self.num_layers)])
    affine_Input.append(last_input_tmp)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the
    # loss in the loss variable and gradients in the grads dictionary. Compute
    # data loss using softmax, and make sure that grads[k] holds the gradients
    # for self.params[k]. Don't forget to add L2 regularization!
    #
    # When using batch normalization, you don't need to regularize the scale
    # and shift parameters.
    #
    # NOTE: To ensure that your implementation matches ours and you pass the
    # automated tests, make sure that your L2 regularization includes a factor
    # of 0.5 to simplify the expression for the gradient.
    ############################################################################
    num_trains = X.shape[0]
    loss, dscores = softmax_loss(scores, y)

    weight_decay_sum = 0
    for i in range(self.num_layers):
        tmp = np.sum(self.params['W' + str(i + 1)] * self.params['W' + str(i + 1)])
        weight_decay_sum = weight_decay_sum + tmp
    loss = loss + 0.5 * self.reg * weight_decay_sum

    # softmax_output = np.exp(scores) / np.sum(np.exp(scores), axis=1).reshape(-1, 1)
    # softmax_output[range(num_trains), list(y)] = softmax_output[range(num_trains), list(y)] - 1

    dout = dscores
    for i in range(self.num_layers):
        dx, dw, db = affine_backward(dout, affine_Input[-(i + 1)])
        grads['W' + str(self.num_layers - i)] = dw + self.reg * self.params[
            'W' + str(self.num_layers - i)]
        grads['b' + str(self.num_layers - i)] = db
        if self.use_dropout and i != self.num_layers - 1:
            dx = dropout_backward(dx, dropout_input[-(i + 1)])
        if i != self.num_layers - 1:
            dout = relu_backward(dx, relu_input[-(i + 1)])
        if i != self.num_layers - 1 and self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(
                dout, batchnorm_input[-(i + 1)])
            grads['gamma' + str(self.num_layers - i - 1)] = dgamma
            grads['beta' + str(self.num_layers - i - 1)] = dbeta

    return loss, grads