def test_relu_forward_multiple_dim(dim):
    """
    Check relu_forward on a random input with `dim` axes.

    Each axis length is drawn with np.random.randint(3, 8). The output must
    keep the input's shape and match the element-wise reference max(x, 0).

    Inputs:
    - dim: Number of axes of the randomly shaped test input.
    """
    shape = tuple(np.random.randint(3, 8) for _ in range(dim))
    x = np.random.standard_normal(shape)

    # Build the reference on a separate array *before* calling relu_forward.
    # Clamping x in place first would mean relu_forward only ever sees
    # non-negative values, so a broken clamp would go unnoticed.
    expected = np.maximum(x, 0)

    out = relu_forward(x)[0]
    assert out.shape == x.shape
    assert rel_error(expected, out) < 5e-7
def test_relu_forward_multiple_dim(dim):
    """
    Exercise relu_forward on a randomly shaped input with `dim` axes.

    Axis lengths come from np.random.randint(3, 8). Two properties are
    asserted: the output shape equals the input shape, and the output agrees
    with an independently computed max(x, 0).

    Inputs:
    - dim: Number of axes of the random input.
    """
    testing_shape = tuple(np.random.randint(3, 8) for _ in range(dim))
    x = np.random.standard_normal(testing_shape)

    # The reference must not be produced by overwriting x: relu_forward has
    # to receive the original array, negatives included, otherwise the test
    # only verifies that it leaves non-negative values alone.
    reference = np.maximum(x, 0)

    result = relu_forward(x)[0]
    assert x.shape == result.shape
    assert rel_error(reference, result) < 5e-7
def test_relu_forward():
    """Compare relu_forward with precomputed values on a fixed 3x4 input."""
    # Twelve evenly spaced samples from -0.5 to 0.5, arranged as 3 rows of 4.
    inputs = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)
    expected = np.array([
        [0., 0., 0., 0.],
        [0., 0., 0.04545455, 0.13636364],
        [0.22727273, 0.31818182, 0.40909091, 0.5],
    ])

    actual, _ = relu_forward(inputs)

    # Compare the output with the reference. The error should be around 1e-8.
    assert actual.shape == expected.shape
    assert rel_error(actual, expected) < 5e-7
def train_loss(*args):
    """
    Run the network forward on flat positional arguments and return either
    the scores or the SVM loss.

    This is a closure: `self` and `mode` are read from the enclosing scope.

    Inputs:
    - args[0]: Input data fed to the first affine layer.
    - args[1]: Labels, passed to svm_loss.
    - remaining entries: Per-layer parameters, located through
      self.w_idx / self.b_idx / self.bn_ga_idx / self.bn_bt_idx.

    Returns:
    - The output of the last affine layer when mode == 'test', otherwise the
      value returned by svm_loss for those scores and the labels.
    """
    X, y = args[0], args[1]

    activations = X
    for layer in xrange(self.num_layers):
        activations = affine_forward(
            activations, args[self.w_idx(layer)], args[self.b_idx(layer)])

        # The final affine layer produces raw scores: no batchnorm, ReLU or
        # dropout after it.
        if not layer < (self.num_layers - 1):
            continue

        if self.use_batchnorm:
            activations = batchnorm_forward(
                activations,
                args[self.bn_ga_idx(layer)],
                args[self.bn_bt_idx(layer)],
                self.bn_params[layer])
        activations = relu_forward(activations)
        if self.use_dropout:
            activations = dropout_forward(activations, self.dropout_param)

    scores = activations
    if mode == 'test':
        return scores
    return svm_loss(scores, y)
def test_relu_forward():
    """Verify relu_forward against hand-computed reference values."""
    # Reference output for the input built below: every non-positive sample
    # maps to zero, positive samples pass through unchanged.
    reference_rows = [
        [0., 0., 0., 0.],
        [0., 0., 0.04545455, 0.13636364],
        [0.22727273, 0.31818182, 0.40909091, 0.5],
    ]
    correct_out = np.array(reference_rows)

    # 12 evenly spaced values in [-0.5, 0.5], reshaped to (3, 4).
    samples = np.linspace(-0.5, 0.5, num=12)
    out, _ = relu_forward(samples.reshape(3, 4))

    # The error should be around 1e-8.
    assert out.shape == correct_out.shape
    assert rel_error(out, correct_out) < 5e-7
def affine_relu_forward(x, w, b):
    """
    Convenience layer that performs an affine transform followed by a ReLU.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer

    Returns:
    - out: The value returned by relu_forward for the affine output. No cache
      tuple is assembled by this function.

    NOTE(review): the return value of affine_forward is handed to
    relu_forward unchanged, so this presumably targets layer implementations
    that return a bare output rather than an (out, cache) pair -- confirm
    against the layers module in use.
    """
    return relu_forward(affine_forward(x, w, b))
def conv_relu_forward(x, w, b, conv_param):
    """
    Convenience layer: convolution immediately followed by a ReLU.

    Inputs:
    - x: Input to the convolutional layer
    - w, b, conv_param: Weights and parameters for the convolutional layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: (conv_cache, relu_cache) pair to give to the backward pass
    """
    conv_out, conv_cache = conv_forward_fast(x, w, b, conv_param)
    out, relu_cache = relu_forward(conv_out)
    return out, (conv_cache, relu_cache)
def affine_batchnorm_relu_forward(x, w, b, gamma, beta, bn_param):
    """
    Convenience layer that chains Affine -> BatchNorm -> ReLU.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer
    - gamma, beta, bn_param: Forwarded unchanged to batchnorm_forward

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: (fc_cache, bn_cache, relu_cache) to give to the backward pass
    """
    affine_out, fc_cache = affine_forward(x, w, b)
    normalized, bn_cache = batchnorm_forward(affine_out, gamma, beta, bn_param)
    out, relu_cache = relu_forward(normalized)
    return out, (fc_cache, bn_cache, relu_cache)
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    """
    Convenience layer that performs an affine transform, batch normalization
    and then a ReLU activation.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer
    - gamma, beta, bn_param: Forwarded unchanged to layers.batchnorm_forward

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: (fc_cache, bn_cache, relu_cache) to give to the backward pass
    """
    fc_out, fc_cache = layers.affine_forward(x, w, b)
    bn_out, bn_cache = layers.batchnorm_forward(fc_out, gamma, beta, bn_param)
    out, relu_cache = layers.relu_forward(bn_out)
    return out, (fc_cache, bn_cache, relu_cache)
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
    """
    Convenience layer that chains a convolution, a ReLU, and a max pool.

    Inputs:
    - x: Input to the convolutional layer
    - w, b, conv_param: Weights and parameters for the convolutional layer
    - pool_param: Parameters for the pooling layer

    Returns a tuple of:
    - out: Output from the pooling layer
    - cache: (conv_cache, relu_cache, pool_cache) for the backward pass
    """
    conv_out, conv_cache = conv_forward_fast(x, w, b, conv_param)
    activated, relu_cache = relu_forward(conv_out)
    out, pool_cache = max_pool_forward_fast(activated, pool_param)
    return out, (conv_cache, relu_cache, pool_cache)
def combo_forward(x, w, b, gamma, beta, bn_param):
    """
    Combo layer forward: FC -> BN -> ReLU, where BN is optional.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer
    - gamma, beta: Forwarded to batchnorm_forward when BN is enabled
    - bn_param: Batch normalization parameters; None skips the BN step

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: (fc_cache, bn_cache, relu_cache) for the backward pass;
      bn_cache is None when the BN step was skipped
    """
    hidden, fc_cache = affine_forward(x, w, b)

    if bn_param is None:
        bn_cache = None
    else:
        hidden, bn_cache = batchnorm_forward(hidden, gamma, beta, bn_param)

    out, relu_cache = relu_forward(hidden)
    return out, (fc_cache, bn_cache, relu_cache)
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    The forward pass is
    {affine - [batchnorm] - relu - [dropout]} x (num_layers - 1) - affine,
    followed by a softmax loss with L2 regularization (factor 0.5) on the
    weight matrices only.

    Input / output: Same as TwoLayerNet above.
    - X: Input data; cast to self.dtype before use.
    - y: Labels, or None to run a test-time forward pass.

    Returns:
    - scores if y is None, otherwise a tuple (loss, grads) where grads[k]
      holds the gradient for self.params[k].
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Dropout and batchnorm behave differently during training and testing,
    # so both need to be told the current mode. The dropout assignment used
    # to be disabled inside a bare string literal, which left the dropout
    # mode at whatever value it was initialised with.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    ########################################################################
    # Forward pass                                                         #
    ########################################################################
    affine_caches, bn_caches, relu_caches, dropout_caches = [], [], [], []

    # `hidden` starts as the input so that a net with a single layer (no
    # hidden layers) still feeds a valid array into the final affine layer.
    hidden = X
    for i in range(self.num_layers - 1):
        idx = str(i + 1)
        hidden, cache = affine_forward(
            hidden, self.params['W' + idx], self.params['b' + idx])
        affine_caches.append(cache)

        if self.use_batchnorm:
            hidden, cache = batchnorm_forward(
                hidden, self.params['gamma' + idx],
                self.params['beta' + idx], self.bn_params[i])
            bn_caches.append(cache)

        hidden, cache = relu_forward(hidden)
        relu_caches.append(cache)

        if self.use_dropout:
            hidden, cache = dropout_forward(hidden, self.dropout_param)
            dropout_caches.append(cache)

    last = str(self.num_layers)
    scores, cache = affine_forward(
        hidden, self.params['W' + last], self.params['b' + last])
    affine_caches.append(cache)

    if mode == 'test':
        return scores

    ########################################################################
    # Loss: softmax data loss + 0.5 * reg * sum of squared weights         #
    ########################################################################
    grads = {}
    loss, dscores = softmax_loss(scores, y)

    weight_decay_sum = 0
    for layer in range(1, self.num_layers + 1):
        W = self.params['W' + str(layer)]
        weight_decay_sum = weight_decay_sum + np.sum(W * W)
    loss = loss + 0.5 * self.reg * weight_decay_sum

    ########################################################################
    # Backward pass, from the output layer down to layer 1                 #
    ########################################################################
    dout = dscores
    for layer in range(self.num_layers, 0, -1):
        name = str(layer)
        dout, dw, db = affine_backward(dout, affine_caches[layer - 1])
        grads['W' + name] = dw + self.reg * self.params['W' + name]
        grads['b' + name] = db

        if layer == 1:
            # Nothing precedes the first affine layer.
            break

        # Undo the hidden block that feeds this affine layer, in reverse
        # order of the forward pass: dropout, relu, batchnorm. That block
        # belongs to layer - 1, whose caches sit at list index layer - 2.
        prev = layer - 2
        if self.use_dropout:
            dout = dropout_backward(dout, dropout_caches[prev])
        dout = relu_backward(dout, relu_caches[prev])
        if self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(dout, bn_caches[prev])
            # Scale and shift parameters are not regularized.
            grads['gamma' + str(layer - 1)] = dgamma
            grads['beta' + str(layer - 1)] = dbeta

    return loss, grads
def two_layer_net(X, model, y=None, reg=0.0):
    """
    Compute the loss and gradients for a two layer fully connected neural
    network. The net has an input dimension of D, a hidden layer dimension of
    H, and performs classification over C classes. It uses a softmax loss
    function and L2 regularization on the weight matrices, with a ReLU
    nonlinearity after the first affine layer.

    Architecture:

    input - fully connected layer - ReLU - fully connected layer - softmax

    The outputs of the second fully-connected layer are the scores for each
    class.

    Inputs:
    - X: Input data of shape (N, D). Each X[i] is a training sample.
    - model: Dictionary mapping parameter names to arrays of parameter
      values. It should contain the following:
      - W1: First layer weights; has shape (D, H)
      - b1: First layer biases; has shape (H,)
      - W2: Second layer weights; has shape (H, C)
      - b2: Second layer biases; has shape (C,)
    - y: Vector of training labels. y[i] is the label for X[i], and each y[i]
      is an integer in the range 0 <= y[i] < C. This parameter is optional;
      if it is not passed then only scores are returned, and if it is passed
      then the loss and gradients are returned instead.
    - reg: Regularization strength.

    Returns:
    If y is not passed, return a matrix scores of shape (N, C) where
    scores[i, c] is the score for class c on input X[i].

    If y is passed, instead return a tuple of:
    - loss: Loss (data loss and regularization loss) for this batch of
      training samples.
    - grads: Dictionary mapping parameter names to gradients of those
      parameters with respect to the loss function. This has the same keys
      as model.
    """
    # Unpack the parameters from the model dictionary.
    W1 = model['W1']
    b1 = model['b1']
    W2 = model['W2']
    b2 = model['b2']
    # Unpacking the shape also rejects inputs that are not two-dimensional.
    N, D = X.shape

    from cs231n.layers import affine_forward, relu_forward, softmax_loss
    from cs231n.layers import affine_backward, relu_backward

    # Forward pass: each stage consumes the previous stage's output and
    # keeps a cache for the matching backward call.
    hidden_pre, fc1_cache = affine_forward(X, W1, b1)
    hidden_act, relu_cache = relu_forward(hidden_pre)
    scores, fc2_cache = affine_forward(hidden_act, W2, b2)

    # If the targets are not given then jump out, we're done.
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization (factor 0.5) on W1 and W2.
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)

    # Backward pass, mirroring the forward stages in reverse.
    dhidden_act, dW2, db2 = affine_backward(dscores, fc2_cache)
    dhidden_pre = relu_backward(dhidden_act, relu_cache)
    _, dW1, db1 = affine_backward(dhidden_pre, fc1_cache)

    # Weight gradients need the regularization term as well.
    dW2 += reg * W2
    dW1 += reg * W1

    grads = {'W2': dW2, 'b2': db2, 'W1': dW1, 'b1': db1}
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    params = self.params
    W1, b1 = params['W1'], params['b1']
    W2, b2 = params['W2'], params['b2']

    # Flatten every sample into a row before the first affine layer.
    num_samples = X.shape[0]
    flat_dim = np.prod(X.shape[1:])
    flat_X = X.reshape(num_samples, flat_dim)

    # Forward pass: affine - relu - affine.
    hidden_pre, fc1_cache = affine_forward(flat_X, W1, b1)
    hidden_act, relu_cache = relu_forward(hidden_pre)
    scores, fc2_cache = affine_forward(hidden_act, W2, b2)

    # If y is None then we are in test mode so just return scores.
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization with the 0.5 factor.
    loss, dscores = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

    # Backward pass through the three stages in reverse order.
    dhidden_act, dW2, db2 = affine_backward(dscores, fc2_cache)
    dhidden_pre = relu_backward(dhidden_act, relu_cache)
    _, dW1, db1 = affine_backward(dhidden_pre, fc1_cache)

    # Only the weight matrices are regularized, not the biases.
    dW2 += self.reg * W2
    dW1 += self.reg * W1

    return loss, {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Hidden layers are affine - [batchnorm] - relu - [dropout]; the last layer
    is a plain affine layer whose output is the class scores. The loss is
    softmax plus L2 regularization (factor 0.5) on the W parameters only.

    Input / output: Same as TwoLayerNet above.
    - X: Input data; cast to self.dtype before use.
    - y: Labels, or None to run a test-time forward pass.

    Returns:
    - scores if y is None, otherwise a tuple (loss, grads) where grads[k]
      holds the gradient for self.params[k].
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    ########################################################################
    # Forward pass                                                         #
    ########################################################################
    hidden_caches = []  # one entry per hidden layer
    drop_caches = []    # one entry per hidden layer when dropout is on
    out_layer = X
    for i in range(self.num_layers - 1):
        n = str(i + 1)
        W, b = self.params["W" + n], self.params["b" + n]
        if self.use_batchnorm:
            # The learned gamma/beta are applied in both modes; what
            # batchnorm does with bn_params depends on the mode set above.
            out_layer, cache = affine_bn_relu_forward(
                out_layer, W, b,
                self.params["gamma" + n], self.params["beta" + n],
                self.bn_params[i])
        else:
            out_layer, fc_cache = layers.affine_forward(out_layer, W, b)
            out_layer, relu_cache = layers.relu_forward(out_layer)
            cache = (fc_cache, relu_cache)
        hidden_caches.append(cache)

        if self.use_dropout:
            out_layer, cache = layers.dropout_forward(
                out_layer, self.dropout_param)
            drop_caches.append(cache)

    nn = str(self.num_layers)
    scores, final_cache = layers.affine_forward(
        out_layer, self.params["W" + nn], self.params["b" + nn])

    # If test mode return early.
    if mode == 'test':
        return scores

    ########################################################################
    # Loss                                                                 #
    ########################################################################
    grads = {}
    loss, dout = layers.softmax_loss(scores, y)

    if self.reg != 0:
        for name, value in self.params.items():
            # Only the W parameters are regularized; gamma, beta and b are
            # excluded.
            if name.startswith("W"):
                loss += 0.5 * self.reg * np.sum(value ** 2)

    ########################################################################
    # Backward pass                                                        #
    ########################################################################
    # The regularization gradient always reads the weights from
    # self.params, never from a layer cache, so it does not depend on how
    # affine_forward lays out its cache tuple.
    dout, dw, db = layers.affine_backward(dout, final_cache)
    grads["W" + nn] = dw + self.reg * self.params["W" + nn]
    grads["b" + nn] = db

    for i in range(self.num_layers - 2, -1, -1):
        n = str(i + 1)
        if self.use_dropout:
            dout = layers.dropout_backward(dout, drop_caches[i])

        if self.use_batchnorm:
            (dout, dw, db,
             grads["gamma" + n], grads["beta" + n]) = affine_bn_relu_backward(
                dout, hidden_caches[i])
        else:
            fc_cache, relu_cache = hidden_caches[i]
            dout = layers.relu_backward(dout, relu_cache)
            dout, dw, db = layers.affine_backward(dout, fc_cache)

        grads["W" + n] = dw + self.reg * self.params["W" + n]
        grads["b" + n] = db

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Hidden layers are affine - [batch/layer norm] - relu - [dropout]; the
    output layer is a plain affine layer whose result is the class scores.
    The loss is softmax plus L2 regularization (factor 0.5) on the weight
    matrices; scale and shift parameters are not regularized.

    Input / output: Same as TwoLayerNet above.
    - X: Input data; cast to self.dtype before use.
    - y: Labels, or None to run a test-time forward pass.

    Returns:
    - scores if y is None, otherwise a tuple (loss, grads) where grads[k]
      holds the gradient for self.params[k].
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    ############################################################################
    # Forward pass                                                             #
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    arg, caches = X, []
    for i in range(1, self.num_layers + 1):
        cache = {}
        arg, cache['fc_cache'] = affine_forward(
            arg, self.params[f"W{i}"], self.params[f"b{i}"])

        # Normalization, ReLU and dropout belong to hidden layers only; the
        # output layer must deliver raw, unclipped scores to the softmax.
        if i != self.num_layers:
            if self.normalization:
                # Compare strings with ==, not `is`: identity of equal
                # string objects is an interpreter detail.
                if self.normalization == 'batchnorm':
                    normalize_forward = batchnorm_forward
                else:
                    normalize_forward = layernorm_forward
                arg, cache['bn_cache'] = normalize_forward(
                    arg, self.params[f"gamma{i}"], self.params[f"beta{i}"],
                    self.bn_params[i - 1])

            arg, cache['relu_cache'] = relu_forward(arg)

            if self.use_dropout:
                arg, cache['dropout_cache'] = dropout_forward(
                    arg, self.dropout_param)

        caches.append(cache)

    scores = arg
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # If test mode return early.
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # Backward pass                                                            #
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    loss, dout = softmax_loss(scores, y)

    for i in range(self.num_layers, 0, -1):
        W = self.params[f"W{i}"]
        cache = caches[i - 1]

        # Mirror the forward pass: only hidden layers have dropout, ReLU and
        # normalization stages to undo, in reverse order.
        if i != self.num_layers:
            if self.use_dropout:
                dout = dropout_backward(dout, cache['dropout_cache'])

            dout = relu_backward(dout, cache['relu_cache'])

            if self.normalization:
                # Dispatch to the backward function that matches the forward
                # one; calling batchnorm_backward unconditionally would be
                # wrong whenever layer normalization is in use.
                if self.normalization == 'batchnorm':
                    normalize_backward = batchnorm_backward
                else:
                    normalize_backward = layernorm_backward
                dout, dgamma, dbeta = normalize_backward(
                    dout, cache['bn_cache'])
                grads[f"gamma{i}"] = dgamma
                grads[f"beta{i}"] = dbeta

        dout, dw, db = affine_backward(dout, cache['fc_cache'])
        grads[f"W{i}"] = dw + self.reg * W
        grads[f"b{i}"] = db
        loss += 0.5 * self.reg * np.sum(W * W)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return loss, grads
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout) _, cache = affine_forward(x, w, b) dx, dw, db = affine_backward(dout, cache) # The error should be around 1e-10 print('Testing affine_backward function:') print('dx error: ', rel_error(dx_num, dx)) print('dw error: ', rel_error(dw_num, dw)) print('db error: ', rel_error(db_num, db)) # Test the relu_forward function x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4) out, _ = relu_forward(x) correct_out = np.array([[ 0., 0., 0., 0., ], [ 0., 0., 0.04545455, 0.13636364, ], [ 0.22727273, 0.31818182, 0.40909091, 0.5,
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    # Forward pass: affine - relu - affine.
    hidden1_out, h1_cache = affine_forward(X, W1, b1)
    relu_out, relu_cache = relu_forward(hidden1_out)
    scores, h2_cache = affine_forward(relu_out, W2, b2)

    # If y is None then we are in test mode so just return scores.
    if y is None:
        return scores

    # Softmax data loss plus L2 regularization; the 0.5 factor keeps the
    # regularization gradient at reg * W.
    loss, dscore = softmax_loss(scores, y)
    loss = loss + self.reg * 0.5 * (np.sum(W2 * W2) + np.sum(W1 * W1))

    # Backward pass through the layer helpers. An earlier hand-written
    # gradient computation that had been left behind in a bare string
    # literal was dead code and has been removed.
    grads_h2, grads_w2, grads_b2 = affine_backward(dscore, h2_cache)
    grads_relu = relu_backward(grads_h2, relu_cache)
    _, grads_w1, grads_b1 = affine_backward(grads_relu, h1_cache)

    # Only the weight matrices are regularized, not the biases.
    grads = {
        'W1': grads_w1 + self.reg * W1,
        'W2': grads_w2 + self.reg * W2,
        'b1': grads_b1,
        'b2': grads_b2,
    }
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    # Forward pass: affine - relu - affine. The second affine output is used
    # directly as the scores; no softmax is applied for prediction.
    hidden_pre, cache_affine1 = layers.affine_forward(
        X, self.params["W1"], self.params["b1"])
    hidden_act, cache_relu1 = layers.relu_forward(hidden_pre)
    scores, cache_affine2 = layers.affine_forward(
        hidden_act, self.params["W2"], self.params["b2"])

    # If y is None then we are in test mode so just return scores.
    if y is None:
        return scores

    # Training: softmax data loss plus L2 regularization (factor 0.5).
    loss, dscores = layers.softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(self.params["W1"]**2) +
                              np.sum(self.params["W2"]**2))

    # Backprop through the second affine layer.
    grads = {}
    dhidden_act, dW2, db2 = layers.affine_backward(dscores, cache_affine2)
    grads["W2"] = dW2 + self.reg * self.params["W2"]
    grads["b2"] = db2

    # Backprop through the ReLU and the first affine layer.
    dhidden_pre = layers.relu_backward(dhidden_act, cache_relu1)
    _, dW1, db1 = layers.affine_backward(dhidden_pre, cache_affine1)
    grads["W1"] = dW1 + self.reg * self.params["W1"]
    grads["b1"] = db1

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    The forward pass is affine -> ReLU -> affine; the training loss is
    softmax plus L2 regularization (with a 0.5 factor) on W1 and W2.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, b1 = self.params["W1"], self.params["b1"]
    W2, b2 = self.params["W2"], self.params["b2"]

    # Forward pass. The output of the second affine layer is used directly
    # as the scores: it is not referenced again below, so no defensive copy
    # is required.
    fc_1, cache_fc_1 = affine_forward(X, W1, b1)
    relu_1, cache_relu_1 = relu_forward(fc_1)
    scores, cache_fc_2 = affine_forward(relu_1, W2, b2)

    # If y is None then we are in test mode so just return scores.
    if y is None:
        return scores

    # Data loss plus L2 penalty; the 0.5 factor makes the penalty's gradient
    # simply reg * W.
    loss, d_scores = softmax_loss(scores, y)
    loss += 0.5 * self.reg * \
        (np.sum(np.square(W1)) + np.sum(np.square(W2)))

    # Backward pass through the layers in reverse order.
    d_relu_1, d_W2, d_b2 = affine_backward(d_scores, cache_fc_2)
    d_fc_1 = relu_backward(d_relu_1, cache_relu_1)
    _, d_W1, d_b1 = affine_backward(d_fc_1, cache_fc_1)

    grads = {"W1": d_W1, "W2": d_W2, "b1": d_b1, "b2": d_b2}

    # Add the regularization gradient to the weight gradients (biases are
    # not regularized).
    grads["W2"] += self.reg * W2
    grads["W1"] += self.reg * W1

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.

    Each of the first num_layers - 1 layers is
    affine -> [batchnorm] -> ReLU -> [dropout]; the last layer is a plain
    affine layer whose output is the class scores. The training loss is
    softmax plus L2 regularization (factor 0.5) on every weight matrix
    "W<i>"; the batchnorm scale/shift parameters are not regularized.

    NOTE(review): the regularization strength is read from self.reg, which
    is assumed to be set by the constructor as in TwoLayerNet -- confirm.

    Inputs:
    - X: Array of input data; cast to self.dtype before use.
    - y: Array of labels, or None to run in test mode.

    Returns:
    - scores when y is None, otherwise a (loss, grads) tuple where grads
      maps parameter names to gradients.
    """
    X = X.astype(self.dtype)
    mode = "test" if y is None else "train"

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param["mode"] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param["mode"] = mode

    num_layers = self.num_layers

    # ---- Forward pass ----------------------------------------------------
    # `out` is the running activation, starting from the input itself so
    # that a network without hidden layers feeds X straight into the final
    # affine layer. hidden_caches[i - 1] holds the caches of hidden layer i
    # (layers are numbered from 1, matching the parameter names).
    out = X
    hidden_caches = []
    for i in range(1, num_layers):
        idx = str(i)

        out, fc_cache = affine_forward(
            out, self.params["W" + idx], self.params["b" + idx])

        bn_cache = None
        if self.use_batchnorm:
            # Layer i uses self.bn_params[i - 1] (bn_params is 0-based).
            out, bn_cache = batchnorm_forward(
                out,
                gamma=self.params["gamma" + idx],
                beta=self.params["beta" + idx],
                bn_param=self.bn_params[i - 1],
            )

        out, relu_cache = relu_forward(out)

        dropout_cache = None
        if self.use_dropout:
            out, dropout_cache = dropout_forward(
                out, dropout_param=self.dropout_param)

        hidden_caches.append((fc_cache, bn_cache, relu_cache, dropout_cache))

    # Final layer: affine only; its output is the class scores.
    last = str(num_layers)
    scores, final_cache = affine_forward(
        out, self.params["W" + last], self.params["b" + last])

    # If test mode return early.
    if mode == "test":
        return scores

    # ---- Loss --------------------------------------------------------------
    grads = {}
    loss, d_scores = softmax_loss(scores, y)

    # L2 regularization on every weight matrix; the 0.5 factor makes the
    # corresponding gradient term simply reg * W.
    for i in range(1, num_layers + 1):
        W = self.params["W" + str(i)]
        loss += 0.5 * self.reg * (W * W).sum()

    # ---- Backward pass -----------------------------------------------------
    # `dout` is the running upstream gradient, starting at the final layer.
    dout, dW, db = affine_backward(d_scores, final_cache)
    grads["W" + last] = dW + self.reg * self.params["W" + last]
    grads["b" + last] = db

    # Hidden layers, from layer num_layers - 1 down to layer 1, undoing each
    # forward step in reverse order.
    for i in range(num_layers - 1, 0, -1):
        idx = str(i)
        fc_cache, bn_cache, relu_cache, dropout_cache = hidden_caches[i - 1]

        if self.use_dropout:
            dout = dropout_backward(dout, dropout_cache)

        dout = relu_backward(dout, relu_cache)

        if self.use_batchnorm:
            dout, dgamma, dbeta = batchnorm_backward(dout, cache=bn_cache)
            grads["gamma" + idx] = dgamma
            grads["beta" + idx] = dbeta

        dout, dW, db = affine_backward(dout, fc_cache)
        grads["W" + idx] = dW + self.reg * self.params["W" + idx]
        grads["b" + idx] = db

    return loss, grads