def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Tensor of input data of shape (N, d_1, ..., d_k)
    - y: int64 Tensor of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Tensor of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ###########################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the   #
    # class scores for X and storing them in the scores variable.             #
    ###########################################################################
    # Replace "pass" statement with your code
    h1, cache1 = Linear_ReLU.forward(X, self.params['W1'], self.params['b1'])
    scores, cache2 = Linear.forward(h1, self.params['W2'], self.params['b2'])
    ###########################################################################
    #                            END OF YOUR CODE                             #
    ###########################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
        return scores

    loss, grads = 0, {}
    ###########################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss #
    # in the loss variable and gradients in the grads dictionary. Compute the #
    # data loss using softmax, and make sure that grads[k] holds the          #
    # gradients for self.params[k]. Don't forget to add L2 regularization!    #
    #                                                                         #
    # NOTE: To ensure that your implementation matches ours and you pass the  #
    # automated tests, make sure that your L2 regularization does not include #
    # a factor of 0.5.                                                        #
    ###########################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    loss += self.reg * torch.sum(self.params['W1'] ** 2) \
          + self.reg * torch.sum(self.params['W2'] ** 2)
    dh1, grads['W2'], grads['b2'] = Linear.backward(dout, cache2)
    grads['W2'] += 2 * self.reg * self.params['W2']
    _, grads['W1'], grads['b1'] = Linear_ReLU.backward(dh1, cache1)
    grads['W1'] += 2 * self.reg * self.params['W1']
    ###########################################################################
    #                            END OF YOUR CODE                             #
    ###########################################################################

    return loss, grads
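# ---------------------------------------------------------------------------
# Reference sketch (not part of the solution above): the helpers Linear,
# Linear_ReLU, and softmax_loss are defined elsewhere in this codebase. For
# readers who want to see what softmax_loss(scores, y) is expected to return,
# the function below is a minimal sketch of a numerically stable softmax
# cross-entropy loss with its gradient; the actual helper used above may be
# implemented differently. It assumes torch is imported at module level, as
# the code above already requires.
def _softmax_loss_sketch(scores, y):
    """Return (loss, dscores) for softmax cross-entropy over N examples."""
    N = scores.shape[0]
    rows = torch.arange(N, device=scores.device)
    # Subtract the per-row max so exp() cannot overflow.
    shifted = scores - scores.max(dim=1, keepdim=True).values
    log_probs = shifted - shifted.exp().sum(dim=1, keepdim=True).log()
    loss = -log_probs[rows, y].mean()
    # d(loss)/d(scores): softmax probabilities, minus 1 at the true class,
    # averaged over the minibatch.
    dscores = log_probs.exp()
    dscores[rows, y] -= 1.0
    dscores /= N
    return loss, dscores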
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    hiddens = {}
    caches = {}
    dropouts = {}
    N = self.num_layers - 1
    if self.use_dropout:
        dropouts['d0'] = X
        for i in range(N):
            hiddens[f'h{i+1}'], caches[f'l{i}'] = Linear_ReLU.forward(
                dropouts[f'd{i}'], self.params[f'W{i}'], self.params[f'b{i}'])
            dropouts[f'd{i+1}'], caches[f'd{i+1}'] = Dropout.forward(
                hiddens[f'h{i+1}'], self.dropout_param)
        scores, caches[f'l{N}'] = Linear.forward(
            dropouts[f'd{N}'], self.params[f'W{N}'], self.params[f'b{N}'])
    else:
        hiddens['h0'] = X
        for i in range(N):
            hiddens[f'h{i+1}'], caches[f'l{i}'] = Linear_ReLU.forward(
                hiddens[f'h{i}'], self.params[f'W{i}'], self.params[f'b{i}'])
        scores, caches[f'l{N}'] = Linear.forward(
            hiddens[f'h{N}'], self.params[f'W{N}'], self.params[f'b{N}'])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    # Replace "pass" statement with your code
    # calculate loss
    loss, da = softmax_loss(scores, y)
    reg_sum = 0.0
    for i in range(self.num_layers):
        reg_sum += torch.sum(self.params[f'W{i}'] * self.params[f'W{i}'])
    loss += self.reg * reg_sum

    # calculate gradients
    ds = {}
    if self.use_dropout:
        for i in reversed(range(self.num_layers)):
            if i == self.num_layers - 1:
                ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear.backward(
                    da, caches[f'l{i}'])
            else:
                ds[f'dd{i+1}'] = Dropout.backward(ds[f'dh{i+1}'], caches[f'd{i+1}'])
                ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear_ReLU.backward(
                    ds[f'dd{i+1}'], caches[f'l{i}'])
    else:
        for i in reversed(range(self.num_layers)):
            if i == self.num_layers - 1:
                ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear.backward(
                    da, caches[f'l{i}'])
            else:
                ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear_ReLU.backward(
                    ds[f'dh{i+1}'], caches[f'l{i}'])
    for i in range(self.num_layers):
        grads[f'W{i}'] = ds[f'dW{i}'] + 2 * self.reg * self.params[f'W{i}']
        grads[f'b{i}'] = ds[f'db{i}']
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
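# ---------------------------------------------------------------------------
# Reference sketch (not part of the solution above): Dropout.forward and
# Dropout.backward are provided elsewhere in the assignment. The pair of
# functions below sketches standard inverted dropout under the assumption
# that dropout_param holds 'p' (the probability of dropping a unit) and
# 'mode' ('train' or 'test'), which matches how self.dropout_param is used
# above; the real implementation and its conventions may differ.
def _dropout_forward_sketch(x, dropout_param):
    p, mode = dropout_param['p'], dropout_param['mode']
    if mode == 'train':
        # Keep each unit with probability (1 - p), then rescale so the
        # expected activation matches test time (inverted dropout).
        mask = (torch.rand_like(x) > p).to(x.dtype) / (1 - p)
        out = x * mask
    else:
        mask = None
        out = x
    return out, (dropout_param, mask)


def _dropout_backward_sketch(dout, cache):
    dropout_param, mask = cache
    if dropout_param['mode'] == 'train':
        # Gradient flows only through the units that were kept, scaled by
        # the same 1 / (1 - p) factor applied in the forward pass.
        return dout * mask
    return dout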
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    h = X
    relu_caches = {}
    dropout_caches = {}
    cache = None
    for i in range(1, self.num_layers):
        h, relu_caches[i] = Linear_ReLU.forward(
            h, self.params[f'W{i}'], self.params[f'b{i}'])
        if self.use_dropout:
            h, dropout_caches[i] = Dropout.forward(h, self.dropout_param)
    scores, cache = Linear.forward(
        h, self.params[f'W{self.num_layers}'], self.params[f'b{self.num_layers}'])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    for i in range(1, self.num_layers + 1):
        loss += self.reg * torch.sum(self.params[f'W{i}'] ** 2)

    dh, grads[f'W{self.num_layers}'], grads[f'b{self.num_layers}'] = Linear.backward(dout, cache)
    grads[f'W{self.num_layers}'] += 2 * self.reg * self.params[f'W{self.num_layers}']
    for i in range(self.num_layers - 1, 0, -1):
        if self.use_dropout:
            dh = Dropout.backward(dh, dropout_caches[i])
        dh, grads[f'W{i}'], grads[f'b{i}'] = Linear_ReLU.backward(dh, relu_caches[i])
        grads[f'W{i}'] += 2 * self.reg * self.params[f'W{i}']
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    cache_dict = {}
    last_out = X
    for n in range(self.num_layers - 1):
        i = n + 1
        last_out, cache_dict['cache_LR{}'.format(i)] = Linear_ReLU.forward(
            last_out, self.params['W{}'.format(i)], self.params['b{}'.format(i)])
        if self.use_dropout:
            last_out, cache_dict['cache_Dropout{}'.format(i)] = Dropout.forward(
                last_out, self.dropout_param)
    i += 1
    last_out, cache_dict['cache_L{}'.format(i)] = Linear.forward(
        last_out, self.params['W{}'.format(i)], self.params['b{}'.format(i)])
    scores = last_out
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    loss += (self.params['W{}'.format(i)] * self.params['W{}'.format(i)]).sum() * self.reg

    last_dout, dw, db = Linear.backward(dout, cache_dict['cache_L{}'.format(i)])
    grads['W{}'.format(i)] = dw + 2 * self.params['W{}'.format(i)] * self.reg
    grads['b{}'.format(i)] = db

    for n in range(self.num_layers - 1)[::-1]:
        i = n + 1
        if self.use_dropout:
            last_dout = Dropout.backward(
                last_dout, cache_dict['cache_Dropout{}'.format(i)])
        last_dout, dw, db = Linear_ReLU.backward(
            last_dout, cache_dict['cache_LR{}'.format(i)])
        grads['W{}'.format(i)] = dw + 2 * self.params['W{}'.format(i)] * self.reg
        grads['b{}'.format(i)] = db
        loss += (self.params['W{}'.format(i)] * self.params['W{}'.format(i)]).sum() * self.reg
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
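# ---------------------------------------------------------------------------
# Reference sketch (not part of the solution above): every implementation in
# this file relies on Linear_ReLU.forward / Linear_ReLU.backward, which fuse
# an affine layer with a ReLU. The pair below sketches that computation with
# raw tensor operations; the actual layer in this codebase caches different
# objects, so this is only meant to make the math explicit.
def _linear_relu_forward_sketch(x, w, b):
    # Flatten each example to a row vector, apply the affine map, then ReLU.
    a = x.reshape(x.shape[0], -1).mm(w) + b
    out = a.clamp(min=0)
    cache = (x, w, a)
    return out, cache


def _linear_relu_backward_sketch(dout, cache):
    x, w, a = cache
    # ReLU passes gradient only where its pre-activation input was positive.
    da = dout * (a > 0).to(dout.dtype)
    dx = da.mm(w.t()).reshape(x.shape)
    dw = x.reshape(x.shape[0], -1).t().mm(da)
    db = da.sum(dim=0)
    return dx, dw, db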
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    if self.use_dropout:
        a = [None] * (self.num_layers + 1)
        cache = [None] * (self.num_layers + 1)
        cache_dropout = [None] * (self.num_layers + 1)
        a[0] = X
        cache[0] = 0
        for i in range(1, self.num_layers):
            W_str = 'W{}'.format(str(i))
            b_str = 'b{}'.format(str(i))
            a[i], cache[i] = Linear_ReLU.forward(a[i-1], self.params[W_str], self.params[b_str])
            a[i], cache_dropout[i] = Dropout.forward(a[i], self.dropout_param)
        W_linear = 'W{}'.format(str(self.num_layers))
        b_linear = 'b{}'.format(str(self.num_layers))
        a[self.num_layers], cache[self.num_layers] = Linear.forward(
            a[self.num_layers-1], self.params[W_linear], self.params[b_linear])
        scores = a[self.num_layers]
    else:
        a = [None] * (self.num_layers + 1)
        cache = [None] * (self.num_layers + 1)
        a[0] = X
        cache[0] = 0
        for i in range(1, self.num_layers):
            W_str = 'W{}'.format(str(i))
            b_str = 'b{}'.format(str(i))
            a[i], cache[i] = Linear_ReLU.forward(a[i-1], self.params[W_str], self.params[b_str])
        W_linear = 'W{}'.format(str(self.num_layers))
        b_linear = 'b{}'.format(str(self.num_layers))
        a[self.num_layers], cache[self.num_layers] = Linear.forward(
            a[self.num_layers-1], self.params[W_linear], self.params[b_linear])
        scores = a[self.num_layers]
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    # Replace "pass" statement with your code
    if self.use_dropout:
        loss, dout_softmax = softmax_loss(scores, y)
        dout = [None] * (self.num_layers + 1)
        W_linear = 'W{}'.format(str(self.num_layers))
        b_linear = 'b{}'.format(str(self.num_layers))
        dout[self.num_layers], grads[W_linear], grads[b_linear] = Linear.backward(
            dout_softmax, cache[self.num_layers])
        grads[W_linear] += 2 * self.reg * self.params[W_linear]
        loss += self.reg * torch.sum(self.params[W_linear] * self.params[W_linear])
        for i in range(self.num_layers - 1, 0, -1):
            W_str = 'W{}'.format(str(i))
            b_str = 'b{}'.format(str(i))
            dout[i+1] = Dropout.backward(dout[i+1], cache_dropout[i])
            dout[i], grads[W_str], grads[b_str] = Linear_ReLU.backward(dout[i+1], cache[i])
            grads[W_str] += 2 * self.reg * self.params[W_str]
            loss += self.reg * torch.sum(self.params[W_str] * self.params[W_str])
    else:
        loss, dout_softmax = softmax_loss(scores, y)
        dout = [None] * (self.num_layers + 1)
        W_linear = 'W{}'.format(str(self.num_layers))
        b_linear = 'b{}'.format(str(self.num_layers))
        dout[self.num_layers], grads[W_linear], grads[b_linear] = Linear.backward(
            dout_softmax, cache[self.num_layers])
        grads[W_linear] += 2 * self.reg * self.params[W_linear]
        loss += self.reg * torch.sum(self.params[W_linear] * self.params[W_linear])
        for i in range(self.num_layers - 1, 0, -1):
            W_str = 'W{}'.format(str(i))
            b_str = 'b{}'.format(str(i))
            dout[i], grads[W_str], grads[b_str] = Linear_ReLU.backward(dout[i+1], cache[i])
            grads[W_str] += 2 * self.reg * self.params[W_str]
            loss += self.reg * torch.sum(self.params[W_str] * self.params[W_str])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    out = X
    forward_cache = []
    dropout_cache = []
    for i in range(self.num_layers - 1):
        name_W = 'W' + '{}'.format(i + 1)
        name_b = 'b' + '{}'.format(i + 1)
        out, cache = Linear_ReLU.forward(x=out, w=self.params[name_W], b=self.params[name_b])
        forward_cache.append(cache)
        if self.use_dropout:
            out, cache = Dropout.forward(out, self.dropout_param)
            dropout_cache.append(cache)
    name_W = 'W' + '{}'.format(self.num_layers)
    name_b = 'b' + '{}'.format(self.num_layers)
    out, cache = Linear.forward(out, self.params[name_W], self.params[name_b])
    forward_cache.append(cache)
    scores = out
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    for i in range(self.num_layers):
        name_W = 'W' + '{}'.format(i + 1)
        loss += self.reg * torch.sum(self.params[name_W] ** 2)

    for i in range(self.num_layers, 0, -1):
        name_W = 'W' + '{}'.format(i)
        name_b = 'b' + '{}'.format(i)
        if i == self.num_layers:
            grads_x, grads[name_W], grads[name_b] = Linear.backward(
                dout, forward_cache.pop())
            grads[name_W] += 2 * self.reg * self.params[name_W]
        else:
            if self.use_dropout:
                grads_x = Dropout.backward(grads_x, dropout_cache.pop())
            grads_x, grads[name_W], grads[name_b] = Linear_ReLU.backward(
                grads_x, forward_cache.pop())
            grads[name_W] += 2 * self.reg * self.params[name_W]
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
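# ---------------------------------------------------------------------------
# Usage sketch (not part of the solutions above): a quick way to sanity-check
# any of the loss() implementations in this file is a centered finite
# difference on a few randomly chosen parameter entries. 'model', 'X_small',
# and 'y_small' are placeholders for an already-built network and a tiny
# batch; the check is most reliable with dropout disabled and float64
# parameters, since single precision and random masks add noise.
def _grad_check_sketch(model, X_small, y_small, h=1e-6):
    _, grads = model.loss(X_small, y_small)
    for name, param in model.params.items():
        # Pick one random entry of this parameter tensor to perturb.
        idx = tuple(torch.randint(s, (1,)).item() for s in param.shape)
        old = param[idx].item()
        param[idx] = old + h
        loss_plus = model.loss(X_small, y_small)[0]
        param[idx] = old - h
        loss_minus = model.loss(X_small, y_small)[0]
        param[idx] = old  # restore the original value
        num_grad = (float(loss_plus) - float(loss_minus)) / (2 * h)
        ana_grad = float(grads[name][idx])
        rel_err = abs(num_grad - ana_grad) / max(1e-12, abs(num_grad) + abs(ana_grad))
        print(f'{name}: relative error {rel_err:.2e}')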