def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Tensor of input data of shape (N, d_1, ..., d_k)
    - y: int64 Tensor of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Tensor of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.
    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    ###########################################################################
    # TODO: Implement the forward pass for the two-layer net, computing the   #
    # class scores for X and storing them in the scores variable.             #
    ###########################################################################
    # Replace "pass" statement with your code
    h1, cache1 = Linear_ReLU.forward(X, self.params['W1'], self.params['b1'])
    scores, cache2 = Linear.forward(h1, self.params['W2'], self.params['b2'])
    ###########################################################################
    #                            END OF YOUR CODE                             #
    ###########################################################################

    # If y is None then we are in test mode so just return scores
    if y is None:
      return scores

    loss, grads = 0, {}
    ###########################################################################
    # TODO: Implement the backward pass for the two-layer net. Store the loss #
    # in the loss variable and gradients in the grads dictionary. Compute data#
    # loss using softmax, and make sure that grads[k] holds the gradients for #
    # self.params[k]. Don't forget to add L2 regularization!                  #
    #                                                                         #
    # NOTE: To ensure that your implementation matches ours and you pass the  #
    # automated tests, make sure that your L2 regularization does not include #
    # a factor of 0.5.                                                        #
    ###########################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    loss += self.reg * torch.sum(self.params['W1'] ** 2) + self.reg * torch.sum(self.params['W2'] ** 2)

    dh1, grads['W2'], grads['b2'] = Linear.backward(dout, cache2)
    grads['W2'] += 2 * self.reg * self.params['W2']
    _, grads['W1'], grads['b1'] = Linear_ReLU.backward(dh1, cache1)
    grads['W1'] += 2 * self.reg * self.params['W1']
    ###########################################################################
    #                            END OF YOUR CODE                             #
    ###########################################################################

    return loss, grads
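
# Every loss() implementation in this listing calls a softmax_loss helper that
# the assignment provides elsewhere. The following is only a minimal sketch of
# such a helper, assuming it returns the mean cross-entropy loss together with
# the gradient of that loss with respect to the scores; the assignment's own
# version may differ in details.
import torch

def softmax_loss(scores, y):
    """
    Numerically stable softmax cross-entropy loss, averaged over the batch.

    Inputs:
    - scores: Tensor of shape (N, C) giving class scores
    - y: int64 Tensor of shape (N,) giving ground-truth labels

    Returns a tuple of:
    - loss: scalar mean cross-entropy loss
    - dscores: Tensor of shape (N, C), gradient of the loss w.r.t. scores
    """
    N = scores.shape[0]
    # Shift by the row-wise max before exponentiating for numerical stability.
    shifted = scores - scores.max(dim=1, keepdim=True).values
    log_probs = shifted - shifted.exp().sum(dim=1, keepdim=True).log()
    probs = log_probs.exp()
    # Mean negative log-likelihood of the correct classes.
    loss = -log_probs[torch.arange(N), y].mean()
    # Gradient: softmax probabilities minus one-hot targets, divided by N.
    dscores = probs.clone()
    dscores[torch.arange(N), y] -= 1.0
    dscores /= N
    return loss, dscores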
Example No. 2
  def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.
    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
      self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code

    hiddens = {}
    caches = {}
    dropouts = {}

    N = self.num_layers - 1
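    # Note: unlike the other snippets in this listing, this implementation
    # indexes its parameters from zero (W0, b0, ... up to layer N), so it
    # assumes an __init__ that names them the same way.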

    if self.use_dropout:
      dropouts['d0'] = X
      for i in range(N):
        hiddens[f'h{i+1}'], caches[f'l{i}'] = Linear_ReLU.forward(dropouts[f'd{i}'], self.params[f'W{i}'], self.params[f'b{i}'])
        dropouts[f'd{i+1}'], caches[f'd{i+1}'] = Dropout.forward(hiddens[f'h{i+1}'], self.dropout_param)
      scores, caches[f'l{N}'] = Linear.forward(dropouts[f'd{N}'], self.params[f'W{N}'], self.params[f'b{N}'])
    else:
      hiddens['h0'] = X
      for i in range(N):
        hiddens[f'h{i+1}'], caches[f'l{i}'] = Linear_ReLU.forward(hiddens[f'h{i}'], self.params[f'W{i}'], self.params[f'b{i}'])
      scores, caches[f'l{N}'] = Linear.forward(hiddens[f'h{N}'], self.params[f'W{N}'], self.params[f'b{N}'])

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
      return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization does not include  #
    # a factor of 0.5.                                                          #
    ############################################################################
    # Replace "pass" statement with your code


    # calculate loss
    loss, da = softmax_loss(scores, y)

    reg_sum = 0.0
    for i in range(self.num_layers):
      reg_sum += torch.sum(self.params[f'W{i}'] * self.params[f'W{i}'])
    loss += self.reg * reg_sum

    # calculate gradients
    ds = {}
    if self.use_dropout:
      for i in reversed(range(self.num_layers)):
        if i == self.num_layers-1:
          ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear.backward(da, caches[f'l{i}'])
        else:
          ds[f'dd{i+1}'] = Dropout.backward(ds[f'dh{i+1}'], caches[f'd{i+1}'])
          ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear_ReLU.backward(ds[f'dd{i+1}'], caches[f'l{i}'])
    else:
      for i in reversed(range(self.num_layers)):
        if i == self.num_layers-1:
          ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear.backward(da, caches[f'l{i}'])
        else:
          ds[f'dh{i}'], ds[f'dW{i}'], ds[f'db{i}'] = Linear_ReLU.backward(ds[f'dh{i+1}'], caches[f'l{i}'])

    for i in range(self.num_layers):
      grads[f'W{i}'] = ds[f'dW{i}'] + 2 * self.reg * self.params[f'W{i}']
      grads[f'b{i}'] = ds[f'db{i}']

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
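
# The modular Linear and Linear_ReLU layers used by all of these snippets are
# also provided by the assignment. Below is only a minimal sketch of the
# assumed forward/backward API (input, weights and bias in, output plus a cache
# out; cache in, gradients out); the assignment's own layers may differ.
import torch

class Linear(object):

    @staticmethod
    def forward(x, w, b):
        # Flatten every dimension but the first, then apply an affine transform.
        out = x.reshape(x.shape[0], -1).mm(w) + b
        cache = (x, w, b)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        x, w, b = cache
        dx = dout.mm(w.t()).reshape(x.shape)
        dw = x.reshape(x.shape[0], -1).t().mm(dout)
        db = dout.sum(dim=0)
        return dx, dw, db


class Linear_ReLU(object):

    @staticmethod
    def forward(x, w, b):
        # Affine transform followed by an elementwise ReLU.
        a, fc_cache = Linear.forward(x, w, b)
        out = a.clamp(min=0)
        cache = (fc_cache, a)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        fc_cache, a = cache
        # Pass the upstream gradient through only where the ReLU was active.
        da = dout * (a > 0).to(dout.dtype)
        return Linear.backward(da, fc_cache)

Example No. 3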
  def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.
    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
      self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    h = X
    relu_caches = {}
    dropout_caches = {}
    cache = None
    for i in range(1, self.num_layers):
      h, relu_caches[i] = Linear_ReLU.forward(h, self.params[f'W{i}'], self.params[f'b{i}'])
      if self.use_dropout:
        h, dropout_caches[i] = Dropout.forward(h, self.dropout_param)
    scores, cache = Linear.forward(h, self.params[f'W{self.num_layers}'], self.params[f'b{self.num_layers}'])
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
      return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization does not include  #
    # a factor of 0.5.                                                          #
    ############################################################################
    # Replace "pass" statement with your code
    loss, dout = softmax_loss(scores, y)
    for i in range(1, self.num_layers + 1):
      loss += self.reg * torch.sum(self.params[f'W{i}'] ** 2)
    
    dh, grads[f'W{self.num_layers}'], grads[f'b{self.num_layers}'] = Linear.backward(dout, cache)
    grads[f'W{self.num_layers}'] += 2 * self.reg * self.params[f'W{self.num_layers}']
    for i in range(self.num_layers - 1, 0, -1):
      if self.use_dropout:
        dh = Dropout.backward(dh, dropout_caches[i])
      dh, grads[f'W{i}'], grads[f'b{i}'] = Linear_ReLU.backward(dh, relu_caches[i])
      grads[f'W{i}'] += 2 * self.reg * self.params[f'W{i}']
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
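
# The dropout-enabled paths above pass self.dropout_param into a Dropout
# module. A minimal sketch of an inverted-dropout layer with that interface
# follows; the dictionary keys ('p' as the drop probability, 'mode', and an
# optional 'seed') follow the usual assignment convention and are assumptions.
import torch

class Dropout(object):

    @staticmethod
    def forward(x, dropout_param):
        p, mode = dropout_param['p'], dropout_param['mode']
        if 'seed' in dropout_param:
            torch.manual_seed(dropout_param['seed'])
        if mode == 'train':
            # Inverted dropout: drop each unit with probability p and rescale
            # the survivors so that the test-time pass needs no scaling.
            mask = (torch.rand_like(x) > p).to(x.dtype) / (1 - p)
            out = x * mask
        else:
            mask = None
            out = x
        return out, (dropout_param, mask)

    @staticmethod
    def backward(dout, cache):
        dropout_param, mask = cache
        if dropout_param['mode'] == 'train':
            return dout * mask
        return dout

Example No. 4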
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.
        Input / output: Same as TwoLayerNet above.
        """
        X = X.to(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        ############################################################################
        # Replace "pass" statement with your code
        cache_dict = {}
        last_out = X
        for n in range(self.num_layers - 1):
            i = n + 1
            last_out, cache_dict['cache_LR{}'.format(i)] = Linear_ReLU.forward(
                last_out, self.params['W{}'.format(i)],
                self.params['b{}'.format(i)])
            if self.use_dropout:
                last_out, cache_dict['cache_Dropout{}'.format(i)] = Dropout.forward(
                    last_out, self.dropout_param)
        # Index of the final Linear layer; assigning it explicitly also keeps
        # this correct when self.num_layers == 1 and the loop above never runs.
        i = self.num_layers
        last_out, cache_dict['cache_L{}'.format(i)] = Linear.forward(
            last_out, self.params['W{}'.format(i)],
            self.params['b{}'.format(i)])
        scores = last_out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If test mode return early
        if mode == 'test':
            return scores

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization does not include  #
        # a factor of 0.5.                                                          #
        ############################################################################
        # Replace "pass" statement with your code
        loss, dout = softmax_loss(scores, y)
        loss += (self.params['W{}'.format(i)] ** 2).sum() * self.reg
        last_dout, dw, db = Linear.backward(dout,
                                            cache_dict['cache_L{}'.format(i)])
        grads['W{}'.format(i)] = dw + 2 * self.reg * self.params['W{}'.format(i)]
        grads['b{}'.format(i)] = db
        for n in reversed(range(self.num_layers - 1)):
            i = n + 1
            if self.use_dropout:
                last_dout = Dropout.backward(
                    last_dout, cache_dict['cache_Dropout{}'.format(i)])
            last_dout, dw, db = Linear_ReLU.backward(
                last_dout, cache_dict['cache_LR{}'.format(i)])
            grads['W{}'.format(i)] = dw + 2 * self.reg * self.params['W{}'.format(i)]
            grads['b{}'.format(i)] = db
            loss += (self.params['W{}'.format(i)] ** 2).sum() * self.reg

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
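
# The automated tests referred to in the comments compare grads[k] against
# numeric gradients. A small finite-difference check in the same spirit is
# sketched here; model, X and y are placeholders for any of the networks and
# data above, and the model is assumed to run in float64 so that h = 1e-6 is
# meaningful.
import torch

def rel_error(x, y, eps=1e-10):
    # Maximum relative error between two tensors.
    return ((x - y).abs() / (x.abs() + y.abs() + eps)).max().item()

def check_gradients(model, X, y, h=1e-6, num_checks=5):
    # Compare analytic gradients from model.loss against centered differences
    # at a few randomly chosen entries of each parameter tensor.
    _, grads = model.loss(X, y)
    for name, param in model.params.items():
        flat = param.view(-1)
        for _ in range(num_checks):
            ix = torch.randint(flat.numel(), (1,)).item()
            old = flat[ix].item()
            flat[ix] = old + h
            loss_plus, _ = model.loss(X, y)
            flat[ix] = old - h
            loss_minus, _ = model.loss(X, y)
            flat[ix] = old
            numeric = (loss_plus - loss_minus) / (2 * h)
            analytic = grads[name].view(-1)[ix]
            print(name, rel_error(torch.as_tensor(numeric), analytic))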
Example No. 5
  def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.
    Input / output: Same as TwoLayerNet above.
    """
    X = X.to(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
      self.dropout_param['mode'] = mode
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    ############################################################################
    # Replace "pass" statement with your code
    if self.use_dropout:
      a = [None] * (self.num_layers + 1)
      cache = [None] * (self.num_layers + 1)
      cache_dropout = [None] * (self.num_layers + 1)
      a[0] = X
      cache[0] = 0
      for i in range(1,self.num_layers):
        W_str = 'W{}'.format(str(i))
        b_str = 'b{}'.format(str(i))
        a[i], cache[i] = Linear_ReLU.forward(a[i-1], self.params[W_str], self.params[b_str])
        a[i], cache_dropout[i] = Dropout.forward(a[i], self.dropout_param)
      W_linear = 'W{}'.format(str(self.num_layers))
      b_linear = 'b{}'.format(str(self.num_layers))
      a[self.num_layers], cache[self.num_layers] = Linear.forward(a[self.num_layers-1], self.params[W_linear], self.params[b_linear])
      scores = a[self.num_layers]    
    else:
      a = [None] * (self.num_layers + 1)
      cache = [None] * (self.num_layers + 1)
      a[0] = X
      cache[0] = 0
      for i in range(1,self.num_layers):
        W_str = 'W{}'.format(str(i))
        b_str = 'b{}'.format(str(i))
        a[i], cache[i] = Linear_ReLU.forward(a[i-1], self.params[W_str], self.params[b_str])
      W_linear = 'W{}'.format(str(self.num_layers))
      b_linear = 'b{}'.format(str(self.num_layers))
      a[self.num_layers], cache[self.num_layers] = Linear.forward(a[self.num_layers-1], self.params[W_linear], self.params[b_linear])
      scores = a[self.num_layers]

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
      return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization does not include  #
    # a factor of 0.5.                                                         #
    ############################################################################
    # Replace "pass" statement with your code
    if self.use_dropout:
      loss, dout_softmax = softmax_loss(scores,y)
      dout = [None] * (self.num_layers + 1)
      W_linear = 'W{}'.format(str(self.num_layers))
      b_linear = 'b{}'.format(str(self.num_layers))
      dout[self.num_layers], grads[W_linear], grads[b_linear] = Linear.backward(dout_softmax,cache[self.num_layers])
      grads[W_linear] += 2 * self.reg * self.params[W_linear]
      loss += self.reg * torch.sum(self.params[W_linear] * self.params[W_linear])
      for i in range(self.num_layers - 1, 0, -1):
        W_str = 'W{}'.format(str(i))
        b_str = 'b{}'.format(str(i))
        dout[i+1] = Dropout.backward(dout[i+1],cache_dropout[i]) 
        dout[i], grads[W_str], grads[b_str] = Linear_ReLU.backward(dout[i+1],cache[i])
        grads[W_str] += 2 * self.reg * self.params[W_str]
        loss += self.reg * torch.sum(self.params[W_str] * self.params[W_str])
    else:
      loss, dout_softmax = softmax_loss(scores,y)
      dout = [None] * (self.num_layers + 1)
      W_linear = 'W{}'.format(str(self.num_layers))
      b_linear = 'b{}'.format(str(self.num_layers))
      dout[self.num_layers], grads[W_linear], grads[b_linear] = Linear.backward(dout_softmax,cache[self.num_layers])
      grads[W_linear] += 2 * self.reg * self.params[W_linear]
      loss += self.reg * torch.sum(self.params[W_linear] * self.params[W_linear])
      for i in range(self.num_layers - 1, 0, -1):
        W_str = 'W{}'.format(str(i))
        b_str = 'b{}'.format(str(i))
        dout[i], grads[W_str], grads[b_str] = Linear_ReLU.backward(dout[i+1],cache[i])
        grads[W_str] += 2 * self.reg * self.params[W_str]
        loss += self.reg * torch.sum(self.params[W_str] * self.params[W_str])

    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    return loss, grads
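
# The `2 * self.reg * W` terms used throughout come from differentiating the
# regularization loss reg * sum(W * W) with respect to W; autograd on a
# throwaway tensor confirms the expression:
import torch

reg = 0.1
W = torch.randn(4, 3, dtype=torch.float64, requires_grad=True)
reg_loss = reg * torch.sum(W * W)
reg_loss.backward()
print(torch.allclose(W.grad, 2 * reg * W.detach()))  # True

Example No. 6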
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.
        Input / output: Same as TwoLayerNet above.
        """
        X = X.to(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        ############################################################################
        # Replace "pass" statement with your code
        out = X
        forward_cache = []
        dropout_cache = []
        for i in range(self.num_layers - 1):
            name_W = 'W' + '{}'.format(i + 1)
            name_b = 'b' + '{}'.format(i + 1)
            out, cache = Linear_ReLU.forward(x=out,
                                             w=self.params[name_W],
                                             b=self.params[name_b])
            forward_cache.append(cache)

            if self.use_dropout:
                out, cache = Dropout.forward(out, self.dropout_param)
                dropout_cache.append(cache)

        name_W = 'W' + '{}'.format(self.num_layers)
        name_b = 'b' + '{}'.format(self.num_layers)
        out, cache = Linear.forward(out, self.params[name_W],
                                    self.params[name_b])
        forward_cache.append(cache)

        scores = out
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If test mode return early
        if mode == 'test':
            return scores

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization does not include  #
        # a factor of 0.5.                                                          #
        ############################################################################
        # Replace "pass" statement with your code
        loss, dout = softmax_loss(scores, y)
        for i in range(self.num_layers):
            name_W = 'W' + '{}'.format(i + 1)
            loss += self.reg * torch.sum(self.params[name_W]**2)

        for i in range(self.num_layers, 0, -1):
            name_W = 'W' + '{}'.format(i)
            name_b = 'b' + '{}'.format(i)
            if i == self.num_layers:
                grads_x, grads[name_W], grads[name_b] = Linear.backward(
                    dout, forward_cache.pop())
                grads[name_W] += 2 * self.reg * self.params[name_W]
            else:
                if self.use_dropout:
                    grads_x = Dropout.backward(grads_x, dropout_cache.pop())
                grads_x, grads[name_W], grads[name_b] = Linear_ReLU.backward(
                    grads_x, forward_cache.pop())
                grads[name_W] += 2 * self.reg * self.params[name_W]

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
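
# Any of the networks above can be trained with plain SGD driven through this
# loss() interface. A sketch of one training step and of test-time prediction;
# net, X_batch, y_batch and the learning rate are placeholders.
import torch

def sgd_step(net, X_batch, y_batch, learning_rate=1e-3):
    # One step: forward/backward through loss(), then a vanilla SGD update.
    loss, grads = net.loss(X_batch, y_batch)
    with torch.no_grad():
        for name in net.params:
            net.params[name] -= learning_rate * grads[name]
    return loss

# At test time, calling loss() without labels returns raw class scores:
#   scores = net.loss(X_test)
#   y_pred = scores.argmax(dim=1)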