Example #1
0
def affine_relu_backward( dout, cache ):
	"""
	Backpropagate through the combined affine -> ReLU convenience layer.

	`cache` is unpacked as the pair (fc_cache, relu_cache). The upstream
	gradient `dout` is first passed through layers.relu_backward, and that
	result is then passed through layers.affine_backward.

	Returns the (dx, dw, db) triple produced by layers.affine_backward.
	"""
	affine_cache, activation_cache = cache
	# Undo the layers in reverse order: ReLU first, then the affine map.
	d_pre_activation = layers.relu_backward( dout, activation_cache )
	dx, dw, db = layers.affine_backward( d_pre_activation, affine_cache )
	return dx, dw, db
Example #2
0
def affine_relu_backward(dout, cache):
    """
    Gradient computation for the fused affine + ReLU convenience layer.

    cache is the pair (fc_cache, relu_cache). The upstream gradient dout goes
    through relu_backward and then through affine_backward; the resulting
    (dx, dw, db) triple is returned.
    """
    affine_cache, activation_cache = cache
    # ReLU is the outer layer, so its gradient is taken first.
    dx, dw, db = affine_backward(
        relu_backward(dout, activation_cache), affine_cache)
    return dx, dw, db
Example #3
0
def affine_batchnorm_relu_backward(dout, cache):
    """
    Backpropagate through the Affine -> BatchNorm -> ReLU convenience layer.

    cache is unpacked as (fc_cache, bn_cache, relu_cache). The gradient is
    pushed through relu_backward, batchnorm_backward and affine_backward, in
    that order.

    Returns (dx, dw, db, dgamma, dbeta).
    """
    affine_cache, norm_cache, activation_cache = cache

    # Walk the three sub-layers in reverse order of the forward pass.
    grad = relu_backward(dout, activation_cache)
    grad, dgamma, dbeta = batchnorm_backward(grad, norm_cache)
    dx, dw, db = affine_backward(grad, affine_cache)

    return dx, dw, db, dgamma, dbeta
Example #4
0
def affine_bn_relu_backward(dout, cache):
    """
    Gradient computation for the affine -> batchnorm -> relu convenience layer.

    cache is unpacked as (fc_cache, bn_cache, relu_cache). The batchnorm step
    uses layers.batchnorm_backward_alt.

    Returns (dx, dw, db, dgamma, dbeta).
    """
    affine_cache, norm_cache, activation_cache = cache

    d_relu_in = layers.relu_backward(dout, activation_cache)
    d_norm_in, dgamma, dbeta = layers.batchnorm_backward_alt(
        d_relu_in, norm_cache)
    dx, dw, db = layers.affine_backward(d_norm_in, affine_cache)

    return dx, dw, db, dgamma, dbeta
Example #5
0
def combo_backward(dout, cache):
    """
    Backward pass for the affine -> (optional batchnorm) -> relu layer.

    cache is unpacked as (fc_cache, bn_cache, relu_cache). When bn_cache is
    None the batchnorm step is skipped and dgamma / dbeta are returned as 0.

    Returns (dx, dw, db, dgamma, dbeta).
    """
    fc_cache, bn_cache, relu_cache = cache

    grad = relu_backward(dout, relu_cache)
    if bn_cache is None:
        # No batchnorm in this layer: report zero gradients for its params.
        dgamma, dbeta = 0, 0
    else:
        grad, dgamma, dbeta = batchnorm_backward(grad, bn_cache)
    dx, dw, db = affine_backward(grad, fc_cache)
    return dx, dw, db, dgamma, dbeta
Example #6
0
def rnn_step_full_backward(dcurrent_h, cache):
    """
    Backward pass for one RNN step that also feeds an affine score layer.

    cache is unpacked as
    (x, prev_h, Wx, Wh, bh, Ws, bs, current_h, cache_affine, dscore).
    The gradient coming back through the affine layer (from dscore) is added
    to dcurrent_h before it is propagated to the step's inputs and weights.

    Returns (dx, dprev_h, dWx, dWh, dbh, dWs, dbs).
    """
    (x, prev_h, Wx, Wh, _bh, _Ws, _bs,
     current_h, cache_affine, dscore) = cache

    # Contribution of the score layer to the hidden-state gradient.
    dh_from_score, dWs, dbs = affine_backward(dscore, cache_affine)
    dh_total = dcurrent_h + dh_from_score

    # (1 - h^2) factor -- presumably current_h is a tanh output; confirm
    # against the forward pass.
    dstate = dh_total * (1 - np.square(current_h))

    # Gradients of the two linear maps and the bias.
    dWx = x.T.dot(dstate)
    dWh = prev_h.T.dot(dstate)
    dbh = np.sum(dstate, axis=0)
    dx = dstate.dot(Wx.T)
    dprev_h = dstate.dot(Wh.T)

    return dx, dprev_h, dWx, dWh, dbh, dWs, dbs
Example #7
0
def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Run backpropagation through one LSTM timestep.

    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - dnext_c: Gradients of next cell state, of shape (N, H)
    - cache: Values from the forward pass; unpacked here as nine entries
      (o, tanh_next_c, prev_c, cache_gates, i, f, o, g, D)

    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dprev_c: Gradient of previous cell state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    # Unpacking the shape doubles as a check that dnext_h is 2-D.
    N, H = dnext_h.shape

    # The output gate sits in the cache twice; only the later entry ends up
    # bound to `o`, exactly as with a plain left-to-right unpack.
    _, tanh_c, c_prev, gates_cache, i, f, o, g, D = cache

    # Total gradient on the cell state: the path through the hidden state
    # (tanh derivative expressed via its output) plus the direct dnext_c.
    dc_total = (dnext_h * o) * (1 - tanh_c * tanh_c) + dnext_c
    dprev_c = dc_total * f

    # Pre-activation gradients in the order i, f, o, g. Sigmoid and tanh
    # derivatives are written in terms of the gate outputs.
    d_gate_inputs = [
        (dc_total * g) * (i * (1 - i)),
        (dc_total * c_prev) * (f * (1 - f)),
        (dnext_h * tanh_c) * (o * (1 - o)),
        (dc_total * i) * (1 - g * g),
    ]
    d_gates = np.concatenate(d_gate_inputs, axis=1)

    # The affine layer's input and weights are split at index D into the
    # x part and the prev_h part.
    d_stacked, dW, db = affine_backward(d_gates, gates_cache)
    dx, dprev_h = d_stacked[:, :D], d_stacked[:, D:]
    dWx, dWh = dW[:D, :], dW[D:, :]

    return dx, dprev_h, dprev_c, dWx, dWh, db
Example #8
0
def test_affine_backward():
	"""
	Check affine_backward against numerically estimated gradients.

	Random inputs are pushed through affine_forward; the analytic gradients
	from affine_backward must have the same shapes as the corresponding
	inputs and agree with the numerical gradients to within 5e-7 relative
	error.
	"""
	x = np.random.randn(10, 2, 3)
	w = np.random.randn(6, 5)
	b = np.random.randn(5)
	dout = np.random.randn(10, 5)

	dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
	dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
	db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

	_, cache = affine_forward(x, w, b)
	dx, dw, db = affine_backward(dout, cache)

	# Each gradient must be shaped like the value it differentiates with
	# respect to (comparing a shape with itself could never fail).
	assert dx.shape == x.shape
	assert dw.shape == w.shape
	assert db.shape == b.shape

	assert rel_error(dx_num, dx) < 5e-7
	assert rel_error(dw_num, dw) < 5e-7
	assert rel_error(db_num, db) < 5e-7
Example #9
0
def test_affine_backward():
    """
    Check affine_backward against numerically estimated gradients.

    Random inputs are pushed through affine_forward; the analytic gradients
    from affine_backward must have the same shapes as the corresponding
    inputs and agree with the numerical gradients to within 5e-7 relative
    error.
    """
    x = np.random.randn(10, 2, 3)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x, dout)
    dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w, dout)
    db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b, dout)

    _, cache = affine_forward(x, w, b)
    dx, dw, db = affine_backward(dout, cache)

    # Each gradient must be shaped like the value it differentiates with
    # respect to (comparing a shape with itself could never fail).
    assert dx.shape == x.shape
    assert dw.shape == w.shape
    assert db.shape == b.shape

    assert rel_error(dx_num, dx) < 5e-7
    assert rel_error(dw_num, dw) < 5e-7
    assert rel_error(db_num, db) < 5e-7
Example #10
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # Forward pass: affine -> relu -> affine.
        hidden1_out, h1_cache = affine_forward(X, W1, b1)
        relu_out, relu_cache = relu_forward(hidden1_out)
        scores, h2_cache = affine_forward(relu_out, W2, b2)

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        grads = {}

        # Softmax data loss plus L2 regularization; the 0.5 factor makes the
        # regularization gradient simply reg * W.
        loss, dscore = softmax_loss(scores, y)
        loss = loss + self.reg * 0.5 * (np.sum(W2 * W2) + np.sum(W1 * W1))

        # Backward pass through the same layers in reverse order.
        grads_h2, grads_w2, grads_b2 = affine_backward(dscore, h2_cache)
        grads_relu = relu_backward(grads_h2, relu_cache)
        grads_h1, grads_w1, grads_b1 = affine_backward(grads_relu, h1_cache)

        # Only the weight matrices are regularized, not the biases.
        grads['W1'] = grads_w1 + self.reg * W1
        grads['W2'] = grads_w2 + self.reg * W2
        grads['b1'] = grads_b1
        grads['b2'] = grads_b2

        return loss, grads
Example #11
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass
        and return a tuple of:
        - loss: Scalar value giving the loss (softmax data loss plus L2
          regularization on every weight matrix)
        - grads: Dictionary mapping parameter names to gradients of the loss
          with respect to those parameters.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.dropout_param is not None:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                # The key is the literal string 'mode' (as for dropout_param
                # above); indexing with the variable would instead create a
                # 'train' / 'test' entry and leave 'mode' untouched.
                bn_param['mode'] = mode

        # ------------------------------------------------------------------
        # Forward pass: (num_layers - 1) hidden blocks, then a final affine.
        # ------------------------------------------------------------------
        IN = X

        caches = {}
        dropout_caches = {}

        for l in range(self.num_layers - 1):
            W = self.params["W{}".format(l + 1)]
            b = self.params["b{}".format(l + 1)]

            if self.use_batchnorm:
                gamma = self.params["gamma{}".format(l + 1)]
                beta = self.params["beta{}".format(l + 1)]
                IN, cache = affine_batchnorm_relu_forward(
                    IN, W, b, gamma, beta, self.bn_params[l])
            else:
                IN, cache = affine_relu_forward(IN, W, b)

            caches[l] = cache

            if self.use_dropout:
                IN, d_cache = dropout_forward(IN, self.dropout_param)
                dropout_caches[l] = d_cache

        # forward pass: last affine layer
        num_last = self.num_layers
        name_W_last = "W{}".format(num_last)
        name_b_last = "b{}".format(num_last)
        W_last = self.params[name_W_last]
        b_last = self.params[name_b_last]

        scores, cache_last = affine_forward(IN, W_last, b_last)

        # If test mode return early
        if mode == 'test':
            return scores

        grads = {}

        # ------------------------------------------------------------------
        # Loss: softmax data loss plus 0.5 * reg * ||W||^2 for every layer.
        # ------------------------------------------------------------------
        loss, dscores = softmax_loss(scores, y)

        for l in range(self.num_layers):
            W = self.params["W{}".format(l + 1)]
            loss += 0.5 * self.reg * np.sum(W * W)

        # ------------------------------------------------------------------
        # Backward pass, last layer first.
        # ------------------------------------------------------------------
        dx, dw, db = affine_backward(dscores, cache_last)
        grads[name_W_last] = dw + self.reg * W_last
        grads[name_b_last] = db

        # backprop through the hidden blocks in reverse order
        for l in reversed(range(self.num_layers - 1)):
            name_W = "W{}".format(l + 1)
            name_b = "b{}".format(l + 1)

            # Dropout was applied after the block in the forward pass, so it
            # is undone before the block here.
            if self.use_dropout:
                dx = dropout_backward(dx, dropout_caches[l])

            if self.use_batchnorm:
                dx, dw, db, dgamma, dbeta = affine_batchnorm_relu_backward(
                    dx, caches[l])
                # Scale and shift parameters are not regularized.
                grads["gamma{}".format(l + 1)] = dgamma
                grads["beta{}".format(l + 1)] = dbeta
            else:
                dx, dw, db = affine_relu_backward(dx, caches[l])
            grads[name_W] = dw + self.reg * self.params[name_W]
            grads[name_b] = db

        return loss, grads
Example #12
0
def two_layer_net(X, model, y=None, reg=0.0):
  """
  Compute the loss and gradients for a two layer fully connected neural network.
  The net has an input dimension of D, a hidden layer dimension of H, and
  performs classification over C classes. We use a softmax loss function and L2
  regularization on the weight matrices. The two layer net uses a ReLU
  nonlinearity after the first affine layer.

  The two layer net has the following architecture:

  input - fully connected layer - ReLU - fully connected layer - softmax

  The outputs of the second fully-connected layer are the scores for each
  class.

  Inputs:
  - X: Input data of shape (N, D). Each X[i] is a training sample.
  - model: Dictionary mapping parameter names to arrays of parameter values.
    It should contain the following:
    - W1: First layer weights; has shape (D, H)
    - b1: First layer biases; has shape (H,)
    - W2: Second layer weights; has shape (H, C)
    - b2: Second layer biases; has shape (C,)
  - y: Vector of training labels. y[i] is the label for X[i], and each y[i] is
    an integer in the range 0 <= y[i] < C. This parameter is optional; if it
    is not passed then we only return scores, and if it is passed then we
    instead return the loss and gradients.
  - reg: Regularization strength.

  Returns:
  If y is not passed, return a matrix scores of shape (N, C) where scores[i, c]
  is the score for class c on input X[i].

  If y is passed, instead return a tuple of:
  - loss: Loss (data loss and regularization loss) for this batch of training
    samples.
  - grads: Dictionary mapping parameter names to gradients of those parameters
    with respect to the loss function. This has the keys W1, b1, W2 and b2.
  """
  from cs231n.layers import affine_forward, relu_forward, softmax_loss
  from cs231n.layers import affine_backward, relu_backward

  # unpack variables from the model dictionary
  W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
  # Unpacking the shape also rejects any X that is not 2-D.
  N, D = X.shape

  # Forward pass. Each layer is a function of the previous layer's output
  # (with X acting as layer 0) and that layer's parameters.
  layer1, cache1 = affine_forward(X, W1, b1)
  layer2, cache2 = relu_forward(layer1)
  layer3, cache3 = affine_forward(layer2, W2, b2)

  scores = layer3

  # If the targets are not given then jump out, we're done
  if y is None:
    return scores

  # Softmax data loss plus L2 regularization on W1 and W2. The 0.5 factor
  # makes the regularization gradient simply reg * W.
  loss, dscores = softmax_loss(scores, y)
  loss += 0.5 * reg * np.sum(W1 * W1) + 0.5 * reg * np.sum(W2 * W2)

  # Backward pass through the same layers in reverse order. The gradient with
  # respect to the input X is not needed and is discarded.
  grads = {}
  dlayer2, grads['W2'], grads['b2'] = affine_backward(dscores, cache3)
  dlayer1 = relu_backward(dlayer2, cache2)
  _, grads['W1'], grads['b1'] = affine_backward(dlayer1, cache1)

  # gradients need to have the regularization term
  grads['W2'] += reg * W2
  grads['W1'] += reg * W1

  return loss, grads
Example #13
0
    def loss(self, X, y=None):
        """
        Evaluate the two-layer net on a minibatch.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        With y left as None, only the forward pass runs and the result is:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        With y given, the forward and backward passes both run and the result
        is a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to those
          parameters.
        """
        W1, b1 = self.params["W1"], self.params["b1"]
        W2, b2 = self.params["W2"], self.params["b2"]

        # Forward pass: affine -> relu -> affine. The raw scores are enough
        # for prediction, so no loss is computed on this path.
        hidden_pre, cache_affine1 = layers.affine_forward(X, W1, b1)
        hidden_act, cache_relu1 = layers.relu_forward(hidden_pre)
        scores, cache_affine2 = layers.affine_forward(hidden_act, W2, b2)

        # Test mode: nothing to backpropagate.
        if y is None:
            return scores

        grads = {}

        # Softmax data loss plus L2 regularization with the 0.5 factor.
        loss, dscores = layers.softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))

        # Second affine layer; only weights receive a regularization term.
        d_hidden_act, dW2, db2 = layers.affine_backward(dscores, cache_affine2)
        grads["W2"] = dW2 + self.reg * W2
        grads["b2"] = db2

        # ReLU, then the first affine layer.
        d_hidden_pre = layers.relu_backward(d_hidden_act, cache_relu1)
        _, dW1, db1 = layers.affine_backward(d_hidden_pre, cache_affine1)
        grads["W1"] = dW1 + self.reg * W1
        grads["b1"] = db1

        return loss, grads
Example #14
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.

        When y is None the method runs in 'test' mode and returns only the
        scores from the final affine layer. Otherwise it runs in 'train' mode
        and returns (loss, grads), where loss is the softmax loss plus L2
        regularization over every parameter whose name starts with "W", and
        grads maps parameter names to gradients.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        # ------------------------------------------------------------------
        # Forward pass. Caches are grouped by layer kind; within each list
        # the index is the hidden-layer index.
        # ------------------------------------------------------------------
        caches = collections.defaultdict(list)
        out_layer = X

        for i in range(self.num_layers - 1):
            n = str(i + 1)

            # (zy) The learned parameters are for BN affine transformation used
            # in training, while the running average is used for prediction.
            if self.use_batchnorm:
                out_layer, cache = affine_bn_relu_forward(
                    out_layer, self.params["W" + n], self.params["b" + n],
                    self.params["gamma" + n], self.params["beta" + n],
                    self.bn_params[i])
                caches["affine_bn_relu"].append(cache)
            else:
                out_layer, cache = layers.affine_forward(
                    out_layer, self.params["W" + n], self.params["b" + n])
                caches["affine"].append(cache)

                out_layer, cache = layers.relu_forward(out_layer)
                caches["relu"].append(cache)

            if self.use_dropout:
                out_layer, cache = layers.dropout_forward(
                    out_layer, self.dropout_param)
                caches["drop"].append(cache)

        # Final affine layer producing the class scores.
        nn = str(self.num_layers)
        scores, cache = layers.affine_forward(out_layer, self.params["W" + nn],
                                              self.params["b" + nn])

        # If test mode return early
        if mode == 'test':
            return scores

        grads = {}

        # ------------------------------------------------------------------
        # Loss: softmax data loss plus 0.5 * reg * ||W||^2 per weight matrix.
        # ------------------------------------------------------------------
        loss, dloss = layers.softmax_loss(scores, y)
        if self.reg != 0:
            for k, v in self.params.items():
                # only include the w parameters, excluding gamma, beta and b
                if k.startswith("W"):
                    loss += 0.5 * self.reg * np.sum(v**2)

        # ------------------------------------------------------------------
        # Backward pass, last layer first. The regularization gradient
        # reg * W is always taken from self.params rather than from the
        # affine cache, so it does not depend on the cache's internal layout.
        # ------------------------------------------------------------------
        out = layers.affine_backward(dloss, cache)
        dout, grads["W" + nn], grads["b" + nn] = out
        grads["W" + nn] += self.reg * self.params["W" + nn]

        for i in range(self.num_layers - 2, -1, -1):
            n = str(i + 1)

            # Dropout came last in the forward block, so it is undone first.
            if self.use_dropout:
                dout = layers.dropout_backward(dout, caches["drop"][i])

            if self.use_batchnorm:
                out = affine_bn_relu_backward(dout,
                                              caches["affine_bn_relu"][i])
                dout, grads["W" + n], grads["b" + n], \
                    grads["gamma" + n], grads["beta" + n] = out
            else:
                dout = layers.relu_backward(dout, caches["relu"][i])

                out = layers.affine_backward(dout, caches["affine"][i])
                dout, grads["W" + n], grads["b" + n] = out

            # need to include regularization (weights only)
            grads["W" + n] += self.reg * self.params["W" + n]

        return loss, grads
Example #15
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.

        The network evaluated here is
        {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine,
        followed by a softmax loss. The final affine layer produces the raw
        class scores, so no normalization, ReLU or dropout is applied to it.

        When y is None only the forward pass runs and the scores are
        returned; otherwise a (loss, grads) tuple is returned where grads has
        one entry per key of self.params that takes part in the pass.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.normalization == 'batchnorm':
            for bn_param in self.bn_params:
                bn_param['mode'] = mode
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        #                                                                          #
        # When using batch normalization, you'll need to pass self.bn_params[0] to #
        # the forward pass for the first batch normalization layer, pass           #
        # self.bn_params[1] to the forward pass for the second batch normalization #
        # layer, etc.                                                              #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # String comparison must use ==; identity (`is`) on a str literal only
        # works by accident of interning.
        use_batchnorm = self.normalization == 'batchnorm'

        arg, caches = X, []

        for i in range(1, self.num_layers + 1):
            cache = {}

            W = self.params[f"W{i}"]
            b = self.params[f"b{i}"]
            arg, cache['fc_cache'] = affine_forward(arg, W, b)

            # Only hidden layers get norm / ReLU / dropout. The last affine
            # output is the score matrix and must reach softmax untouched.
            if i != self.num_layers:
                if self.normalization:
                    gamma = self.params[f"gamma{i}"]
                    beta = self.params[f"beta{i}"]

                    normalize_forward = (batchnorm_forward if use_batchnorm
                                         else layernorm_forward)
                    arg, cache['bn_cache'] = normalize_forward(
                        arg, gamma, beta, self.bn_params[i - 1])

                arg, cache['relu_cache'] = relu_forward(arg)

                if self.use_dropout:
                    arg, cache['dropout_cache'] = dropout_forward(
                        arg, self.dropout_param)

            caches.append(cache)

        scores = arg

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If test mode return early
        if mode == 'test':
            return scores

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # When using batch/layer normalization, you don't need to regularize the scale   #
        # and shift parameters.                                                    #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        loss, dout = softmax_loss(scores, y)

        # Walk the layers in reverse, undoing each forward step in the
        # opposite order it was applied.
        for i in range(self.num_layers, 0, -1):
            W = self.params[f"W{i}"]
            cache = caches[i - 1]

            if i != self.num_layers:
                if self.use_dropout:
                    dout = dropout_backward(dout, cache['dropout_cache'])

                dout = relu_backward(dout, cache['relu_cache'])

                if self.normalization:
                    # Use the backward function matching the forward one, so
                    # layernorm caches are not fed to batchnorm_backward.
                    normalize_backward = (batchnorm_backward if use_batchnorm
                                          else layernorm_backward)
                    dout, dgamma, dbeta = normalize_backward(
                        dout, cache['bn_cache'])
                    grads[f"gamma{i}"] = dgamma
                    grads[f"beta{i}"] = dbeta

            dout, dw, db = affine_backward(dout, cache['fc_cache'])

            # L2 regularization applies to the weights only (not biases,
            # gamma or beta); the 0.5 factor makes the gradient reg * W.
            grads[f"W{i}"] = dw + self.reg * W
            grads[f"b{i}"] = db

            loss += 0.5 * self.reg * np.sum(W * W)

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
# Test the affine_backward function
# Gradient check: compare the gradients returned by affine_backward against
# the values produced by eval_numerical_gradient_array for the same inputs.
# The seed is fixed so every run draws the same random inputs; the order of
# the randn calls below therefore matters.
np.random.seed(231)
x = np.random.randn(10, 2, 3)  # 10 inputs, each of shape (2, 3)
w = np.random.randn(6, 5)  # 6 rows = 2 * 3; presumably x is flattened per sample - confirm in affine_forward
b = np.random.randn(5)
dout = np.random.randn(10, 5)  # upstream gradient, one row per input

# Each lambda varies exactly one of x / w / b and closes over the other two
# from the enclosing scope; [0] keeps only the forward output and drops the
# cache.
dx_num = eval_numerical_gradient_array(lambda x: affine_forward(x, w, b)[0], x,
                                       dout)
dw_num = eval_numerical_gradient_array(lambda w: affine_forward(x, w, b)[0], w,
                                       dout)
db_num = eval_numerical_gradient_array(lambda b: affine_forward(x, w, b)[0], b,
                                       dout)

# Analytic gradients: run the forward pass once to obtain the cache, then
# feed the same upstream gradient through affine_backward.
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)

# The error should be around 1e-10
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))

# Test the relu_forward function
x = np.linspace(-0.5, 0.5, num=12).reshape(3, 4)

out, _ = relu_forward(x)
correct_out = np.array([[
    0.,
    0.,
    0.,
Example #17
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.

        The first num_layers - 1 layers go through combo_forward (with
        batchnorm parameters when self.normalization == 'batchnorm', and
        with the placeholders gamma=1., beta=0, bn_param=None otherwise);
        the last layer is a plain affine layer producing the scores. With
        y=None only the scores are returned.
        """
        X = X.astype(self.dtype)
        mode = 'train' if y is not None else 'test'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.normalization == 'batchnorm':
            for bn_param in self.bn_params:
                bn_param['mode'] = mode
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        #                                                                          #
        # When using batch normalization, you'll need to pass self.bn_params[0] to #
        # the forward pass for the first batch normalization layer, pass           #
        # self.bn_params[1] to the forward pass for the second batch normalization #
        # layer, etc.                                                              #
        ############################################################################

        last = self.num_layers
        hidden_caches = []

        # Flatten every sample into a row vector.
        num_samples = X.shape[0]
        flat_dim = np.prod(X.shape[1:])
        hidden = X.reshape(num_samples, flat_dim)

        # Hidden combo layers 1 .. L-1.
        for idx in range(1, last):
            weight = self.params[f'W{idx}']
            bias = self.params[f'b{idx}']

            if self.normalization == 'batchnorm':
                scale = self.params[f'gamma{idx}']
                shift = self.params[f'beta{idx}']
                norm_param = self.bn_params[idx - 1]  # bn_params is zero based
            else:
                # Placeholders handed to combo_forward when batchnorm is off.
                scale, shift, norm_param = 1., 0, None

            hidden, layer_cache = combo_forward(hidden, weight, bias, scale,
                                                shift, norm_param)
            hidden_caches.append(layer_cache)

        # Output layer: affine only.
        scores, fc_cache = affine_forward(hidden, self.params[f'W{last}'],
                                          self.params[f'b{last}'])

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If test mode return early
        if mode == 'test':
            return scores

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # When using batch/layer normalization, you don't need to regularize the scale   #
        # and shift parameters.                                                    #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################

        loss, upstream = softmax_loss(scores, y)

        # Output layer first: gradient plus its share of the L2 penalty.
        upstream, dw, db = affine_backward(upstream, fc_cache)
        last_weight = self.params[f'W{last}']
        grads[f'W{last}'] = dw + self.reg * last_weight
        grads[f'b{last}'] = db
        loss += 0.5 * self.reg * np.sum(last_weight**2)

        # Then the hidden combo layers, from L-1 down to 1.
        for idx in reversed(range(1, last)):
            upstream, dw, db, dgamma, dbeta = combo_backward(
                upstream, hidden_caches[idx - 1])

            weight = self.params[f'W{idx}']
            grads[f'W{idx}'] = dw + self.reg * weight
            grads[f'b{idx}'] = db

            # Scale/shift gradients are only recorded when batchnorm is on.
            if self.normalization == 'batchnorm':
                grads[f'gamma{idx}'] = dgamma
                grads[f'beta{idx}'] = dbeta

            # L2 penalty contribution of this layer's weights.
            loss += 0.5 * self.reg * np.sum(weight**2)

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
Example #18
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        scores = None
        #######################################################################
        # TODO: Implement the forward pass for the two-layer net, computing the
        # class scores for X and storing them in the scores variable.
        #######################################################################
        W1 = self.params["W1"]
        b1 = self.params["b1"]
        W2 = self.params["W2"]
        b2 = self.params["b2"]

        # affine - relu - affine; the scores come straight from the second
        # affine layer, so no placeholder array needs to be allocated first.
        X_hidden, cache1 = affine_relu_forward(X, W1, b1)
        scores, cache2 = affine_forward(X_hidden, W2, b2)

        #######################################################################
        #                             END OF YOUR CODE                        #
        #######################################################################

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        #######################################################################
        # TODO: Implement the backward pass for the two-layer net. Store the
        # loss in the loss variable and gradients in the grads dictionary.
        # Compute data loss using softmax, and make sure that grads[k]
        # holds the gradients for self.params[k]. Don't forget to add L2
        # regularization!
        #
        # NOTE: To ensure that your implementation matches ours and you pass
        # the automated tests, make sure that your L2 regularization includes a
        # factor of 0.5 to simplify the expression for the gradient.
        #######################################################################

        # Data loss plus L2 penalty on the weights only (biases are excluded).
        loss, dscores = softmax_loss(scores, y)
        loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))

        # Second affine layer; the 0.5 factor above makes the penalty
        # gradient exactly reg * W.
        dx_hidden, dw2, db2 = affine_backward(dscores, cache2)
        grads["W2"] = dw2 + self.reg * W2
        grads["b2"] = db2

        # First affine-relu layer; the gradient w.r.t. X itself is not needed.
        _, dw1, db1 = affine_relu_backward(dx_hidden, cache1)
        grads["W1"] = dw1 + self.reg * W1
        grads["b1"] = db1
        #######################################################################
        #                             END OF YOUR CODE                        #
        #######################################################################

        return loss, grads
Example #19
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the two-layer net, computing the    #
        # class scores for X and storing them in the scores variable.              #
        ############################################################################

        W1 = self.params['W1']
        b1 = self.params['b1']
        W2 = self.params['W2']
        b2 = self.params['b2']

        # Flatten each sample to a row before the first affine layer.
        num_samples = X.shape[0]
        flat_dim = np.prod(X.shape[1:])
        flat_X = X.reshape(num_samples, flat_dim)

        # affine -> relu -> affine
        pre_act, cache_fc1 = affine_forward(flat_X, W1, b1)
        hidden, cache_relu = relu_forward(pre_act)
        scores, cache_fc2 = affine_forward(hidden, W2, b2)

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        ############################################################################
        # TODO: Implement the backward pass for the two-layer net. Store the loss  #
        # in the loss variable and gradients in the grads dictionary. Compute data #
        # loss using softmax, and make sure that grads[k] holds the gradients for  #
        # self.params[k]. Don't forget to add L2 regularization!                   #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################

        # Backpropagate through the three layers in reverse order.
        loss, grad_scores = softmax_loss(scores, y)
        grad_hidden, dW2, db2 = affine_backward(grad_scores, cache_fc2)
        grad_pre_act = relu_backward(grad_hidden, cache_relu)
        _, dW1, db1 = affine_backward(grad_pre_act, cache_fc1)

        # L2 penalty on both weight matrices, with the matching reg * W term
        # accumulated in place on each weight gradient.
        loss += 0.5 * self.reg * (np.sum(W1 * W1) + np.sum(W2 * W2))
        dW2 += self.reg * W2
        dW1 += self.reg * W1

        grads = {}
        grads['W1'] = dW1
        grads['b1'] = db1
        grads['W2'] = dW2
        grads['b2'] = db2

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
Example #20
0
    def loss(self, X, y=None):
        """
        Compute loss and gradient for the fully-connected net.

        Input / output: Same as TwoLayerNet above.

        The network evaluated here is
        {affine - [batchnorm] - relu - [dropout]} x (L - 1) - affine,
        followed by a softmax loss. When y is None only the forward pass runs
        (in 'test' mode) and the scores are returned; otherwise a
        (loss, grads) tuple is returned.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing. The dropout switch
        # must be active, otherwise a test-time pass keeps using whatever mode
        # dropout_param was left in.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        scores = None
        ############################################################################
        # TODO: Implement the forward pass for the fully-connected net, computing  #
        # the class scores for X and storing them in the scores variable.          #
        #                                                                          #
        # When using dropout, you'll need to pass self.dropout_param to each       #
        # dropout forward pass.                                                    #
        #                                                                          #
        # When using batch normalization, you'll need to pass self.bn_params[0] to #
        # the forward pass for the first batch normalization layer, pass           #
        # self.bn_params[1] to the forward pass for the second batch normalization #
        # layer, etc.                                                              #
        ############################################################################
        # One cache list per layer type. affine_caches ends up with num_layers
        # entries; the others have num_layers - 1 (hidden layers only) and
        # stay empty when the corresponding feature is disabled.
        affine_caches = []
        batchnorm_caches = []
        relu_caches = []
        dropout_caches = []

        hidden = X
        for i in range(self.num_layers - 1):
            name = str(i + 1)
            hidden, affine_cache = affine_forward(
                hidden, self.params['W' + name], self.params['b' + name])
            affine_caches.append(affine_cache)
            if self.use_batchnorm:
                hidden, batchnorm_cache = batchnorm_forward(
                    hidden, self.params['gamma' + name],
                    self.params['beta' + name], self.bn_params[i])
                batchnorm_caches.append(batchnorm_cache)
            hidden, relu_cache = relu_forward(hidden)
            relu_caches.append(relu_cache)
            if self.use_dropout:
                hidden, dropout_cache = dropout_forward(
                    hidden, self.dropout_param)
                dropout_caches.append(dropout_cache)

        # Final affine layer. It is fed from `hidden`, which is X itself when
        # there are no hidden layers, so a single-layer net also works.
        last = str(self.num_layers)
        scores, affine_cache = affine_forward(
            hidden, self.params['W' + last], self.params['b' + last])
        affine_caches.append(affine_cache)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################
        if mode == 'test':
            return scores
        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the backward pass for the fully-connected net. Store the #
        # loss in the loss variable and gradients in the grads dictionary. Compute #
        # data loss using softmax, and make sure that grads[k] holds the gradients #
        # for self.params[k]. Don't forget to add L2 regularization!               #
        #                                                                          #
        # When using batch normalization, you don't need to regularize the scale   #
        # and shift parameters.                                                    #
        #                                                                          #
        # NOTE: To ensure that your implementation matches ours and you pass the   #
        # automated tests, make sure that your L2 regularization includes a factor #
        # of 0.5 to simplify the expression for the gradient.                      #
        ############################################################################
        loss, dscores = softmax_loss(scores, y)

        # L2 penalty over every weight matrix (biases, gamma, beta excluded).
        weight_decay_sum = 0
        for i in range(self.num_layers):
            W = self.params['W' + str(i + 1)]
            weight_decay_sum = weight_decay_sum + np.sum(W * W)
        loss = loss + 0.5 * self.reg * weight_decay_sum

        # Walk the layers from the output back to the input. `layer` is the
        # 1-based index of the affine layer being differentiated.
        dout = dscores
        for layer in range(self.num_layers, 0, -1):
            name = str(layer)
            dout, dw, db = affine_backward(dout, affine_caches[layer - 1])
            grads['W' + name] = dw + self.reg * self.params['W' + name]
            grads['b' + name] = db

            if layer == 1:
                # The first layer has no hidden block in front of it.
                break

            # Undo the hidden block that fed this affine layer, in reverse
            # order of the forward pass: dropout, relu, then batchnorm.
            # That block belongs to layer - 1, stored at index layer - 2.
            if self.use_dropout:
                dout = dropout_backward(dout, dropout_caches[layer - 2])
            dout = relu_backward(dout, relu_caches[layer - 2])
            if self.use_batchnorm:
                dout, dgamma, dbeta = batchnorm_backward(
                    dout, batchnorm_caches[layer - 2])
                grads['gamma' + str(layer - 1)] = dgamma
                grads['beta' + str(layer - 1)] = dbeta

        return loss, grads