Python softmax_loss Examples, layers.softmax_loss Python Examples

Example #1

0

Show file

File: cnn.py Project: yilunh98/StanfordCS231n

  def loss(self, X, y=None):
    """
    Forward (and optionally backward) pass of the three-layer conv net.

    The layers applied, in order, are: conv, relu, max pool, fc, relu, fc.

    Inputs:
    - X: Minibatch of input data, handed to layers.conv_forward.
    - y: Labels for X, or None to run the forward pass only.

    Returns:
    - scores (output of the last fc layer) when y is None;
    - otherwise a tuple (loss, grads) where loss comes from
      layers.softmax_loss and grads holds gradients under the keys
      'W1', 'W2', 'b2', 'W3' and 'b3'.
    """
    params = self.params
    conv_w = params['W1']
    fc1_w, fc1_b = params['W2'], params['b2']
    fc2_w, fc2_b = params['W3'], params['b3']

    # settings forwarded to the max-pooling layer
    pooling = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    # forward pass; each layer returns its output plus a cache for backprop
    conv_out, conv_cache = layers.conv_forward(X, conv_w)
    conv_act, conv_relu_cache = layers.relu_forward(conv_out)
    pooled, pool_cache = layers.max_pool_forward(conv_act, pooling)
    hidden, fc1_cache = layers.fc_forward(pooled, fc1_w, fc1_b)
    hidden_act, fc1_relu_cache = layers.relu_forward(hidden)
    scores, fc2_cache = layers.fc_forward(hidden_act, fc2_w, fc2_b)

    if y is None:
      return scores

    # backward pass, walking the layers in reverse order
    loss, d_scores = layers.softmax_loss(scores, y)
    d_hidden_act, dW3, db3 = layers.fc_backward(d_scores, fc2_cache)
    d_hidden = layers.relu_backward(d_hidden_act, fc1_relu_cache)
    d_pooled, dW2, db2 = layers.fc_backward(d_hidden, fc1_cache)
    # the fc gradient is reshaped to the pooled output's shape before
    # it is pushed through the pooling layer
    d_conv_act = layers.max_pool_backward(
        d_pooled.reshape(pooled.shape), pool_cache)
    d_conv_out = layers.relu_backward(d_conv_act, conv_relu_cache)
    _, dW1 = layers.conv_backward(d_conv_out, conv_cache)

    return loss, {'W1': dW1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}

Example #2

0

Show file

    def loss(self, X, y=None, reg=1e-5):
        """
        Compute loss and gradients for the conv - affine - affine network,
        printing a progress message before every stage.

        Inputs:
        - X: Minibatch of input data, handed to
          layers.conv_relu_pool_forward.
        - y: Labels for X, or None to run the forward pass only.
        - reg: Accepted but never read in this method; the L2 strength that
          is actually applied is self.reg.
          NOTE(review): confirm whether callers expect this argument to
          take effect.

        Returns:
        - scores when y is None;
        - otherwise a tuple (loss, grads): the softmax data loss plus
          0.5 * self.reg * sum(W * W) over W1, W2, W3, and a dict of
          gradients under the keys 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'.
        """
        # Single-argument print(...) calls produce the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('start computing loss and grad.............')
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        W3, b3 = self.params['W3'], self.params['b3']

        # pass conv_param to the forward pass for the convolutional layer;
        # floor division keeps the padding an integer on Python 3 as well
        filter_size = W1.shape[2]
        conv_param = {'stride': 1, 'pad': (filter_size - 1) // 2}

        # pass pool_param to the forward pass for the max-pooling layer
        pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

        # compute the forward pass
        print('compute the forward pass......')
        print('compute the w1 conv_relu_pool_forward forward pass......')
        a1, cache1 = layers.conv_relu_pool_forward(X, W1, b1, conv_param,
                                                   pool_param)

        print('compute the w2 affine_relu_forward forward pass......')
        a2, cache2 = layers.affine_relu_forward(a1, W2, b2)

        print('compute the w3 affine_forward forward pass......')
        scores, cache3 = layers.affine_forward(a2, W3, b3)

        if y is None:
            return scores

        # compute the backward pass
        print('compute the backward pass......')
        print('compute the softmax_loss backward pass......')
        data_loss, dscores = layers.softmax_loss(scores, y)

        print('compute the dw3 affine_backward backward pass......')
        da2, dW3, db3 = layers.affine_backward(dscores, cache3)

        print('compute the dw2 affine_relu_backward backward pass......')
        da1, dW2, db2 = layers.affine_relu_backward(da2, cache2)

        print('compute the dw1 conv_relu_pool_backward backward pass......')
        dX, dW1, db1 = layers.conv_relu_pool_backward(da1, cache1)

        # Add regularization (weights only; the biases are left untouched)
        dW1 += self.reg * W1
        dW2 += self.reg * W2
        dW3 += self.reg * W3
        reg_loss = 0.5 * self.reg * sum(np.sum(W * W) for W in [W1, W2, W3])
        loss = data_loss + reg_loss
        grads = {
            'W1': dW1,
            'b1': db1,
            'W2': dW2,
            'b2': db2,
            'W3': dW3,
            'b3': db3,
        }
        print(' computing loss and grad end !!!!!!!!!!!!!!!!!')
        # %-formatting keeps the "loss is : <value>" output identical on both
        # interpreters (a two-argument print() would emit a tuple on Python 2)
        print('loss is : %s' % (loss,))
        return loss, grads

Example #3

0

Show file

File: autograder.py Project: llee628/EECS545

    def test_softmax_loss(self):
        """
        Compare layers.softmax_loss with a numerically estimated gradient.

        The check passes when the loss is within 0.05 of 2.3 and the analytic
        gradient matches the numerical one to an absolute tolerance of 1e-7.
        """
        np.random.seed(498)
        num_classes, num_inputs = 10, 50
        scores = 0.001 * np.random.randn(num_inputs, num_classes)
        labels = np.random.randint(num_classes, size=num_inputs)

        def loss_only(candidate):
            # the numerical differentiator only needs the scalar loss
            return layers.softmax_loss(candidate, labels)[0]

        numeric_grad = eval_numerical_gradient(
            loss_only, scores, verbose=False)
        loss, analytic_grad = layers.softmax_loss(scores, labels)

        # Report the values first so a failing assertion is easy to diagnose.
        print('\nTesting softmax_loss:')
        print('loss: ', loss)
        print('dx error: ', rel_error(numeric_grad, analytic_grad))

        np.testing.assert_allclose(loss, 2.3, atol=0.05)
        np.testing.assert_allclose(analytic_grad, numeric_grad, atol=1e-7)

Example #4

0

Show file

File: classifier_fc_net.py Project: dz-chen/cs231n-homework

    def loss(self, X, y=None):
        """
        Run the affine - relu - affine network on a minibatch.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        With y=None (test time) only the forward pass runs and the result is
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        With labels (training time) forward and backward passes run and the
        result is a tuple of:
        - loss: Scalar value giving the loss (softmax data loss plus
          0.5 * self.reg * sum of squared weights of W1 and W2)
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to them.
        """
        params = self.params

        hidden, hidden_cache = layer_utilities.affine_relu_forward(
            X, params['W1'], params['b1'])
        # the output layer is a plain affine map, no relu after it
        scores, output_cache = layers.affine_forward(
            hidden, params['W2'], params['b2'])

        if y is None:
            return scores

        # data loss plus the L2 penalty on both weight matrices
        data_loss, d_scores = layers.softmax_loss(scores, y)
        w1_squared = np.sum(params['W1'] * params['W1'])
        w2_squared = np.sum(params['W2'] * params['W2'])
        loss = data_loss + 0.5 * self.reg * (w1_squared + w2_squared)

        # backward pass through the two layers, last layer first
        d_hidden, dW2, db2 = layers.affine_backward(d_scores, output_cache)
        _, dW1, db1 = layer_utilities.affine_relu_backward(
            d_hidden, hidden_cache)

        grads = {
            'W2': dW2 + self.reg * params['W2'],
            'b2': db2,
            'W1': dW1 + self.reg * params['W1'],
            'b1': db1,
        }
        return loss, grads

Example #5

0

Show file

def training_step(model, X_batch, y_batch, reg):
    """
    Run one training iteration of a model on a minibatch and return the loss
    together with the parameter gradients.

    The loss is the softmax (cross-entropy) data loss plus an L2 penalty on
    the weight matrices model.W1 and model.W2. Bias vectors are not
    regularized.

    Inputs:
    - model: A Classifier instance
    - X_batch: A numpy array of shape (N, D) giving a minibatch of images
    - y_batch: A numpy array of shape (N,) where 0 <= y_batch[i] < C is the
      ground-truth label for the image X_batch[i]
    - reg: A float giving the strength of L2 regularization to use.

    Returns a tuple of:
    - loss: A float giving the loss (data loss + regularization loss) for the
      model on this minibatch of data
    - grads: A dictionary giving gradients of the loss with respect to the
      parameters of the model. In particular grads[k] should be the gradient
      of the loss with respect to model.parameters()[k].
    """
    # forward pass, data loss, then backprop through the model
    scores, cache = model.forward(X_batch)
    data_loss, grad_scores = softmax_loss(scores, y_batch)
    grads = model.backward(grad_scores, cache)

    # each call yields (penalty value, penalty gradient) for one matrix
    penalties = [l2_regularization(weights, reg)
                 for weights in (model.W1, model.W2)]
    (w1_penalty, w1_penalty_grad), (w2_penalty, w2_penalty_grad) = penalties

    loss = data_loss + w1_penalty + w2_penalty
    # accumulate in place into the gradients returned by model.backward
    grads['W1'] += w1_penalty_grad
    grads['W2'] += w2_penalty_grad

    return loss, grads

Example #6

0

Show file

    def loss(self, X, y=None):
        """
        Evaluate the two-layer network (affine - relu - affine) on a minibatch.

        Inputs:
        - X: Array of input data of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        With y=None (test time) only the forward pass runs and the result is
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        With labels (training time) the result is a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss with respect to them.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        hidden, hidden_cache = affine_relu_forward(X, W1, b1)
        scores, output_cache = affine_forward(hidden, W2, b2)

        if y is None:
            return scores

        # softmax data loss, then one L2 term per weight matrix
        data_loss, dscores = softmax_loss(scores, y)
        loss = data_loss + 0.5 * self.reg * np.sum(W1 * W1)
        loss = loss + 0.5 * self.reg * np.sum(W2 * W2)

        # backprop through the output layer first, then the hidden layer
        dhidden, dW2, db2 = affine_backward(dscores, output_cache)
        _, dW1, db1 = affine_relu_backward(dhidden, hidden_cache)

        grads = {
            'W2': dW2 + self.reg * W2,
            'b2': db2,
            'W1': dW1 + self.reg * W1,
            'b1': db1,
        }
        return loss, grads

Example #7

0

Show file

File: softmax.py Project: yilunh98/StanfordCS231n

    def loss(self, X, y=None):
        """
        Evaluate the fc - relu - fc network on a minibatch.

        Inputs:
        - X: Array of input data of shape (N, d_in)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        With y=None (test time) only the forward pass runs and the result is
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.

        With labels (training time) the result is a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with gradients under the keys 'W1', 'b1', 'W3'
          and 'b3'.
        """
        params = self.params
        W1, b1 = params['W1'], params['b1']
        W3, b3 = params['W3'], params['b3']
        # the values are not used below, but the unpack fails unless X has
        # exactly two dimensions
        N, d_in = X.shape

        # forward: fc -> relu -> fc
        pre_act, fc1_cache = layers.fc_forward(X, W1, b1)
        hidden, relu_cache = layers.relu_forward(pre_act)
        scores, fc2_cache = layers.fc_forward(hidden, W3, b3)

        # without labels we are in test mode: only the scores are wanted
        if y is None:
            return scores

        # backward: the same three layers in reverse order
        loss, dscores = layers.softmax_loss(scores, y)
        dhidden, dW3, db3 = layers.fc_backward(dscores, fc2_cache)
        dpre_act = layers.relu_backward(dhidden, relu_cache)
        _, dW1, db1 = layers.fc_backward(dpre_act, fc1_cache)

        return loss, {'W1': dW1, 'b1': db1, 'W3': dW3, 'b3': db3}

Example #8

0

Show file

File: model_2layer.py Project: pdpham-llnl/sparse-gcn

	def loss(self,X,L,Y=None):
		"""
		Evaluate the two-stage model (gcn_relu_fw followed by sum_out_fw).

		Inputs:
		- X: input data, passed to gcn_relu_fw together with L
		- L: passed to gcn_relu_fw alongside X
		  (presumably per-sample graph operators -- TODO confirm)
		- Y: labels, or None to run the forward pass only

		Returns:
		- scores when Y is None;
		- otherwise a tuple (loss, grads): the softmax loss plus
		  0.5 * self.reg * (||Theta1||^2 + ||W2||^2), and gradients under
		  the keys 'Theta1' and 'W2'.
		"""
		theta1 = self.params['Theta1']
		w2 = self.params['W2']

		# forward pass
		hidden, hidden_cache = gcn_relu_fw(L, X, theta1)
		scores, sum_cache = sum_out_fw(w2, hidden)

		if Y is None:
			return scores

		loss, dscores = softmax_loss(scores, Y)

		# L2 penalty on both parameter arrays
		squared_norms = LA.norm(theta1)**2 + LA.norm(w2)**2
		loss += self.reg * .5 * squared_norms

		# backward pass; the regularization term is added in place
		dhidden, dw2 = sum_out_bw(dscores, sum_cache)
		dw2 += self.reg * w2

		_, dtheta1 = gcn_relu_bw(dhidden, hidden_cache)
		dtheta1 += self.reg * theta1

		return loss, {'Theta1': dtheta1, 'W2': dw2}

Example #9

0

Show file

File: gcn_test.py Project: pdpham-llnl/sparse-gcn

def test_onelayer_gcn():
    """
    Check OneLayer.loss on random data.

    First the returned loss is compared with one recomputed by hand from the
    model parameters, then every gradient is compared with a numerical
    estimate.
    """
    # ---- loss value -------------------------------------------------------
    N, dim, n_labels, K = 10, 5, 3, 2

    X = np.random.rand(N, dim)
    y = np.random.randint(n_labels, size=N)

    # one dense random (dim x dim) CSR matrix per row of X
    L = [sp.rand(dim, dim, density=1, format='csr') for _ in range(N)]

    model = OneLayer(N, K, n_labels, weight_scale=1e-3)
    loss, grads = model.loss(X, L, y)

    # recompute the forward pass by hand from the model's parameters
    theta1, w2 = model.params['Theta1'], model.params['W2']
    hidden = np.array(
        [expmulit(operator, row, theta1) for operator, row in zip(L, X)])
    hidden = np.maximum(hidden, 0)
    row_sums = np.dot(hidden, np.ones(dim)).reshape(-1, 1)
    expected_scores = row_sums.dot(w2.reshape(1, -1))
    correct_loss, _ = softmax_loss(expected_scores, y)

    print('check loss diff')
    assert_diff(loss, correct_loss)

    # ---- gradients --------------------------------------------------------
    _, grads = model.loss(X, L, y)

    def loss_value(_):
        # the argument is ignored: the differentiator is handed
        # model.params[name] itself, which the model reads directly
        return model.loss(X, L, y)[0]

    for name in ('Theta1', 'W2'):
        analytic = grads[name]
        numeric = eval_numerical_gradient(loss_value,
                                          model.params[name],
                                          verbose=False)

        print('Check grad', name)
        assert_diff(analytic, numeric, 1e-8)

Example #10

0

Show file

        def loss(self, X, y=None):
            """
            Compute loss and gradient for the fully-connected net.

            Inputs:
            - X: Minibatch of input data; cast to self.dtype before use.
            - y: Labels for X. When None the net runs in 'test' mode and
              only the class scores are computed.

            Returns:
            - scores when y is None;
            - otherwise a tuple (loss, grads): the softmax data loss plus
              0.5 * self.reg * sum(W * W) over every weight matrix, and a
              dict in which grads[k] is the gradient for self.params[k].
              The batchnorm gamma/beta parameters are not regularized.
            """
            X = X.astype(self.dtype)
            mode = 'test' if y is None else 'train'

            # Set train/test mode for batchnorm params and dropout param
            # since they behave differently during training and testing.
            # NOTE(review): the mode is set when dropout_param is not None,
            # while the layers below are gated on self.use_dropout -- confirm
            # the two conditions always agree.
            if self.dropout_param is not None:
                self.dropout_param['mode'] = mode
            if self.use_batchnorm:
                for bn_param in self.bn_params:
                    bn_param['mode'] = mode

            num_layers = self.num_layers
            last_w = 'W%d' % num_layers
            last_b = 'b%d' % num_layers

            # ---- forward pass ---------------------------------------------
            layer_input = X
            ar_cache = {}   # hidden layer index -> affine(-bn)-relu cache
            dp_cache = {}   # hidden layer index -> dropout cache

            # range() iterates exactly like the Python-2-only xrange() here
            # and also exists on Python 3.
            for lay in range(num_layers - 1):
                W = self.params['W%d' % (lay + 1)]
                b = self.params['b%d' % (lay + 1)]
                if self.use_batchnorm:
                    # batchnorm layer `lay` gets its own self.bn_params[lay]
                    layer_input, ar_cache[lay] = affine_bn_relu_forward(
                        layer_input, W, b,
                        self.params['gamma%d' % (lay + 1)],
                        self.params['beta%d' % (lay + 1)],
                        self.bn_params[lay])
                else:
                    layer_input, ar_cache[lay] = affine_relu_forward(
                        layer_input, W, b)

                if self.use_dropout:
                    layer_input, dp_cache[lay] = dropout_forward(
                        layer_input, self.dropout_param)

            # final layer: plain affine map; its cache is stored under the
            # key num_layers
            scores, ar_cache[num_layers] = affine_forward(
                layer_input, self.params[last_w], self.params[last_b])

            # If test mode return early
            if mode == 'test':
                return scores

            # ---- backward pass --------------------------------------------
            grads = {}
            loss, dscores = softmax_loss(scores, y)

            # output layer (the 0.5 factor makes the penalty gradient
            # exactly reg * W)
            loss = loss + 0.5 * self.reg * np.sum(
                self.params[last_w] * self.params[last_w])
            dhout, dw, db = affine_backward(dscores, ar_cache[num_layers])
            grads[last_w] = dw + self.reg * self.params[last_w]
            grads[last_b] = db

            # hidden layers, from the last one down to the first
            for lay in reversed(range(num_layers - 1)):
                W = self.params['W%d' % (lay + 1)]
                loss = loss + 0.5 * self.reg * np.sum(W * W)
                if self.use_dropout:
                    dhout = dropout_backward(dhout, dp_cache[lay])
                if self.use_batchnorm:
                    dhout, dw, db, dgamma, dbeta = affine_bn_relu_backward(
                        dhout, ar_cache[lay])
                else:
                    dhout, dw, db = affine_relu_backward(
                        dhout, ar_cache[lay])
                grads['W%d' % (lay + 1)] = dw + self.reg * W
                grads['b%d' % (lay + 1)] = db
                if self.use_batchnorm:
                    grads['gamma%d' % (lay + 1)] = dgamma
                    grads['beta%d' % (lay + 1)] = dbeta

            return loss, grads

Example #11

0

Show file

    def loss(self, X, y=None):
        """
        Compute scores, or loss and gradients, for the fully-connected net.

        Each hidden layer is affine -> [batchnorm] -> relu -> [dropout]; the
        last layer is a plain affine map.

        Inputs:
        - X: Minibatch of input data; cast to self.dtype and reshaped to one
          row per sample.
        - y: Labels for X, or None to run in 'test' mode.

        Returns:
        - scores when y is None;
        - otherwise a tuple (loss, grads): the softmax data loss plus
          0.5 * self.reg * sum(W ** 2) over every weight matrix, and a dict
          of gradients keyed by parameter name.
        """
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # dropout and batchnorm need to know which mode this call runs in
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.use_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        # one row per sample
        X = np.reshape(X, [X.shape[0], -1])

        fc_caches, relu_caches = [], []
        bn_caches, dropout_caches = [], []
        last = str(self.num_layers)

        # ---- forward pass: hidden layers 1 .. num_layers-1 ----------------
        for layer in range(1, self.num_layers):
            tag = str(layer)
            act, fc_cache = affine_forward(
                X, self.params['W' + tag], self.params['b' + tag])
            fc_caches.append(fc_cache)
            if self.use_batchnorm:
                act, bn_cache = batchnorm_forward(
                    act, self.params['gamma' + tag],
                    self.params['beta' + tag], self.bn_params[layer - 1])
                bn_caches.append(bn_cache)
            act, relu_cache = relu_forward(act)
            relu_caches.append(relu_cache)
            if self.use_dropout:
                act, dropout_cache = dropout_forward(act, self.dropout_param)
                dropout_caches.append(dropout_cache)

            X = act.copy()

        # last layer
        scores, final_cache = affine_forward(
            X, self.params['W' + last], self.params['b' + last])

        if mode == 'test':
            return scores

        # ---- backward pass ------------------------------------------------
        grads = {}
        loss, dout = softmax_loss(scores, y)
        loss += 0.5*self.reg*(np.sum(np.square(self.params['W' + last])))

        # backprop through the last layer
        dout, dw, db = affine_backward(dout, final_cache)
        grads['W' + last] = dw + self.reg*self.params['W' + last]
        grads['b' + last] = db

        # hidden layers, from the last one down to layer 1
        for layer in reversed(range(1, self.num_layers)):
            tag = str(layer)
            cache_idx = layer - 1

            if self.use_dropout:
                dout = dropout_backward(dout, dropout_caches[cache_idx])

            dout = relu_backward(dout, relu_caches[cache_idx])
            if self.use_batchnorm:
                dout, dgamma, dbeta = batchnorm_backward(
                    dout, bn_caches[cache_idx])
                grads['beta' + tag] = dbeta
                grads['gamma' + tag] = dgamma
            dout, dw, db = affine_backward(dout, fc_caches[cache_idx])

            grads['W' + tag] = dw + self.reg*self.params['W' + tag]
            grads['b' + tag] = db

            loss += 0.5*self.reg*(np.sum(np.square(self.params['W' + tag])))

        return loss, grads

Example #12

0

Show file

    lambda w: affine_relu_forward(x, w, b)[0], w, dout)
# Numerical gradient of the affine_relu_forward output (element [0] of its
# result) with respect to b, using dout as the upstream gradient.
db_num = eval_numerical_gradient_array(
    lambda b: affine_relu_forward(x, w, b)[0], b, dout)

# Report the relative error between each numerical gradient and its analytic
# counterpart. dx_num, dw_num, dx, dw and db are defined before this excerpt.
print('Testing affine_relu_forward:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))
#######################################################################################

#######################################################################################
# Test the softmax_loss function
######################################################################################
from layers import softmax_loss

# Fixed seed so the random scores and labels are reproducible.
np.random.seed(231)
num_classes, num_inputs = 10, 50
# Scores are scaled by 0.001; labels are integers in [0, num_classes).
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)

# Numerical gradient of the loss value (element [0] of the result) w.r.t. x,
# followed by the analytic loss and gradient for comparison.
dx_num = eval_numerical_gradient(lambda x: softmax_loss(x, y)[0],
                                 x,
                                 verbose=False)
loss, dx = softmax_loss(x, y)

# Test softmax_loss function. Loss should be about 2.3 and the dx error
# should be around 1e-8.
print('\nTesting softmax_loss:')
print('loss: ', loss)
print('dx error: ', rel_error(dx_num, dx))
#######################################################################################

Example #13

0

Show file

File: cnn_import_classes.py Project: smrut1r/deep-learning-map

# Convolutional classifier
# NOTE(review): this header originally read "Multilayer Perceptron
# classifier", but the first layer built here is a cnn2d -- confirm the title.
###########################

# Initialise parameters
# Xh, Xw, Xd, f1_field and h1_units are defined before this excerpt.
# The filter is square: f1_field is used for both entries of filter_shape.
cnn1 = cnn2d(input_shape=(Xh, Xw, Xd),
             filter_shape=(f1_field, f1_field),
             num_filters=h1_units)


def flatten(x, n_examples):
    """Reshape x to 2-D with n_examples rows; the row length is inferred."""
    target_shape = (n_examples, -1)
    return np.reshape(x, target_shape)


# Remaining layers and the data-loss object (bound to the training labels).
relu1 = relu()
# NOTE(review): the fc input size is yw * yh * Xd * h1_units -- confirm it
# matches the width of the flattened conv output.
fc2 = fc2d(cnn1.yw * cnn1.yh * Xd * h1_units, K)
data_loss_fn = softmax_loss(y_train)

# Training loop; n_epochs, X_train, num_examples and reg are defined before
# this excerpt.
for i in range(n_epochs):
    # Forward pass: conv -> flatten -> relu -> fc -> softmax data loss
    conv1 = cnn1.forward(X_train)
    print("conv1:", conv1.shape)
    flatten1 = flatten(conv1, num_examples)
    h1 = relu1.forward(flatten1)
    scores = fc2.forward(h1)
    data_loss = data_loss_fn.forward(scores)
    # L2 penalty on the conv and fc weights
    reg_loss = 0.5 * reg * (np.sum(cnn1.W * cnn1.W) + np.sum(fc2.W * fc2.W))
    loss = data_loss + reg_loss
    # i % 1 is always 0, so the loss is printed on every epoch
    if i % 1 == 0:
        print("Epoch: %d, Loss: %f" % (i, loss))

    # Backprop (the rest of the loop body lies beyond this excerpt)

Example #14

0

Show file

# Number of dimensions of input
D = 2
# Number of classes
K = 3

# N is defined before this excerpt.
X, y = generate_spiral_data(N, D, K, plot=False)

###########################
# Multilayer Perceptron classifier
###########################

# Initialise parameters: fc (D -> h1_units), relu, fc (h1_units -> K), and a
# data-loss object bound to the labels. h1_units is defined before this
# excerpt.
fc1 = fc2d(D, h1_units)
relu1 = relu()
fc2 = fc2d(h1_units, K)
data_loss_fn = softmax_loss(y)

# Training loop; n_epochs and reg are defined before this excerpt.
for i in range(n_epochs):
    # Forward pass: fc -> relu -> fc -> softmax data loss
    h1_prod = fc1.forward(X)
    h1 = relu1.forward(h1_prod)
    scores = fc2.forward(h1)
    data_loss = data_loss_fn.forward(scores)
    # L2 penalty on both weight matrices
    reg_loss = 0.5*reg*(np.sum(fc1.W * fc1.W) + np.sum(fc2.W*fc2.W))
    loss = data_loss + reg_loss
    # report the loss once every 1000 epochs
    if i % 1000 == 0:
        print("Epoch: %d, Loss: %f" % (i, loss))

    # Backprop (continues beyond this excerpt)
    # Gradient of the softmax loss w.r.t. the scores:
    # dLi/dfk = pk - 1(yi=k)
    dscores = data_loss_fn.backward()