Example #1
    def test_fc_backward(self):
        # FC layer: backward
        np.random.seed(498)
        x = np.random.randn(10, 6)
        w = np.random.randn(6, 5)
        b = np.random.randn(5)
        dout = np.random.randn(10, 5)

        dx_num = eval_numerical_gradient_array(
            lambda x: layers.fc_forward(x, w, b)[0], x, dout)
        dw_num = eval_numerical_gradient_array(
            lambda w: layers.fc_forward(x, w, b)[0], w, dout)
        db_num = eval_numerical_gradient_array(
            lambda b: layers.fc_forward(x, w, b)[0], b, dout)

        _, cache = layers.fc_forward(x, w, b)
        dx, dw, db = layers.fc_backward(dout, cache)

        # The error should be around 1e-9
        print('\nTesting fc_backward function:')
        print('dx error: ', rel_error(dx_num, dx))
        print('dw error: ', rel_error(dw_num, dw))
        print('db error: ', rel_error(db_num, db))

        np.testing.assert_allclose(dx, dx_num, atol=1e-8)
        np.testing.assert_allclose(dw, dw_num, atol=1e-8)
        np.testing.assert_allclose(db, db_num, atol=1e-8)
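Example #1 exercises layers.fc_forward and layers.fc_backward without showing them. A minimal sketch consistent with the shapes used in the test (x: (N, D), w: (D, M), b: (M,)), assuming the cache simply stores the forward inputs, could look like this; it is an illustration, not the graded implementation:

import numpy as np

def fc_forward(x, w, b):
    # Affine transform: (N, D) @ (D, M) + (M,) -> (N, M)
    out = x.dot(w) + b
    cache = (x, w, b)
    return out, cache

def fc_backward(dout, cache):
    # dout: upstream gradient of shape (N, M)
    x, w, b = cache
    dx = dout.dot(w.T)       # (N, D)
    dw = x.T.dot(dout)       # (D, M)
    db = dout.sum(axis=0)    # (M,)
    return dx, dw, db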
Example #2
 def forward(self, train_data):
     # Three fully connected layers with ReLU activations after the first two
     self.nurons["z2"] = fc_forward(train_data, self.weights["W1"], self.weights["b1"])
     self.nurons["z2_relu"] = relu_forward(self.nurons["z2"])
     self.nurons["z3"] = fc_forward(self.nurons["z2_relu"], self.weights["W2"], self.weights["b2"])
     self.nurons["z3_relu"] = relu_forward(self.nurons["z3"])
     self.nurons["y"] = fc_forward(self.nurons["z3_relu"], self.weights["W3"], self.weights["b3"])
     return self.nurons["y"]
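relu_forward is called here as if it returned only the activation; other examples on this page unpack an (out, cache) pair instead. A minimal sketch of the convention used in this example (the implementation is an assumption):

import numpy as np

def relu_forward(x):
    # Element-wise rectifier; this variant returns only the activation
    return np.maximum(0, x)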
Example #3
  def loss(self, X, y=None):
    """
    Evaluate loss and gradient for the three-layer convolutional network.
    """
    W1 = self.params['W1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # pass pool_param to the forward pass for the max-pooling layer
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    scores = None
    conv, cache1 = layers.conv_forward(X, W1)
    relu1, cache2 = layers.relu_forward(conv)
    maxp, cache3 = layers.max_pool_forward(relu1, pool_param)
    fc1, cache4 = layers.fc_forward(maxp, W2, b2)
    relu2, cache5 = layers.relu_forward(fc1)
    scores, cache6 = layers.fc_forward(relu2, W3, b3)

    if y is None:
      return scores

    loss, grads = 0, {}
    loss, dscores = layers.softmax_loss(scores, y)
    dx3, dW3, db3 = layers.fc_backward(dscores, cache6)
    dRelu2 = layers.relu_backward(dx3, cache5)
    dx2, dW2, db2 = layers.fc_backward(dRelu2, cache4)
    dmaxp = layers.max_pool_backward(dx2.reshape(maxp.shape), cache3)
    dRelu1 = layers.relu_backward(dmaxp, cache2)
    dx, dW1 = layers.conv_backward(dRelu1, cache1)

    grads = {'W1': dW1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}

    return loss, grads
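Because of the early return, the same loss() method serves both inference and training. A hypothetical usage sketch (model, X_batch, y_batch, and learning_rate are assumed names, not part of the example):

# Test time: no labels, loss() returns class scores of shape (N, C)
scores = model.loss(X_batch)
predictions = scores.argmax(axis=1)

# Training time: labels are given, loss() returns the loss and a grads dict
loss, grads = model.loss(X_batch, y_batch)
for name, grad in grads.items():
    model.params[name] -= learning_rate * grad   # plain SGD step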
Example #4
    def train(self):
        # Randomly initialize the parameters
        W1 = np.random.randn(2, 3)
        b1 = np.zeros([3])
        loss = 100.0
        lr = 0.01
        i = 0

        while loss > 1e-15:
            x, y_true = self.next_sample(2)  # get the current sample
            # Forward pass
            y = fc_forward(x, W1, b1)
            # Backward pass: compute the loss and the parameter gradients
            loss, dy = mean_squared_loss(y, y_true)
            dw, db, _ = fc_backward(dy, W1, x)

            # Gradients are averaged over the batch
            # print(dw)

            # Update the parameters
            W1 -= lr * dw
            b1 -= lr * db

            # Update the iteration counter
            i += 1
            if i % 1000 == 0:
                print("\nIteration {}: loss: {}, weights: {}, bias: {}".format(i, loss, W1, b1))

        # Print the final result
        print("\nIteration {}: loss: {}, weights: {}, bias: {}".format(i, loss, W1, b1))

        return W1, b1
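mean_squared_loss is not shown; a minimal sketch consistent with its use above (it must return the scalar loss and the gradient with respect to the prediction), assuming the loss is the mean of the squared errors:

import numpy as np

def mean_squared_loss(y_predict, y_true):
    # Mean of squared errors over every element of the batch
    diff = y_predict - y_true
    loss = np.mean(np.square(diff))
    dy = 2.0 * diff / diff.size   # gradient of the mean w.r.t. y_predict
    return loss, dy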
Example #5
    def forward(self, X):
        scores, cache = None, None
        #######################################################################
        # TODO: Implement the forward pass to compute classification scores   #
        # for the input data X. Store into cache any data that will be needed #
        # during the backward pass.                                           #
        #######################################################################
        out11, cache11 = fc_forward(X, self.W1, self.b1)
        out12, cache12 = relu_forward(out11)
        scores, cache2 = fc_forward(out12, self.W2, self.b2)

        cache = (cache11, cache12, cache2)
        #######################################################################
        #                          END OF YOUR CODE                           #
        #######################################################################
        return scores, cache
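The cache built in forward() is meant to feed a matching backward pass. A sketch of what that could look like, assuming fc_backward and relu_backward follow the (dout, cache) convention used elsewhere on this page (the method itself is hypothetical):

    def backward(self, dscores, cache):
        # Unpack the caches saved by forward() and run the layers in reverse
        cache11, cache12, cache2 = cache
        dout12, dW2, db2 = fc_backward(dscores, cache2)
        dout11 = relu_backward(dout12, cache12)
        dX, dW1, db1 = fc_backward(dout11, cache11)
        grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
        return dX, grads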
Example #6
    def forward(self, x):
        """

        :param x: [B,in_units]
        :return output: [B,out_units]
        """
        self.in_features = x
        output = fc_forward(x, self.weight, self.bias)
        return output
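Storing the input in self.in_features only makes sense if a backward method uses it later. A sketch of such a method, assuming the fc_backward(delta, weight, input) signature from the training-loop example above, and assuming it also returns the gradient with respect to the input (both are assumptions):

    def backward(self, delta):
        """
        :param delta: upstream gradient, [B, out_units]
        :return: gradient with respect to the input, [B, in_units]
        """
        # Hypothetical signature: fc_backward(delta, weight, input) -> (dweight, dbias, dinput)
        dw, db, dx = fc_backward(delta, self.weight, self.in_features)
        self.dweight, self.dbias = dw, db   # keep parameter gradients for an optimizer
        return dx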
Example #7
    def sample(self, features, max_length=30):
        """
        Run a test-time forward pass for the model, sampling captions for input
        feature vectors.
        At each timestep, we embed the current word, pass it and the previous hidden
        state to the RNN to get the next hidden state, use the hidden state to get
        scores for all vocab words, and choose the word with the highest score as
        the next word. The initial hidden state is computed by applying an affine
        transform to the input image features, and the initial word is the <START>
        token.
        Inputs:
        - features: Array of input image features of shape (N, D).
        - max_length: Maximum length T of generated captions.
        Returns:
        - captions: Array of shape (N, max_length) giving sampled captions,
          where each element is an integer in the range [0, V). The first element
          of captions should be the first sampled word, not the <START> token.
        """
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length), dtype=np.int32)

        # Unpack parameters
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        W_embed = self.params['W_embed']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        # The initial hidden state is an affine projection of the image features
        fc, cache1 = layers.fc_forward(features, W_proj, b_proj)
        captions[:, 0] = self._start
        x = np.array([self._start for i in range(N)])

        for t in range(1, max_length):
            # Embed the previous word, take one RNN step, then score the vocabulary
            emb, cache2 = rnn_layers.word_embedding_forward(x, W_embed)
            rnn, cache3 = rnn_layers.rnn_step_forward(emb, fc, Wx, Wh, b)
            fc = rnn
            vocab, cache3 = layers.fc_forward(rnn, W_vocab, b_vocab)
            x = vocab.argmax(1)
            captions[:, t] = x

        return captions
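The sampling loop depends on word_embedding_forward; a minimal sketch, assuming the embedding is a plain row lookup into W_embed (for the 1-D index array used here the result has shape (N, W)):

def word_embedding_forward(x, W_embed):
    # x holds integer word indices; indexing picks the corresponding embedding rows
    out = W_embed[x]
    cache = (x, W_embed)
    return out, cache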
Example #8
    def loss(self, X, y=None):
        """
        Compute loss and gradient for a minibatch of data.
        Inputs:
        - X: Array of input data of shape (N, d_in)
        - y: Array of labels, of shape (N,). y[i] gives the label for X[i].
        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, C) giving classification scores, where
          scores[i, c] is the classification score for X[i] and class c.
        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
          names to gradients of the loss with respect to those parameters.
        """
        W1, b1 = self.params['W1'], self.params['b1']
        W3, b3 = self.params['W3'], self.params['b3']
        N, d_in = X.shape

        scores = None
        f, cache1 = layers.fc_forward(X, W1, b1)  #fc
        h, cache2 = layers.relu_forward(f)  #relu
        scores, cache3 = layers.fc_forward(h, W3, b3)  #fc

        # If y is None then we are in test mode so just return scores
        if y is None:
            return scores

        loss, grads = 0, {}
        loss, dscores = layers.softmax_loss(scores, y)
        dx2, dW3, db3 = layers.fc_backward(dscores, cache3)
        dx1 = layers.relu_backward(dx2, cache2)
        dx, dW1, db1 = layers.fc_backward(dx1, cache1)

        grads = {'W1': dW1, 'b1': db1, 'W3': dW3, 'b3': db3}

        return loss, grads
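layers.softmax_loss is used but not shown; a standard, numerically stabilized sketch (an assumption about the actual implementation):

import numpy as np

def softmax_loss(scores, y):
    # Shift scores row-wise for numerical stability before exponentiating
    shifted = scores - scores.max(axis=1, keepdims=True)
    probs = np.exp(shifted)
    probs /= probs.sum(axis=1, keepdims=True)
    N = scores.shape[0]
    loss = -np.log(probs[np.arange(N), y]).mean()
    dscores = probs.copy()
    dscores[np.arange(N), y] -= 1
    dscores /= N
    return loss, dscores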
Example #9
    def test_fc_forward(self):
        # FC layer: forward
        num_inputs = 2
        dim = 120
        output_dim = 3

        input_size = num_inputs * dim
        weight_size = output_dim * dim

        x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, dim)
        w = np.linspace(-0.2, 0.3, num=weight_size).reshape(dim, output_dim)
        b = np.linspace(-0.3, 0.1, num=output_dim)

        out, _ = layers.fc_forward(x, w, b)
        correct_out = np.array([[1.49834967, 1.70660132, 1.91485297],
                                [3.25553199, 3.5141327, 3.77273342]])

        # Compare your output with ours. The error might be around 1e-9.
        # As long as your error is small enough, your implementation should pass this test.
        print('Testing fc_forward function:')
        print('difference: ', rel_error(out, correct_out))

        np.testing.assert_allclose(out, correct_out, atol=1e-8)
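Assuming fc_forward computes a plain affine map out = x.dot(w) + b, the reference matrix can be reproduced inside the test:

        # Manual check of the expected values (should agree with correct_out to ~1e-9)
        expected = x.dot(w) + b
        print('manual difference: ', rel_error(expected, correct_out))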
Example #10
    def loss(self, features, captions):
        """
        Compute training-time loss for the RNN. We input image features and
        ground-truth captions for those images, and use an RNN to compute
        loss and gradients on all parameters.
        Inputs:
        - features: Input image features, of shape (N, D)
        - captions: Ground-truth captions; an integer array of shape (N, T) where
          each element is in the range 0 <= y[i, t] < V
        Returns a tuple of:
        - loss: Scalar loss
        - grads: Dictionary of gradients parallel to self.params
        """
        # Cut captions into two pieces: captions_in has everything but the last word
        # and will be input to the RNN; captions_out has everything but the first
        # word and this is what we will expect the RNN to generate. These are offset
        # by one relative to each other because the RNN should produce word (t+1)
        # after receiving word t. The first element of captions_in will be the START
        # token, and the first element of captions_out will be the first word.
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]

        # You'll need this
        mask = (captions_out != self._null)

        # Weight and bias for the affine transform from image features to initial
        # hidden state
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

        # Word embedding matrix
        W_embed = self.params['W_embed']

        # Input-to-hidden, hidden-to-hidden, and biases for the RNN
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

        # Weight and bias for the hidden-to-vocab transformation.
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        loss, grads = 0.0, {}
        ############################################################################
        # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
        # In the forward pass you will need to do the following:                   #
        # (1) Use an fc transformation to compute the initial hidden state         #
        #     from the image features. This should produce an array of shape (N, H)#
        # (2) Use a word embedding layer to transform the words in captions_in     #
        #     from indices to vectors, giving an array of shape (N, T, W).         #
        # (3) Use a vanilla RNN to process the sequence of input word vectors      #
        #     of shape (T, N, W), and produce hidden state vectors for all         #
        #     timesteps, producing an array of shape (T, N, H).                    #
        # (4) Use a (temporal) fc transformation to compute scores over the        #
        #     vocabulary at every timestep using the hidden states, giving an      #
        #     array of shape (N, T, V).                                            #
        # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
        #     the points where the output word is <NULL> using the mask above.     #
        #                                                                          #
        # In the backward pass you will need to compute the gradient of the loss   #
        # with respect to all model parameters. Use the loss and grads variables   #
        # defined above to store loss and gradients; grads[k] should give the      #
        # gradients for self.params[k].                                            #
        ############################################################################

        # Forward Pass
        #step 1
        h0, cache_0 = layers.fc_forward(features, W_proj, b_proj)

        #step 2
        word_embedded, word_embedded_cache = word_embedding_forward(captions_in, W_embed)
        word_embedded = np.transpose(word_embedded, (1,0,2))

        #step 3
        h, cache_rnn = rnn_forward(word_embedded, h0, Wx, Wh, b)
        h = np.transpose(h, (1,0,2))

        #step 4
        y_hat, cache_temp = temporal_fc_forward(h, W_vocab, b_vocab)

        #step 5
        loss, dout = temporal_softmax_loss(y_hat, captions_out, mask)

        # Gradients
        #temporal backward
        dh, grads['W_vocab'], grads['b_vocab'] = temporal_fc_backward(dout, cache_temp)
        dh = np.transpose(dh, (1,0,2))

        #rnn backward
        d_word, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dh, cache_rnn)
        d_word = np.transpose(d_word, (1,0,2))

        #word embedded backward
        grads['W_embed'] = word_embedding_backward(d_word, word_embedded_cache)

        #full connected backward
        d_feature, grads['W_proj'], grads['b_proj'] = layers.fc_backward(dh0, cache_0)

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
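Step (4) and its backward pass rely on temporal_fc_forward and temporal_fc_backward, which apply the same affine map at every timestep. A minimal sketch, assuming the cache stores the forward inputs (the real rnn_layers versions may differ):

import numpy as np

def temporal_fc_forward(x, w, b):
    # x: (N, T, H) hidden states; w: (H, V); out: (N, T, V)
    N, T, H = x.shape
    out = x.reshape(N * T, H).dot(w).reshape(N, T, -1) + b
    cache = (x, w, b)
    return out, cache

def temporal_fc_backward(dout, cache):
    x, w, b = cache
    N, T, H = x.shape
    dout_flat = dout.reshape(N * T, -1)
    dx = dout_flat.dot(w.T).reshape(N, T, H)
    dw = x.reshape(N * T, H).T.dot(dout_flat)
    db = dout_flat.sum(axis=0)
    return dx, dw, db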
Example #11
    def sample(self, features, max_length=30):
        """
        Run a test-time forward pass for the model, sampling captions for input
        feature vectors.
        At each timestep, we embed the current word, pass it and the previous hidden
        state to the RNN to get the next hidden state, use the hidden state to get
        scores for all vocab words, and choose the word with the highest score as
        the next word. The initial hidden state is computed by applying an affine
        transform to the input image features, and the initial word is the <START>
        token.
        For LSTMs you will also have to keep track of the cell state; in that case
        the initial cell state should be zero.
        Inputs:
        - features: Array of input image features of shape (N, D).
        - max_length: Maximum length T of generated captions.
        Returns:
        - captions: Array of shape (N, max_length) giving sampled captions,
          where each element is an integer in the range [0, V). The first element
          of captions should be the first sampled word, not the <START> token.
        """
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length), dtype=np.int32)

        # Unpack parameters
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        W_embed = self.params['W_embed']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        ###########################################################################
        # TODO: Implement test-time sampling for the model. You will need to      #
        # initialize the hidden state of the RNN by applying the learned affine   #
        # transform to the input image features. The first word that you feed to  #
        # the RNN should be the <START> token; its value is stored in the         #
        # variable self._start. At each timestep you will need to do:             #
        # (1) Embed the previous word using the learned word embeddings           #
        # (2) Make an RNN step using the previous hidden state and the embedded   #
        #     current word to get the next hidden state.                          #
        # (3) Apply the learned fc transformation to the next hidden state to     #
        #     get scores for all words in the vocabulary                          #
        # (4) Select the word with the highest score as the next word, writing it #
        #     to the appropriate slot in the captions variable                    #
        #                                                                         #
        # For simplicity, you do not need to stop generating after an <END> token #
        # is sampled, but you can if you want to.                                 #
        #                                                                         #
        # HINT: You will not be able to use the rnn_forward functions; you'll     #
        # need to call rnn_step_forward or lstm_step_forward in a loop.           #
        ###########################################################################
        h0, _ = fc_forward(features, W_proj, b_proj)
        prev_h = h0
        captions[:, 0] = self._start
        prev_word = self._start * np.ones((N, ), dtype=int)
        for i in range(1, max_length):
            embd, _ = word_embedding_forward(prev_word, W_embed)
            prev_h, _ = rnn_step_forward(embd, prev_h, Wx, Wh, b)
            scores, _ = fc_forward(prev_h, W_vocab, b_vocab)
            selection = np.argmax(scores, axis=1)
            captions[:, i] = selection

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################
        return captions
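The loop above calls rnn_step_forward once per generated word. A minimal vanilla-RNN step sketch, assuming a tanh nonlinearity and a cache layout chosen only for illustration:

import numpy as np

def rnn_step_forward(x, prev_h, Wx, Wh, b):
    # x: (N, W) embedded word, prev_h: (N, H); one tanh recurrence step
    next_h = np.tanh(x.dot(Wx) + prev_h.dot(Wh) + b)
    cache = (x, prev_h, Wx, Wh, next_h)
    return next_h, cache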
Example #12
    def sample(self, features, max_length=30):
        """
        Run a test-time forward pass for the model, sampling captions for input
        feature vectors.
        At each timestep, we embed the current word, pass it and the previous hidden
        state to the RNN to get the next hidden state, use the hidden state to get
        scores for all vocab words, and choose the word with the highest score as
        the next word. The initial hidden state is computed by applying an affine
        transform to the input image features, and the initial word is the <START>
        token.
        
        
        Per-timestep outline:
        - embed the current word
        - pass the embedded word and the previous hidden state through the RNN
          to get the next hidden state
        - use that hidden state to compute scores for all vocabulary words
        - choose the highest-scoring (softmax) word as the next word
        The initial hidden state comes from fc_forward applied to the image
        features, and the initial word is the <START> token.
        For LSTMs you will also have to keep track of the cell state; in that case
        the initial cell state should be zero.
        Inputs:
        - features: Array of input image features of shape (N, D).
        - max_length: Maximum length T of generated captions.
        Returns:
        - captions: Array of shape (N, max_length) giving sampled captions,
          where each element is an integer in the range [0, V). The first element
          of captions should be the first sampled word, not the <START> token.
        """
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length), dtype=np.int32)

        # Unpack parameters
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        W_embed = self.params['W_embed']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        ###########################################################################
        # TODO: Implement test-time sampling for the model. You will need to      #
        # initialize the hidden state of the RNN by applying the learned affine   #
        # transform to the input image features. The first word that you feed to  #
        # the RNN should be the <START> token; its value is stored in the         #
        # variable self._start. At each timestep you will need to do:             #
        # (1) Embed the previous word using the learned word embeddings           #
        # (2) Make an RNN step using the previous hidden state and the embedded   #
        #     current word to get the next hidden state.                          #
        # (3) Apply the learned fc transformation to the next hidden state to     #
        #     get scores for all words in the vocabulary                          #
        # (4) Select the word with the highest score as the next word, writing it #
        #     to the appropriate slot in the captions variable                    #
        #                                                                         #
        # For simplicity, you do not need to stop generating after an <END> token #
        # is sampled, but you can if you want to.                                 #
        #                                                                         #
        # HINT: You will not be able to use the rnn_forward functions; you'll     #
        # need to call rnn_step_forward or lstm_step_forward in a loop.           #
        ###########################################################################

        hidden_state1, fc_cache = fc_forward(
            features, W_proj, b_proj)  # initial hidden state, shape (N, W_proj.shape[1])
        captions[:, 0] = self._start
        prev_h = hidden_state1.copy()
        # Feed one word at a time: embed the previous word, take an RNN step,
        # score the vocabulary, and keep the highest-scoring word.
        for t in range(1, max_length):
            word_vec, _ = word_embedding_forward(
                captions[:, t - 1], W_embed)  # embedding of the previous word
            next_h, cache_rnn = rnn_step_forward(word_vec, prev_h, Wx, Wh, b)
            prev_h = next_h.copy()
            scores_vocab, cache_vocab = fc_forward(next_h, W_vocab, b_vocab)
            arg = np.argmax(scores_vocab, 1)  # highest-scoring word for each sample
            captions[:, t] = arg

#        V, W = W_embed.shape
#        prev_h, _ = affine_forward(features, W_proj, b_proj) # using image features as h0
#        if self.cell_type == 'lstm':
#            prev_c = np.zeros_like(prev_h)
#        captions[:, 0] = self._start
#        for t in range(1, max_length):
#            onehots = np.eye(V)[captions[:, t-1]]    # [N, V] # pick the rows of eye(V) given by the previous words, i.e. their positions in the vocabulary; multiplying by W_embed then yields the embedded sequence
#            word_vectors = onehots.dot(W_embed)      # [N, W]
#            if self.cell_type == 'rnn':
#                next_h, cache = rnn_step_forward(word_vectors, prev_h, Wx, Wh, b)
#                prev_h = next_h
#            vocab_out, vocab_cache = affine_forward(next_h, W_vocab, b_vocab)
#            x = vocab_out.argmax(1)
#            captions[:, t] = x
#            #print(captions) #show the iteration change of caption vector
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################
        return captions
Example #13
    def loss(self, features, captions):
        """
        Compute training-time loss for the RNN. We input image features and
        ground-truth captions for those images, and use an RNN to compute
        loss and gradients on all parameters.
        Inputs:
        - features: Input image features, of shape (N, D)
        - captions: Ground-truth captions; an integer array of shape (N, T) where
          each element is in the range 0 <= y[i, t] < V
        Returns a tuple of:
        - loss: Scalar loss
        - grads: Dictionary of gradients parallel to self.params
        """
        # Cut captions into two pieces: captions_in has everything but the last word
        # and will be input to the RNN; captions_out has everything but the first
        # word and this is what we will expect the RNN to generate. These are offset
        # by one relative to each other because the RNN should produce word (t+1)
        # after receiving word t. The first element of captions_in will be the START
        # token, and the first element of captions_out will be the first word.
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]

        # You'll need this
        mask = (captions_out != self._null)

        # Weight and bias for the affine transform from image features to initial
        # hidden state
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

        # Word embedding matrix
        W_embed = self.params['W_embed']

        # Input-to-hidden, hidden-to-hidden, and biases for the RNN
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

        # Weight and bias for the hidden-to-vocab transformation.
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        loss, grads = 0.0, {}

        # Forward Pass
        fc, cache1 = layers.fc_forward(features, W_proj, b_proj)
        emb, cache2 = rnn_layers.word_embedding_forward(captions_in, W_embed)
        emb = emb.transpose(1, 0, 2)
        rnn, cache3 = rnn_layers.rnn_forward(emb, fc, Wx, Wh, b)
        rnn = rnn.transpose(1, 0, 2)
        tfc, cache4 = rnn_layers.temporal_fc_forward(rnn, W_vocab, b_vocab)
        loss, dout = rnn_layers.temporal_softmax_loss(tfc, captions_out, mask)

        # Gradients
        dtfc, dW_vocab, db_vocab = rnn_layers.temporal_fc_backward(
            dout, cache4)
        dtfc = dtfc.transpose(1, 0, 2)
        drnn, dfc, dWx, dWh, db = rnn_layers.rnn_backward(dtfc, cache3)
        drnn = drnn.transpose(1, 0, 2)
        dW_embed = rnn_layers.word_embedding_backward(drnn, cache2)
        dfeature, dW_proj, db_proj = layers.fc_backward(dfc, cache1)

        grads = {
            'W_embed': dW_embed,
            'W_proj': dW_proj,
            'W_vocab': dW_vocab,
            'Wh': dWh,
            'Wx': dWx,
            'b': db,
            'b_proj': db_proj,
            'b_vocab': db_vocab
        }
        return loss, grads
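The loss in the forward pass above comes from temporal_softmax_loss, which averages a per-timestep cross-entropy over the batch while ignoring <NULL> positions via the mask. A standard sketch (an assumption about rnn_layers):

import numpy as np

def temporal_softmax_loss(x, y, mask):
    # x: (N, T, V) scores, y: (N, T) target word indices, mask: (N, T) booleans
    N, T, V = x.shape
    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - x_flat.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N

    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]
    return loss, dx_flat.reshape(N, T, V)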
Example #14
    def loss(self, features, captions):
        """
        Compute training-time loss for the RNN. We input image features and
        ground-truth captions for those images, and use an RNN to compute
        loss and gradients on all parameters.
        Inputs:
        - features: Input image features, of shape (N, D)
        - captions: Ground-truth captions; an integer array of shape (N, T) where
          each element is in the range 0 <= y[i, t] < V
        Returns a tuple of:
        - loss: Scalar loss
        - grads: Dictionary of gradients parallel to self.params
        """
        # Cut captions into two pieces: captions_in has everything but the last word
        # and will be input to the RNN; captions_out has everything but the first
        # word and this is what we will expect the RNN to generate. These are offset
        # by one relative to each other because the RNN should produce word (t+1)
        # after receiving word t. The first element of captions_in will be the START
        # token, and the first element of captions_out will be the first word.
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]

        # You'll need this
        mask = (captions_out != self._null)

        # Weight and bias for the affine transform from image features to initial
        # hidden state
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

        # Word embedding matrix
        W_embed = self.params['W_embed']

        # Input-to-hidden, hidden-to-hidden, and biases for the RNN
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

        # Weight and bias for the hidden-to-vocab transformation.
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        loss, grads = 0.0, {}
        batch_size, input_dim = features.shape
        _, n_time_steps = captions_in.shape
        wordvec_dim = Wx.shape[0]
        hidden_dim = Wh.shape[0]
        vocab_size = W_vocab.shape[1]
        ############################################################################
        # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
        # In the forward pass you will need to do the following:                   #
        # (1) Use an fc transformation to compute the initial hidden state         #
        #     from the image features. This should produce an array of shape (N, H)#
        # (2) Use a word embedding layer to transform the words in captions_in     #
        #     from indices to vectors, giving an array of shape (N, T, W).         #
        # (3) Use a vanilla RNN to process the sequence of input word vectors      #
        #     and produce hidden state vectors for all timesteps, producing        #
        #     an array of shape (T, N, H).                                         #
        # (4) Use a (temporal) fc transformation to compute scores over the        #
        #     vocabulary at every timestep using the hidden states, giving an      #
        #     array of shape (N, T, V).                                            #
        # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
        #     the points where the output word is <NULL> using the mask above.     #
        #                                                                          #
        # In the backward pass you will need to compute the gradient of the loss   #
        # with respect to all model parameters. Use the loss and grads variables   #
        # defined above to store loss and gradients; grads[k] should give the      #
        # gradients for self.params[k].                                            #
        ############################################################################

        # Forward Pass
        # N x T x D
        # (1) compute the initial hidden state (N, H)
        h0, cache_h0 = fc_forward(features, W_proj, b_proj)

        # (2) transform the words in captions_in to vectors (N, T, W)
        x, cache_emb = word_embedding_forward(captions_in, W_embed)
        x_trans = np.transpose(x, (1, 0, 2))
        # (3) produce hidden state vectors for all timesteps (N, T, H)
        h_trans, cache_h = rnn_forward(x_trans, h0, Wx, Wh, b)
        h = np.transpose(h_trans, (1, 0, 2))
        # (4) compute scores over the vocabulary (N, T, V)
        out, cache_out = temporal_fc_forward(h, W_vocab, b_vocab)

        # (5) compute softmax loss using captions_out
        loss, dout = temporal_softmax_loss(out, captions_out, mask)

        # Gradients####################################
        # (6) backprop for (4)
        dout = dout.reshape(-1, vocab_size)  # (N x T, V)
        dh, grads['W_vocab'], grads['b_vocab'] = temporal_fc_backward(
            dout, cache_out)
        dh = np.transpose(dh, (1, 0, 2))
        # (7) backprop for (3)
        dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(
            dh, cache_h)
        dx = np.transpose(dx, (1, 0, 2))
        # (8) backprop for (2)
        grads['W_embed'] = word_embedding_backward(dx, cache_emb)

        # (9) backprop for (1)
        _, grads['W_proj'], grads['b_proj'] = fc_backward(dh0, cache_h0)

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################

        return loss, grads
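The captioning examples transpose the embedded words to time-major order before calling rnn_forward and transpose the hidden states back afterwards, which suggests an interface that takes (T, N, W) and returns (T, N, H). A minimal sketch built on the rnn_step_forward step assumed earlier:

import numpy as np

def rnn_forward(x, h0, Wx, Wh, b):
    # x: (T, N, W) time-major inputs, h0: (N, H) initial hidden state
    T, N, _ = x.shape
    H = h0.shape[1]
    h = np.zeros((T, N, H))
    prev_h = h0
    step_caches = []
    for t in range(T):
        prev_h, step_cache = rnn_step_forward(x[t], prev_h, Wx, Wh, b)
        h[t] = prev_h
        step_caches.append(step_cache)
    cache = (x, h0, step_caches)
    return h, cache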