Example #1
    def sample(self, features, max_length=30):
        """
        Run a test-time forward pass for the model, sampling captions for input
        feature vectors.

        At each timestep, we embed the current word, pass it and the previous hidden
        state to the RNN to get the next hidden state, use the hidden state to get
        scores for all vocab words, and choose the word with the highest score as
        the next word. The initial hidden state is computed by applying an affine
        transform to the input image features, and the initial word is the <START>
        token.

        For LSTMs you will also have to keep track of the cell state; in that case
        the initial cell state should be zero.

        Inputs:
        - features: Array of input image features of shape (N, D).
        - max_length: Maximum length T of generated captions.

        Returns:
        - captions: Array of shape (N, max_length) giving sampled captions,
          where each element is an integer in the range [0, V). The first element
          of captions should be the first sampled word, not the <START> token.
        """
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length), dtype=np.int32)

        # Unpack parameters
        W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
        W_embed = self.params['W_embed']
        Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
        W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

        ###########################################################################
        # TODO: Implement test-time sampling for the model. You will need to      #
        # initialize the hidden state of the RNN by applying the learned affine   #
        # transform to the input image features. The first word that you feed to  #
        # the RNN should be the <START> token; its value is stored in the         #
        # variable self._start. At each timestep you will need to:                #
        # (1) Embed the previous word using the learned word embeddings           #
        # (2) Make an RNN step using the previous hidden state and the embedded   #
        #     current word to get the next hidden state.                          #
        # (3) Apply the learned affine transformation to the next hidden state to #
        #     get scores for all words in the vocabulary                          #
        # (4) Select the word with the highest score as the next word, writing it #
        #     to the appropriate slot in the captions variable                    #
        #                                                                         #
        # For simplicity, you do not need to stop generating after an <END> token #
        # is sampled, but you can if you want to.                                 #
        #                                                                         #
        # HINT: You will not be able to use the rnn_forward or lstm_forward       #
        # functions; you'll need to call rnn_step_forward or lstm_step_forward in #
        # a loop.                                                                 #
        ###########################################################################
        # Initialize the hidden state from the image features with the learned
        # affine transform; the first input word for every caption is <START>.
        prev_h = np.dot(features, W_proj) + b_proj
        prev_word = np.full(N, self._start, dtype=np.int32)

        if self.cell_type == 'rnn':
            for t in range(max_length):
                word_vec = W_embed[prev_word]                               # (1) embed the previous word
                next_h, _ = rnn_step_forward(word_vec, prev_h, Wx, Wh, b)   # (2) one RNN step
                scores = np.dot(next_h, W_vocab) + b_vocab                  # (3) vocabulary scores
                next_word = np.argmax(scores, axis=1)                       # (4) greedy choice
                captions[:, t] = next_word
                prev_h, prev_word = next_h, next_word
        elif self.cell_type == 'lstm':
            prev_c = np.zeros_like(prev_h)  # initial cell state is zero
            for t in range(max_length):
                word_vec = W_embed[prev_word]                                               # (1)
                next_h, next_c, _ = lstm_step_forward(word_vec, prev_h, prev_c, Wx, Wh, b)  # (2)
                scores = np.dot(next_h, W_vocab) + b_vocab                                  # (3)
                next_word = np.argmax(scores, axis=1)                                       # (4)
                captions[:, t] = next_word
                prev_h, prev_c, prev_word = next_h, next_c, next_word
        else:
            raise ValueError('Unknown cell_type "%s"' % self.cell_type)
        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################
        return captions
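        # Usage sketch (illustrative only, not part of the model code): given a trained
        # model instance and a hypothetical idx_to_word list mapping vocabulary indices
        # back to word strings, sampled captions can be decoded as:
        #
        #   feats = np.random.randn(2, model.params['W_proj'].shape[0])  # dummy (N, D) features
        #   caps = model.sample(feats, max_length=15)                    # (N, 15) word indices
        #   sentences = [' '.join(idx_to_word[i] for i in row) for row in caps]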
Example #2
# # Vanilla RNN: step forward
# Open the file `cs231n/rnn_layers.py`. This file implements the forward and backward passes for different types of layers that are commonly used in recurrent neural networks.
# 
# First implement the function `rnn_step_forward` which implements the forward pass for a single timestep of a vanilla recurrent neural network. After doing so run the following to check your implementation. You should see errors less than 1e-8.
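# 
# As a reference for what the step computes, here is a minimal sketch (one possible
# implementation, not necessarily the one to submit): the new hidden state is
# next_h = tanh(x.dot(Wx) + prev_h.dot(Wh) + b), and the cache holds whatever the
# backward pass will need.

# In[ ]:

def rnn_step_forward_sketch(x, prev_h, Wx, Wh, b):
    a = x.dot(Wx) + prev_h.dot(Wh) + b    # affine combination of input and previous hidden state
    next_h = np.tanh(a)                   # elementwise tanh nonlinearity
    cache = (x, prev_h, Wx, Wh, next_h)   # values reused by the backward pass
    return next_h, cache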

# In[ ]:

N, D, H = 3, 10, 4

x = np.linspace(-0.4, 0.7, num=N*D).reshape(N, D)
prev_h = np.linspace(-0.2, 0.5, num=N*H).reshape(N, H)
Wx = np.linspace(-0.1, 0.9, num=D*H).reshape(D, H)
Wh = np.linspace(-0.3, 0.7, num=H*H).reshape(H, H)
b = np.linspace(-0.2, 0.4, num=H)

next_h, _ = rnn_step_forward(x, prev_h, Wx, Wh, b)
expected_next_h = np.asarray([
  [-0.58172089, -0.50182032, -0.41232771, -0.31410098],
  [ 0.66854692,  0.79562378,  0.87755553,  0.92795967],
  [ 0.97934501,  0.99144213,  0.99646691,  0.99854353]])

print('next_h error: ', rel_error(expected_next_h, next_h))


# # Vanilla RNN: step backward
# In the file `cs231n/rnn_layers.py` implement the `rnn_step_backward` function. After doing so run the following to numerically gradient check your implementation. You should see errors less than `1e-8`.
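# 
# As a sketch of the backward math (assuming the cache layout from the forward sketch
# above): backprop through the tanh with dtanh = dnext_h * (1 - next_h**2), then through
# the two matrix products and the bias sum.

# In[ ]:

def rnn_step_backward_sketch(dnext_h, cache):
    x, prev_h, Wx, Wh, next_h = cache
    da = dnext_h * (1 - next_h ** 2)   # derivative of tanh(a) is 1 - tanh(a)**2
    dx = da.dot(Wx.T)                  # gradient w.r.t. the input at this timestep
    dprev_h = da.dot(Wh.T)             # gradient w.r.t. the previous hidden state
    dWx = x.T.dot(da)                  # gradient w.r.t. the input-to-hidden weights
    dWh = prev_h.T.dot(da)             # gradient w.r.t. the hidden-to-hidden weights
    db = da.sum(axis=0)                # gradient w.r.t. the bias
    return dx, dprev_h, dWx, dWh, db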

# In[ ]:

from cs231n.rnn_layers import rnn_step_forward, rnn_step_backward
np.random.seed(231)
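# Below is a sketch of the numerical gradient check for this cell. It assumes
# eval_numerical_gradient_array from cs231n.gradient_check, as used in the earlier
# parts of the assignment; the exact test sizes are illustrative.
from cs231n.gradient_check import eval_numerical_gradient_array

N, D, H = 4, 5, 6
x = np.random.randn(N, D)
h = np.random.randn(N, H)
Wx = np.random.randn(D, H)
Wh = np.random.randn(H, H)
b = np.random.randn(H)

out, cache = rnn_step_forward(x, h, Wx, Wh, b)
dnext_h = np.random.randn(*out.shape)

# Numerical gradients: re-run the forward pass with each argument perturbed in turn.
fx = lambda x: rnn_step_forward(x, h, Wx, Wh, b)[0]
fh = lambda h: rnn_step_forward(x, h, Wx, Wh, b)[0]
fWx = lambda Wx: rnn_step_forward(x, h, Wx, Wh, b)[0]
fWh = lambda Wh: rnn_step_forward(x, h, Wx, Wh, b)[0]
fb = lambda b: rnn_step_forward(x, h, Wx, Wh, b)[0]

dx_num = eval_numerical_gradient_array(fx, x, dnext_h)
dprev_h_num = eval_numerical_gradient_array(fh, h, dnext_h)
dWx_num = eval_numerical_gradient_array(fWx, Wx, dnext_h)
dWh_num = eval_numerical_gradient_array(fWh, Wh, dnext_h)
db_num = eval_numerical_gradient_array(fb, b, dnext_h)

# Analytic gradients from the implementation under test.
dx, dprev_h, dWx, dWh, db = rnn_step_backward(dnext_h, cache)

print('dx error: ', rel_error(dx_num, dx))
print('dprev_h error: ', rel_error(dprev_h_num, dprev_h))
print('dWx error: ', rel_error(dWx_num, dWx))
print('dWh error: ', rel_error(dWh_num, dWh))
print('db error: ', rel_error(db_num, db))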