def check_loss(N, T, V, p):
    x = 0.001 * np.random.randn(N, T, V)
    y = np.random.randint(V, size=(N, T))
    mask = np.random.rand(N, T) <= p
    print(temporal_softmax_loss(x, y, mask)[0])

check_loss(100, 1, 10, 1.0)    # Should be about 2.3
check_loss(100, 10, 10, 1.0)   # Should be about 23
check_loss(5000, 10, 10, 0.1)  # Should be about 2.3

# Gradient check for temporal softmax loss
N, T, V = 7, 8, 9

x = np.random.randn(N, T, V)
y = np.random.randint(V, size=(N, T))
mask = (np.random.rand(N, T) > 0.5)

loss, dx = temporal_softmax_loss(x, y, mask, verbose=False)

dx_num = eval_numerical_gradient(lambda x: temporal_softmax_loss(x, y, mask)[0],
                                 x, verbose=False)

print('dx error: ', rel_error(dx, dx_num))


# # RNN for image captioning
# Now that you have implemented the necessary layers, you can combine them to build an image captioning model. Open the file `cs231n/classifiers/rnn.py` and look at the `CaptioningRNN` class.
#
# Implement the forward and backward pass of the model in the `loss` function. For now you only need to implement the case where `cell_type='rnn'` for vanilla RNNs; you will implement the LSTM case later. After doing so, run the following to check your forward pass using a small test case; you should see error less than `1e-10`.

# In[ ]:


N, D, W, H = 10, 20, 30, 40
word_to_idx = {'<NULL>': 0, 'cat': 2, 'dog': 3}
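# Aside: a rough back-of-the-envelope check of the values expected from
# `check_loss` above. With near-zero scores the softmax over V words is roughly
# uniform, so each unmasked timestep contributes about log(V) to the per-sample
# loss, and on average a fraction p of the T timesteps is kept by the mask.
# This snippet is just that arithmetic, not part of the assignment code:
import numpy as np

for (T, V, p) in [(1, 10, 1.0), (10, 10, 1.0), (10, 10, 0.1)]:
    print(p * T * np.log(V))   # ~2.3, ~23, ~2.3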
# Below is the `loss` method of the `CaptioningRNN` class from
# `cs231n/classifiers/rnn.py`, with the forward and backward passes filled in.

def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to compute
    loss and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last
    # word and will be input to the RNN; captions_out has everything but the
    # first word and this is what we will expect the RNN to generate. These are
    # offset by one relative to each other because the RNN should produce word
    # (t+1) after receiving word t. The first element of captions_in will be
    # the START token, and the first element of captions_out will be the first
    # word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # You'll need this
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    ############################################################################
    # TODO: Implement the forward and backward passes for the CaptioningRNN.
    # In the forward pass you will need to do the following:
    # (1) Use an affine transformation to compute the initial hidden state
    #     from the image features. This should produce an array of shape (N, H)
    # (2) Use a word embedding layer to transform the words in captions_in
    #     from indices to vectors, giving an array of shape (N, T, W).
    # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to
    #     process the sequence of input word vectors and produce hidden state
    #     vectors for all timesteps, producing an array of shape (N, T, H).
    # (4) Use a (temporal) affine transformation to compute scores over the
    #     vocabulary at every timestep using the hidden states, giving an
    #     array of shape (N, T, V).
    # (5) Use (temporal) softmax to compute loss using captions_out, ignoring
    #     the points where the output word is <NULL> using the mask above.
    #
    # In the backward pass you will need to compute the gradient of the loss
    # with respect to all model parameters. Use the loss and grads variables
    # defined above to store loss and gradients; grads[k] should give the
    # gradients for self.params[k].
    ############################################################################
    # Forward pass.
    h0 = np.dot(features, W_proj) + b_proj                                  # (1)
    word_vec, word_vec_cache = word_embedding_forward(captions_in, W_embed)  # (2)

    if self.cell_type == 'rnn':                                              # (3)
        h, h_cache = rnn_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
    elif self.cell_type == 'lstm':
        h, h_cache = lstm_forward(x=word_vec, h0=h0, Wx=Wx, Wh=Wh, b=b)
    else:
        raise ValueError('Unknown cell_type "%s"' % self.cell_type)

    out, out_cache = temporal_affine_forward(x=h, w=W_vocab, b=b_vocab)      # (4)
    loss, dx = temporal_softmax_loss(x=out, y=captions_out, mask=mask,
                                     verbose=False)                          # (5)

    # Backward pass: walk the forward layers in reverse order.
    dh, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dx, out_cache)
    if self.cell_type == 'rnn':
        dword_vec, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dh, h_cache)
    else:
        dword_vec, dh0, grads['Wx'], grads['Wh'], grads['b'] = lstm_backward(dh, h_cache)
    grads['W_embed'] = word_embedding_backward(dword_vec, word_vec_cache)
    # Backprop the initial affine projection from image features to h0 by hand.
    grads['W_proj'] = np.dot(features.T, dh0)
    grads['b_proj'] = np.sum(dh0, axis=0)
    ############################################################################
    #                             END OF YOUR CODE                            #
    ############################################################################

    return loss, grads
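# To make the captions_in / captions_out slicing in the method above concrete,
# here is a toy illustration with made-up token indices (0 plays the role of
# <NULL> and 1 the role of the START token; the other indices are arbitrary
# word ids). This is only a sketch of the slicing and masking, not part of the
# assignment code.
import numpy as np

captions = np.array([[1, 4, 5, 3, 0, 0]])  # START, three words, then padding
null_token = 0

captions_in = captions[:, :-1]       # [[1, 4, 5, 3, 0]]  fed to the RNN
captions_out = captions[:, 1:]       # [[4, 5, 3, 0, 0]]  expected outputs
mask = (captions_out != null_token)  # [[True, True, True, False, False]]

print(captions_in)
print(captions_out)
print(mask)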