def predict(self, X):
    N, T, D = X.shape
    h0 = np.zeros((N, self.hidden_dim))
    # Run the RNN over all T steps, then classify from the final hidden state.
    layer1, l1cache = rnn_layers.rnn_forward(X, h0, self.Wx, self.Wh,
                                             self.b, self.non_liniearity)
    final_layer = layer1[:, T - 1, :]
    layer2, _ = layers.dense_forward(final_layer, self.W1, self.b1)
    return np.argmax(layer2, axis=1)
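# For reference, a minimal sketch of the recurrence rnn_layers.rnn_forward is
# assumed to implement: h_t = tanh(x_t @ Wx + h_{t-1} @ Wh + b), applied for T
# steps. The helper names below are hypothetical, and tanh is an assumption
# (the real layer takes a configurable nonlinearity, as the call above shows).
import numpy as np

def rnn_step_sketch(x_t, h_prev, Wx, Wh, b):
    # x_t: (N, D), h_prev: (N, H), Wx: (D, H), Wh: (H, H), b: (H,)
    return np.tanh(x_t.dot(Wx) + h_prev.dot(Wh) + b)

def rnn_forward_sketch(x, h0, Wx, Wh, b):
    # x: (N, T, D); returns all hidden states stacked as (N, T, H).
    N, T, D = x.shape
    H = h0.shape[1]
    h = np.zeros((N, T, H))
    h_prev = h0
    for t in range(T):
        h_prev = rnn_step_sketch(x[:, t, :], h_prev, Wx, Wh, b)
        h[:, t, :] = h_prev
    return h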
def test_rnn_layer():
    # Numerically gradient-check rnn_forward/rnn_backward on random data.
    N, D, T, H = 2, 3, 10, 5

    x = np.random.randn(N, T, D)
    h0 = np.random.randn(N, H)
    Wx = np.random.randn(D, H)
    Wh = np.random.randn(H, H)
    b = np.random.randn(H)

    out, cache = rnn_layers.rnn_forward(x, h0, Wx, Wh, b)
    dout = np.random.randn(*out.shape)
    dx, dh0, dWx, dWh, db = rnn_layers.rnn_backward(dout, cache)

    fx = lambda x: rnn_layers.rnn_forward(x, h0, Wx, Wh, b)[0]
    fh0 = lambda h0: rnn_layers.rnn_forward(x, h0, Wx, Wh, b)[0]
    fWx = lambda Wx: rnn_layers.rnn_forward(x, h0, Wx, Wh, b)[0]
    fWh = lambda Wh: rnn_layers.rnn_forward(x, h0, Wx, Wh, b)[0]
    fb = lambda b: rnn_layers.rnn_forward(x, h0, Wx, Wh, b)[0]

    dx_num = eval_numerical_gradient_array(fx, x, dout)
    dh0_num = eval_numerical_gradient_array(fh0, h0, dout)
    dWx_num = eval_numerical_gradient_array(fWx, Wx, dout)
    dWh_num = eval_numerical_gradient_array(fWh, Wh, dout)
    db_num = eval_numerical_gradient_array(fb, b, dout)

    print('Testing rnn layers')
    print('dx error: ', rel_error(dx_num, dx))
    print('dh0 error: ', rel_error(dh0_num, dh0))
    print('dWx error: ', rel_error(dWx_num, dWx))
    print('dWh error: ', rel_error(dWh_num, dWh))
    print('db error: ', rel_error(db_num, db))
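# eval_numerical_gradient_array, used above, is assumed to compute a centered
# finite-difference approximation of d(sum(f(x) * df))/dx, perturbing one
# element of x at a time. A hypothetical minimal version, not the imported one:
import numpy as np

def eval_numerical_gradient_array_sketch(f, x, df, h=1e-5):
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h
        pos = f(x).copy()          # f evaluated with x[ix] nudged up
        x[ix] = old - h
        neg = f(x).copy()          # f evaluated with x[ix] nudged down
        x[ix] = old                # restore
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad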
dx_num = eval_numerical_gradient(
    lambda x: temporal_softmax_loss(x, y, mask)[0], x, verbose=False)
print('dx error: ', rel_error(dx, dx_num))

from rnn_layers import rnn_forward, rnn_backward

N, T, D, H = 2, 3, 4, 5
x = np.linspace(-0.1, 0.3, num=N * T * D).reshape(N, T, D)
h0 = np.linspace(-0.3, 0.1, num=N * H).reshape(N, H)
Wx = np.linspace(-0.2, 0.4, num=D * H).reshape(D, H)
Wh = np.linspace(-0.4, 0.1, num=H * H).reshape(H, H)
b = np.linspace(-0.7, 0.1, num=H)

h, _ = rnn_forward(x, h0, Wx, Wh, b)
expected_h = np.asarray([
    [[-0.42070749, -0.27279261, -0.11074945,  0.05740409,  0.22236251],
     [-0.39525808, -0.22554661, -0.0409454,   0.14649412,  0.32397316],
     [-0.42305111, -0.24223728, -0.04287027,  0.15997045,  0.35014525]],
    [[-0.55857474, -0.39065825, -0.19198182,  0.02378408,  0.23735671],
     [-0.27150199, -0.07088804,  0.13562939,  0.33099728,  0.50158768],
     [-0.51014825, -0.30524429, -0.06755202,  0.17806392,  0.40333043]]])
print('h error: ', rel_error(expected_h, h))

np.random.seed(231)
N, D, T, H = 2, 3, 10, 5
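# The temporal_softmax_loss being gradient-checked above is assumed to average
# cross-entropy over all (sample, timestep) pairs where mask is True, and to
# return the loss together with the gradient on the scores. A minimal NumPy
# sketch under that assumption, not the module's implementation:
import numpy as np

def temporal_softmax_loss_sketch(x, y, mask):
    # x: (N, T, V) scores, y: (N, T) int targets, mask: (N, T) bool
    N, T, V = x.shape
    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    m_flat = mask.reshape(N * T)

    shifted = x_flat - x_flat.max(axis=1, keepdims=True)   # numerical stability
    probs = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
    loss = -np.sum(m_flat * np.log(probs[np.arange(N * T), y_flat])) / N

    dx = probs.copy()
    dx[np.arange(N * T), y_flat] -= 1                      # softmax gradient
    dx = (dx * m_flat[:, None] / N).reshape(N, T, V)       # masked positions get 0
    return loss, dx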
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN to compute loss
    and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last
    # word and will be input to the RNN; captions_out has everything but the
    # first word and is what we will expect the RNN to generate. These are
    # offset by one relative to each other because the RNN should produce
    # word (t+1) after receiving word t. The first element of captions_in
    # will be the START token, and the first element of captions_out will be
    # the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # You'll need this
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to
    # initial hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # Forward pass
    fc, cache1 = layers.fc_forward(features, W_proj, b_proj)
    emb, cache2 = rnn_layers.word_embedding_forward(captions_in, W_embed)
    emb = emb.transpose(1, 0, 2)
    rnn, cache3 = rnn_layers.rnn_forward(emb, fc, Wx, Wh, b)
    rnn = rnn.transpose(1, 0, 2)
    tfc, cache4 = rnn_layers.temporal_fc_forward(rnn, W_vocab, b_vocab)
    loss, dout = rnn_layers.temporal_softmax_loss(tfc, captions_out, mask)

    # Gradients
    dtfc, dW_vocab, db_vocab = rnn_layers.temporal_fc_backward(dout, cache4)
    dtfc = dtfc.transpose(1, 0, 2)
    drnn, dfc, dWx, dWh, db = rnn_layers.rnn_backward(dtfc, cache3)
    drnn = drnn.transpose(1, 0, 2)
    dW_embed = rnn_layers.word_embedding_backward(drnn, cache2)
    dfeature, dW_proj, db_proj = layers.fc_backward(dfc, cache1)

    grads = {
        'W_embed': dW_embed,
        'W_proj': dW_proj,
        'W_vocab': dW_vocab,
        'Wh': dWh,
        'Wx': dWx,
        'b': db,
        'b_proj': db_proj,
        'b_vocab': db_vocab,
    }

    return loss, grads
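# word_embedding_forward/backward, called in loss() above, are assumed to be a
# plain row lookup into W_embed and its scatter-add transpose. Hypothetical
# minimal versions (the real cache contents may differ):
import numpy as np

def word_embedding_forward_sketch(captions, W_embed):
    # captions: (N, T) int word indices; returns (N, T, M) vectors plus a cache.
    return W_embed[captions], (captions, W_embed.shape)

def word_embedding_backward_sketch(dout, cache):
    captions, shape = cache
    dW = np.zeros(shape)
    # Scatter-add each upstream gradient into the row of its word index;
    # np.add.at handles repeated indices correctly where += would not.
    np.add.at(dW, captions, dout)
    return dW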
def train(self, X, y, learning_rate=1e-2, opt='sgd', n_iters=5000,
          batch_size=200, verbose=1):
    lr = learning_rate
    N, T, D = X.shape
    for i in range(n_iters):
        # Sample a minibatch and run the forward pass.
        ids = np.random.choice(X.shape[0], batch_size)
        h0 = np.zeros((batch_size, self.hidden_dim))
        layer1, l1cache = rnn_layers.rnn_forward(X[ids], h0, self.Wx,
                                                 self.Wh, self.b,
                                                 self.non_liniearity)
        final_layer = layer1[:, T - 1, :]
        layer2, l2cache = layers.dense_forward(final_layer, self.W1, self.b1)
        loss, l3cache = layers.softmax_loss_forward(layer2, y[ids])
        self.loss_history.append(loss)

        if verbose == 1 and i % 500 == 0:
            print('Iteration %d: loss %g' % (i, loss))

        # Backward pass: only the last time step receives an upstream
        # gradient, since the classifier reads the final hidden state.
        dlayer3 = 1.0
        dlayer2 = layers.softmax_loss_backward(dlayer3, l3cache)
        dlayer1, dW1, db1 = layers.dense_backward(dlayer2, l2cache)
        dh = np.zeros((batch_size, T, self.hidden_dim))
        dh[:, T - 1, :] = dlayer1
        _, _, dWx, dWh, db = rnn_layers.rnn_backward(dh, l1cache)

        # Parameter updates.
        self.params, self.Wx = optimizers.optimize(
            self.params, self.Wx, dWx, lr=lr, name='Wx', opt=opt)
        self.params, self.Wh = optimizers.optimize(
            self.params, self.Wh, dWh, lr=lr, name='Wh', opt=opt)
        self.params, self.b = optimizers.optimize(
            self.params, self.b, db, lr=lr, name='b', opt=opt)
        self.params, self.W1 = optimizers.optimize(
            self.params, self.W1, dW1, lr=lr, name='W1', opt=opt)
        self.params, self.b1 = optimizers.optimize(
            self.params, self.b1, db1, lr=lr, name='b1', opt=opt)
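# Judging from the call sites in train(), optimizers.optimize takes the shared
# state dict plus one parameter and its gradient and returns both updated; the
# name argument presumably keys any per-parameter optimizer state inside
# params. A minimal sketch of the 'sgd' case under that assumed convention
# (the internals here are guesses, not the module's code):
def optimize_sketch(params, w, dw, lr=1e-2, name=None, opt='sgd'):
    if opt == 'sgd':
        w = w - lr * dw
    # Stateful rules (e.g. momentum buffers stored in params under name)
    # would read and write params here before returning it.
    return params, w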
def build_model(self):
    """
    Placeholders:
    - features: input image features of shape (N, D)
    - captions: ground-truth captions; an integer array of shape (N, T)
      where each element is in the range [0, V)

    Returns:
    - logits: scores of shape (N, T, V)
    - loss: scalar loss
    """
    # some hyper-parameters
    T = self.T
    N = self.N
    V = self.V
    H = self.H
    M = self.M
    D = self.D

    # placeholders for features and captions
    features = self.features
    captions = self.captions

    # caption in, out and mask matrices; captions_in is the same as
    # captions[:, :-1], but TensorFlow doesn't support a negative stop
    # index in slices yet.
    captions_in = captions[:, :T]
    captions_out = captions[:, 1:]
    mask = tf.not_equal(captions_out, self._null)

    # word embedding matrix
    W_embed = self.params['W_embed']

    # parameters for (cnn_features)-to-(initial_hidden)
    W_proj = self.params['W_proj']
    b_proj = self.params['b_proj']

    # parameters for input-to-hidden, hidden-to-hidden
    Wx = self.params['Wx']
    Wh = self.params['Wh']
    b = self.params['b']

    # parameters for hidden-to-vocab
    W_vocab = self.params['W_vocab']
    b_vocab = self.params['b_vocab']

    # params used in some function calls
    rnn_param = {'n_time_step': T}
    param = {
        'batch_size': N,
        'n_time_step': T,
        'dim_hidden': H,
        'vocab_size': V,
    }

    # generate the initial hidden state from CNN features
    h0 = affine_forward(features, W_proj, b_proj)  # (N, H)

    # generate input x (word vectors)
    x = word_embedding_forward(captions_in, W_embed)  # (N, T, M)

    # rnn forward
    if self.cell_type == 'rnn':
        h = rnn_forward(x, h0, Wx, Wh, b, rnn_param)
    else:
        h = lstm_forward(x, h0, Wx, Wh, b, rnn_param)

    # hidden-to-vocab
    logits = temporal_affine_forward(h, W_vocab, b_vocab, param)  # (N, T, V)

    # softmax loss
    loss = temporal_softmax_loss(logits, captions_out, mask, param)

    return logits, loss
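# For the self.cell_type == 'lstm' branch above, lstm_forward is assumed to
# apply the standard LSTM cell at each step. A NumPy sketch of one step (the
# TF graph would express the same algebra on tensors); the gate stacking order
# [i, f, o, g] in the (D, 4H) / (H, 4H) weights is an assumption:
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_sketch(x_t, h_prev, c_prev, Wx, Wh, b):
    # x_t: (N, D), h_prev/c_prev: (N, H), Wx: (D, 4H), Wh: (H, 4H), b: (4H,)
    H = h_prev.shape[1]
    a = x_t.dot(Wx) + h_prev.dot(Wh) + b
    i = sigmoid(a[:, :H])          # input gate
    f = sigmoid(a[:, H:2 * H])     # forget gate
    o = sigmoid(a[:, 2 * H:3 * H]) # output gate
    g = np.tanh(a[:, 3 * H:])      # candidate cell update
    c_next = f * c_prev + i * g
    h_next = o * np.tanh(c_next)
    return h_next, c_next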