def _step(sess, data, train_op, model, keep_prob):
    """
    Run one optimisation step on a freshly sampled training minibatch.

    Each caption is split into input tokens (everything but the last
    position) and target tokens (everything but the first). Target
    positions equal to ``model_config._null`` are masked out.

    Returns the value fetched for ``model['total_loss']``.
    """
    captions, features, _ = sample_coco_minibatch(
        data, batch_size=model_config.batch_size, split='train')

    inputs = captions[:, :-1]
    targets = captions[:, 1:]

    feed = {
        model['image_feature']: features,
        model['input_seqs']: inputs,
        model['target_seqs']: targets,
        # Only non-null target positions take part in the loss
        model['input_mask']: (targets != model_config._null),
        model['keep_prob']: keep_prob,
    }
    fetched = sess.run([train_op, model['total_loss']], feed_dict=feed)
    return fetched[1]
def main():
    """
    Overfit a vanilla RNN captioner on a 50-example COCO subset.

    Trains the model, writes the loss curve to ``loss_rnn.png``, then samples
    two captions from each of the train and val splits and writes every
    annotated image to ``<split>_rnn_<i>.png``.
    """
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset
    # The data should be in the same folder as the code

    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    # Save BEFORE show(): once show() returns the figure may already have
    # been torn down, and savefig() would then write an empty canvas.
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # some images might be deprecated. You may rerun the code several times
        # to successfully get the sample images from url.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(
                zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            # Same ordering as above: write the file first, then display.
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
def train_val_step(self, data, batch_size, optimizer, train_mode=True):
    """
    Run one training or validation step on a sampled minibatch.

    data: COCO data dictionary passed through to sample_coco_minibatch
    batch_size: number of examples to sample
    optimizer: optimizer whose gradients are cleared and, in training mode,
        stepped
    train_mode: if true, sample from the 'train' split, back-propagate and
        update parameters; otherwise sample from 'val' and only evaluate

    Returns the loss produced by self.loss for this minibatch.
    """
    optimizer.zero_grad()

    split = 'train' if train_mode else 'val'
    captions, features, _ = sample_coco_minibatch(data,
                                                  batch_size=batch_size,
                                                  split=split)

    captions = torch.LongTensor(captions).to(self.device)
    features = torch.from_numpy(features).to(self.device)

    # Teacher forcing: feed tokens [0, T-1) and predict tokens [1, T)
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Only record the autograd graph when it will actually be used;
    # validation steps would otherwise keep it alive through the returned loss.
    with torch.set_grad_enabled(bool(train_mode)):
        Y_hat = self.forward(features, captions_in)
        loss = self.loss(Y_hat, captions_out)

    if train_mode:
        loss.backward()
        optimizer.step()
    return loss
def _step(self):
    """Apply one parameter update computed from a random training minibatch."""
    # Draw the minibatch; the image URLs are not needed for the update
    captions, features, _ = sample_coco_minibatch(self.data,
                                                  batch_size=self.batch_size,
                                                  split='train')

    # Forward/backward pass; keep the loss for later inspection
    loss, grads = self.model.loss(features, captions)
    self.loss_history.append(loss)

    # Update every parameter with its own optimizer state
    for name, weight in self.model.params.items():
        updated_weight, updated_state = self.update_rule(
            weight, grads[name], self.optim_configs[name])
        self.model.params[name] = updated_weight
        self.optim_configs[name] = updated_state
def _step_test(sess, data, batch_size, model, keep_prob):
    """
    Generate captions step by step for one validation minibatch.

    No parameters are updated. The recurrent state is initialised from the
    image features, decoding is seeded with the first token of each
    ground-truth caption, and each step's predictions are fed back in as the
    next step's input for ``model_config.padded_length`` steps.

    Returns:
        (final_preds, urls): ``final_preds`` is a list with one entry per
        step, each the step's predictions reshaped to a single column;
        ``urls`` comes straight from the sampled minibatch.
    """
    captions, features, urls = sample_coco_minibatch(data,
                                                     batch_size=batch_size,
                                                     split='val')

    # Seed decoding with the first token of every ground-truth caption
    current_pred = captions[:, 0].reshape(-1, 1)

    # Only the first position is marked valid: one token is fed per step
    mask = np.zeros((batch_size, model_config.padded_length))
    mask[:, 0] = 1

    # get initial state using image feature
    state = sess.run(model['initial_state'],
                     feed_dict={
                         model['image_feature']: features,
                         model['keep_prob']: keep_prob
                     })

    # start to generate sentences
    final_preds = []
    for _ in range(model_config.padded_length):
        feed_dict = {
            model['input_seqs']: current_pred,
            model['initial_state']: state,
            model['input_mask']: mask,
            model['keep_prob']: keep_prob
        }
        current_pred, state = sess.run(
            [model['preds'], model['final_state']], feed_dict=feed_dict)
        current_pred = current_pred.reshape(-1, 1)
        final_preds.append(current_pred)

    return final_preds, urls
def _step(self):
    """
    Perform one gradient update on a random training minibatch.

    Invoked by train(); not meant to be called directly.
    """
    # Sample training data (the URLs are unused here)
    captions, features, _urls = coco_utils.sample_coco_minibatch(
        self.data, batch_size=self.batch_size, split='train')

    # Loss and gradients for this minibatch
    loss, grads = self.model.loss(features, captions)
    self.loss_history.append(loss)

    # Step each parameter using its own optimizer configuration
    for key, value in self.model.params.items():
        gradient = grads[key]
        state = self.optim_configs[key]
        result = self.update_rule(value, gradient, state)
        self.model.params[key], self.optim_configs[key] = result
def getAnnotatedImage(self, data, split):
    '''Sample one image and return it with a caption string that holds the
    split name, the beam-decoded caption and the ground-truth caption.'''
    captions, features, urls = sample_coco_minibatch(data,
                                                     batch_size=1,
                                                     split=split)

    # Decode the ground truth, then the beam-search output for the same features
    gt_captions = decode_captions(captions, data['idx_to_word'])
    _, decoded_ids = self.beam_decode(features)
    sample_captions = [decode_captions(decoded_ids, data['idx_to_word'])]

    for gt_caption, sample_caption, url in zip(gt_captions, sample_captions,
                                               urls):
        img = np.asarray(image_from_url(url))
        try:
            # Move the last axis to the front (H, W, C -> C, H, W)
            img = np.transpose(img, (2, 0, 1))
        except ValueError:
            # Not a 3-axis image: substitute random noise of a fixed size
            img = np.random.rand(3, 256, 256)
        caption = '%s \n %s \n GT:%s' % (split, sample_caption, gt_caption)

    return img, caption
def evaluate_model(model, data):
    """
    model: CaptioningRNN model
    Prints the unigram BLEU score averaged over a 1000-example minibatch
    drawn from each of the train and val splits.
    """
    BLEUscores = {}
    for split in ['train', 'val']:
        gt_ids, features, urls = sample_coco_minibatch(data,
                                                       split=split,
                                                       batch_size=1000)
        references = decode_captions(gt_ids, data['idx_to_word'])
        sampled_ids = model.sample(features)
        hypotheses = decode_captions(sampled_ids, data['idx_to_word'])

        # Accumulate left to right starting from 0.0, one score per example
        total_score = sum(
            (BLEU_score(reference, hypothesis)
             for reference, hypothesis, _ in zip(references, hypotheses, urls)),
            0.0)
        BLEUscores[split] = total_score / len(hypotheses)

    for split, score in BLEUscores.items():
        print('Average BLEU score for %s: %f' % (split, score))
lr_decay=0.995, verbose=True, print_every=10, ) small_lstm_solver.train() # Plot the training losses plt.plot(small_lstm_solver.loss_history) plt.xlabel('Iteration') plt.ylabel('Loss') plt.title('Training loss history') plt.show() #LSTM test-time sampling for split in ['train', 'val']: minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2) gt_captions, features, urls = minibatch gt_captions = decode_captions(gt_captions, data['idx_to_word']) sample_captions = small_lstm_model.sample(features) sample_captions = decode_captions(sample_captions, data['idx_to_word']) for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls): plt.imshow(image_from_url(url)) plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption)) plt.axis('off') plt.show() #train a good model sdata = load_coco_data(max_train=10000)
return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y)))) data = load_coco_data(pca_features=True) # Print out all the keys and values from the data dictionary for k, v in data.items(): if type(v) == np.ndarray: print(k, type(v), v.shape, v.dtype) else: print(k, type(v), len(v)) # Sample a minibatch and show the images and captions batch_size = 3 captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size) for i, (caption, url) in enumerate(zip(captions, urls)): plt.imshow(image_from_url(url)) plt.axis('off') caption_str = decode_captions(caption, data['idx_to_word']) plt.title(caption_str) plt.show() """ This file defines layer types that are commonly used for recurrent neural networks. """ def rnn_step_forward(x, prev_h, Wx, Wh, b): """ Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
def main():
    """
    Overfit a vanilla RNN and then an LSTM captioner on 50 COCO examples.

    For each model: train it, plot the loss history, then show two sampled
    captions next to the ground truth for the train and val splits.
    """
    # The dataset can be downloaded in https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # The dataset contains the feature of images in MSCOCO dataset

    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    def build_solver(model, lr_decay):
        # Both experiments share every solver setting except the decay rate
        return CaptioningSolver(
            model,
            small_data,
            update_rule='adam',
            num_epochs=50,
            batch_size=25,
            optim_config={
                'learning_rate': 5e-3,
            },
            lr_decay=lr_decay,
            verbose=True,
            print_every=10,
        )

    def plot_losses(solver):
        # Plot the training losses
        plt.plot(solver.loss_history)
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.title('Training loss history')
        plt.show()

    def show_samples(model):
        # Display sampled vs. ground-truth captions for a few images per split
        for split in ['train', 'val']:
            gt_ids, features, urls = sample_coco_minibatch(small_data,
                                                           split=split,
                                                           batch_size=2)
            gt_captions = decode_captions(gt_ids, small_data['idx_to_word'])
            sampled_ids = model.sample(features)
            sample_captions = decode_captions(sampled_ids,
                                              small_data['idx_to_word'])

            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                plt.imshow(image_from_url(url))
                plt.title('%s\n%s\nGT:%s' %
                          (split, sample_caption, gt_caption))
                plt.axis('off')
                plt.show()

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )
    small_rnn_solver = build_solver(small_rnn_model, 0.95)
    small_rnn_solver.train()
    plot_losses(small_rnn_solver)
    show_samples(small_rnn_model)

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    small_lstm_solver = build_solver(small_lstm_model, 0.995)
    small_lstm_solver.train()
    plot_losses(small_lstm_solver)
    show_samples(small_lstm_model)
plt.plot(small_rnn_solver.loss_history) plt.xlabel('Iteration') plt.ylabel('Loss') plt.title('Training loss history') plt.show() for k, v in data.iteritems(): if type(v) == np.ndarray: print k, type(v), v.shape, v.dtype else: print k, type(v), len(v) # Look at the data batch_size = 3 captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size) for i, (caption, url) in enumerate(zip(captions, urls)): plt.imshow(image_from_url(url)) plt.axis('off') caption_str = decode_captions(caption, data['idx_to_word']) plt.title(caption_str) plt.show() #Testing time sampling for split in ['train', 'val']: minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2) gt_captions, features, urls = minibatch gt_captions = decode_captions(gt_captions, data['idx_to_word']) sample_captions = small_rnn_model.sample(features)
def evaluate_model(model, med_data, idx_to_word, batch_size=1000, beam_size=None):
    """
    Print and return unigram BLEU scores for generated captions.

    model: CaptioningRNN model
    med_data: COCO data dictionary; its 'idx_to_word' entry is used to decode
        both ground-truth and generated captions
    idx_to_word: accepted for interface compatibility; not read by this
        function (decoding uses med_data['idx_to_word'])
    batch_size: number of examples scored per split
    beam_size: None to caption a whole minibatch with model.sample();
        otherwise the beam width passed to model.beam_decode(), which is
        called on one sampled example at a time

    Prints the average BLEU score for the train and val splits and returns
    the val average.
    """
    vocab = med_data['idx_to_word']
    BLEUscores = {}

    for split in ['train', 'val']:
        if beam_size is None:
            # no beam search: caption the whole minibatch in one call
            gt_captions, features, _ = sample_coco_minibatch(
                med_data, split=split, batch_size=batch_size)
            gt_captions = decode_captions(gt_captions, vocab)
            sample_captions = model.sample(features)
            sample_captions = decode_captions(sample_captions, vocab)
        else:
            # with beam search: decode one sampled example at a time
            gt_captions = []
            sample_captions = []
            for _ in range(batch_size):
                gt_caption, features, _ = sample_coco_minibatch(
                    med_data, split=split, batch_size=1)
                gt_caption = decode_captions(gt_caption, vocab)
                _, sample_caption = model.beam_decode(features,
                                                      beam_size=beam_size)
                sample_caption = decode_captions(sample_caption, vocab)
                # NOTE(review): if decode_captions returns a list for a batch
                # of one, str() scores the list's repr rather than the bare
                # caption -- confirm this is intended.
                sample_captions.append(str(sample_caption))
                gt_captions.append(str(gt_caption))

        # Average over the number of generated captions
        total_score = 0.0
        for gt_caption, sample_caption in zip(gt_captions, sample_captions):
            total_score += BLEU_score(gt_caption, sample_caption)
        BLEUscores[split] = total_score / len(sample_captions)

    for split in BLEUscores:
        print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))

    return BLEUscores['val']
def train(self):
    """
    Train model and print out some useful information(loss, generated captions) for debugging.

    Builds the training graph and the sampler graph, runs self.n_epochs
    epochs of minibatch updates, appends losses to self.loss_history,
    periodically prints generated captions and saves checkpoints under
    self.model_path, and finally plots sampled captions with their
    attention maps.

    NOTE(review): this block uses Python 2 print statements. Its nesting was
    reconstructed from a whitespace-collapsed source -- confirm which
    statements belong inside the epoch / iteration loops.
    """
    n_examples = self.data['train_captions'].shape[0]
    n_iters_per_epoch = n_examples // self.batch_size

    # get data
    # (these two locals are not referenced again below; minibatches are
    # drawn through sample_coco_minibatch instead)
    features = self.data['train_features']
    captions = self.data['train_captions']

    # build train model graph
    loss, generated_captions = self.model.build_model()
    optimizer = self.optimizer(self.learning_rate).minimize(loss)

    # build test model graph
    alphas, sampled_captions = self.model.build_sampler(
    )  # (N, max_len, L), (N, max_len)

    print "num epochs: %d" % self.n_epochs
    print "iterations per epoch: %d" % n_iters_per_epoch
    print "data size: %d" % n_examples
    print "batch size: %d" % self.batch_size

    # The session is created here and is not closed within this method
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()
    saver = tf.train.Saver(max_to_keep=10)

    for e in range(self.n_epochs):
        # print initial loss
        if e == 0:
            captions_batch, features_batch, _ = sample_coco_minibatch(
                self.data, self.batch_size, split='train')
            feed_dict = {
                self.model.features: features_batch,
                self.model.captions: captions_batch
            }
            gen_caps, l = sess.run([generated_captions, loss], feed_dict)
            self.loss_history.append(l)

            print ""
            print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
            print "Initial Train Loss: %.5f" % l
            decoded = decode_captions(gen_caps, self.model.idx_to_word)
            # indexes the first three decoded captions
            for j in range(3):
                print "Generated Caption: %s" % decoded[j]
            print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
            print ""

        # actual training step
        for i in range(n_iters_per_epoch):
            captions_batch, features_batch, _ = sample_coco_minibatch(
                self.data, self.batch_size, split='train')
            feed_dict = {
                self.model.features: features_batch,
                self.model.captions: captions_batch
            }
            sess.run(optimizer, feed_dict)

        # save loss history
        # (a separate run of `loss` on the most recent feed_dict, i.e. it is
        # evaluated after that minibatch's update has been applied)
        l = sess.run(loss, feed_dict)
        self.loss_history.append(l)

        # print info
        if (e + 1) % self.print_every == 0:
            gen_caps = sess.run(generated_captions, feed_dict)
            print ""
            print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
            print "Train Loss at Epoch %d: %.5f" % (e + 1, l)
            decoded = decode_captions(gen_caps, self.model.idx_to_word)
            for j in range(3):
                print "Generated Caption: %s" % decoded[j]
            print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
            print ""

        # save model
        if (e + 1) % self.save_every == 0:
            saver.save(sess,
                       os.path.join(self.model_path, 'model'),
                       global_step=e + 1)
            print "model-%s saved." % (e + 1)

    # actual test step: sample captions and visualize attention
    # (the minibatch here is drawn from the 'train' split)
    _, features_batch, image_files = sample_coco_minibatch(self.data,
                                                           self.batch_size,
                                                           split='train')
    feed_dict = {self.model.features: features_batch}
    alps, sam_cap = sess.run([alphas, sampled_captions],
                             feed_dict)  # (N, max_len, L), (N, max_len)

    # decode captions
    decoded = decode_captions(sam_cap, self.model.idx_to_word)

    # visualize 10 images and captions
    # (indexes entries 0..9 of the sampled batch -- presumably requires
    # self.batch_size >= 10; TODO confirm)
    for n in range(10):
        print "Sampled Caption: %s" % decoded[n]

        # plot original image
        img_path = os.path.join(self.image_path, image_files[n])
        img = ndimage.imread(img_path)
        plt.subplot(4, 5, 1)
        plt.imshow(img)
        plt.axis('off')

        # plot image with attention weights
        words = decoded[n].split(" ")
        for t in range(len(words)):
            # the 4x5 grid has 20 cells and cell 1 holds the original image,
            # so at most 19 words (t = 0..18) are drawn
            if t > 18:
                break
            plt.subplot(4, 5, t + 2)
            plt.text(0,
                     1,
                     words[t],
                     color='black',
                     backgroundcolor='white',
                     fontsize=12)
            plt.imshow(img)
            # each step's attention vector is viewed as a 14x14 map and
            # upsampled by a factor of 16 before being overlaid
            alp_curr = alps[n, t, :].reshape(14, 14)
            alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                       upscale=16,
                                                       sigma=20)
            plt.imshow(alp_img, alpha=0.8)
            plt.axis('off')
        plt.show()