def generate_image_index_to_reference_captions(base_dir="datasets/self_process"):
    data = load_coco_data(base_dir=base_dir, pca_features=False,
                          is_caption_separated=True)

    gts_train = {}
    for cap_idx, img_idx in enumerate(data['train_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_train:
            gts_train[img_idx] = []
        gts_train[img_idx].append({
            'caption': decode_captions(data['train_captions'][cap_idx][1:],
                                       data['idx_to_word'])
        })
    with open('train_img_idx_to_captions.json', 'w') as f:
        f.write(json.dumps(gts_train))

    gts_val = {}
    for cap_idx, img_idx in enumerate(data['val_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_val:
            gts_val[img_idx] = []
        gts_val[img_idx].append({
            'caption': decode_captions(data['val_captions'][cap_idx][1:],
                                       data['idx_to_word'])
        })
    with open('val_img_idx_to_captions.json', 'w') as f:
        f.write(json.dumps(gts_val))
# Variant of the function above that writes the JSON as ASCII-encoded bytes.
def generate_image_index_to_reference_captions():
    data = load_coco_data()

    gts_train = {}
    for cap_idx, img_idx in enumerate(data['train_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_train:
            gts_train[img_idx] = []
        gts_train[img_idx].append({
            'caption': decode_captions(data['train_captions'][cap_idx][1:],
                                       data['idx_to_word'])
        })
    with open('train_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_train).encode('ascii'))

    gts_val = {}
    for cap_idx, img_idx in enumerate(data['val_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_val:
            gts_val[img_idx] = []
        gts_val[img_idx].append({
            'caption': decode_captions(data['val_captions'][cap_idx][1:],
                                       data['idx_to_word'])
        })
    with open('val_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_val).encode('ascii'))
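# Both variants above produce the same mapping from image index to a list of
# reference-caption dicts, a shape that downstream evaluation code can consume.
# A quick sanity check of the output (the sample caption shown is illustrative):
import json

with open('train_img_idx_to_captions.json') as f:
    gts_train = json.load(f)

# Each image index maps to a list of reference captions, e.g.
# gts_train['0'] == [{'caption': 'a man riding a horse'}, ...]
some_idx = next(iter(gts_train))
print(some_idx, len(gts_train[some_idx]), gts_train[some_idx][0]['caption'])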
def main():
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # It contains the image features for the MSCOCO dataset and should be
    # placed in the same folder as the code.

    # Load COCO data from disk; this returns a dictionary.
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with a vanilla RNN.
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses. Save the figure before plt.show(), which can
    # leave an empty canvas behind on some backends.
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some image URLs might be stale; you may need to rerun the code
        # several times to successfully fetch the sample images.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])
        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])
        for i, (gt_caption, sample_caption, url) in enumerate(
                zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
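# Since some of the COCO image URLs go stale, a small retry wrapper around
# image_from_url can cut down on manual reruns. This helper is our addition
# (not part of coco_utils); a minimal sketch:
import time

def image_from_url_with_retry(url, retries=3, delay=1.0):
    """Fetch an image, retrying on transient network errors."""
    for attempt in range(retries):
        try:
            img = image_from_url(url)
            if img is not None:
                return img
        except Exception as e:
            print('Attempt %d failed for %s: %s' % (attempt + 1, url, e))
        time.sleep(delay)
    return None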
def main(_):
    # Load the dictionary.
    data = {}
    with open(FLAGS.dict_file, 'r') as f:
        dict_data = json.load(f)
    for k, v in dict_data.items():
        data[k] = v
    data['idx_to_word'] = {int(k): v for k, v in data['idx_to_word'].items()}

    # Extract features for all test images.
    features, all_image_names = extract_features(FLAGS.test_dir)

    # Build the TensorFlow graph and run inference.
    g = tf.Graph()
    with g.as_default():
        num_of_images = len(os.listdir(FLAGS.test_dir))
        print("Inferencing on {} images".format(num_of_images))

        # Build the model.
        model = build_model(model_config, mode, inference_batch=1)

        # Initialize the beam search caption generator.
        generator = CaptionGenerator(
            model, data['word_to_idx'],
            max_caption_length=model_config.padded_length - 1)

        # Run inference.
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            model['saver'].restore(sess, FLAGS.saved_sess)
            print("Model restored! Last step run: ",
                  sess.run(model['global_step']))

            # Predictions.
            final_preds = run_inference(sess, features, generator, 1.0)
            captions_pred = [unpack.reshape(-1, 1) for unpack in final_preds]
            # captions_pred = np.concatenate(captions_pred, 1)
            captions_deco = []
            for cap in captions_pred:
                dec = decode_captions(cap.reshape(-1, 1), data['idx_to_word'])
                dec = ' '.join(dec)
                captions_deco.append(dec)

            # Save the images with the captions written on them.
            if not os.path.exists(FLAGS.results_dir):
                os.makedirs(FLAGS.results_dir)
            for j in range(len(captions_deco)):
                this_image_name = all_image_names['file_name'].values[j]
                img_name = os.path.join(FLAGS.results_dir, this_image_name)
                img = imread(os.path.join(FLAGS.test_dir, this_image_name))
                write_text_on_image(img, img_name, captions_deco[j])

    print("\ndone.")
def getAnnotatedImage(self, data, split):
    '''Samples an image and returns it with its GT and generated captions.'''
    minibatch = sample_coco_minibatch(data, batch_size=1, split=split)
    captions, features, urls = minibatch

    # Sample some captions given the image features.
    gt_captions = decode_captions(captions, data['idx_to_word'])
    _, captions_out = self.beam_decode(features)
    # captions_out = self.sample(features)
    sample_captions = []
    sample_captions.append(decode_captions(captions_out, data['idx_to_word']))

    for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
        img = image_from_url(url)
        img = np.asarray(img)
        try:
            # Convert HWC to CHW layout.
            img = np.swapaxes(img, 0, 2).transpose(0, 2, 1)
        except ValueError:
            # Fall back to a random image if the download failed.
            img = np.random.rand(3, 256, 256)
        caption = ('%s \n %s \n GT:%s' % (split, sample_caption, gt_caption))
        return img, caption
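# The axis shuffle above converts an HWC image to CHW layout. The two-step
# form is equivalent to a single transpose, as this quick check shows:
import numpy as np

img = np.random.rand(256, 320, 3)                  # H, W, C
chw = np.swapaxes(img, 0, 2).transpose(0, 2, 1)    # C, H, W
assert chw.shape == (3, 256, 320)
assert np.array_equal(chw, img.transpose(2, 0, 1))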
def evaluate_model(model, data):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.
    """
    BLEUscores = {}
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(data, split=split, batch_size=1000)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])
        sample_captions = model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        total_score = 0.0
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            total_score += BLEU_score(gt_caption, sample_caption)
        BLEUscores[split] = total_score / len(sample_captions)

    for split in BLEUscores:
        print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
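# BLEU_score is assumed here to compute clipped unigram precision between one
# reference and one hypothesis. A minimal sketch of what such a helper might
# look like (the exact behavior is our assumption; real evaluations should use
# a library such as nltk.translate.bleu_score):
from collections import Counter

def BLEU_score(gt_caption, sample_caption):
    """Clipped unigram precision for whitespace-tokenized caption strings."""
    ref_counts = Counter(gt_caption.split())
    hyp_counts = Counter(sample_caption.split())
    total = sum(hyp_counts.values())
    if total == 0:
        return 0.0
    # Each hypothesis token counts at most as often as it appears in the reference.
    matches = sum(min(count, ref_counts[tok]) for tok, count in hyp_counts.items())
    return matches / float(total)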
)
small_lstm_solver.train()

# Plot the training losses.
plt.plot(small_lstm_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

# LSTM test-time sampling.
for split in ['train', 'val']:
    minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
    gt_captions, features, urls = minibatch
    gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
    sample_captions = small_lstm_model.sample(features)
    sample_captions = decode_captions(sample_captions, small_data['idx_to_word'])
    for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
        plt.imshow(image_from_url(url))
        plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
        plt.axis('off')
        plt.show()

# Train a good model.
sdata = load_coco_data(max_train=10000)
lstm_model = CaptioningRNN(
# Print out all the keys and values from the data dictionary.
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

# Sample a minibatch and show the images and captions.
batch_size = 3
captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
    caption_str = decode_captions(caption, data['idx_to_word'])
    plt.title(caption_str)
    plt.show()


"""
This file defines layer types that are commonly used for recurrent neural
networks.
"""


def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a
    tanh activation function.

    The input data has dimension D, the hidden state has dimension H, and we
    use a minibatch size of N.
    """
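# The body of rnn_step_forward is not shown above; below is a minimal sketch of
# the standard tanh recurrence the docstring describes (returning a cache for
# the backward pass is our assumption about the surrounding API):
import numpy as np

def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """Single vanilla-RNN step: next_h = tanh(x.Wx + prev_h.Wh + b).

    x: (N, D) input; prev_h: (N, H) hidden state;
    Wx: (D, H); Wh: (H, H); b: (H,).
    """
    next_h = np.tanh(x.dot(Wx) + prev_h.dot(Wh) + b)
    cache = (x, prev_h, Wx, Wh, next_h)  # saved for the backward pass
    return next_h, cache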
def main():
    # The dataset can be downloaded from
    # https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # It contains the image features for the MSCOCO dataset.

    # Load COCO data from disk; this returns a dictionary.
    small_data = load_coco_data(max_train=50)

    # Experiment with a vanilla RNN.
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )
    small_rnn_solver.train()

    # Plot the training losses.
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions, small_data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    ############################################################################
    # Experiment with an LSTM.
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )

    small_lstm_solver = CaptioningSolver(
        small_lstm_model, small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={'learning_rate': 5e-3},
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )
    small_lstm_solver.train()

    # Plot the training losses.
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])
        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions, small_data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
def main(_):
    # Load data.
    data = load_coco_data(FLAGS.data_dir)

    # Force padded_length to the caption length minus one:
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it.
    g = tf.Graph()
    with g.as_default():
        # Build the model. If FLAGS.glove_vocab is empty we do not initialize
        # the word embeddings; otherwise we initialize them with GloVe vectors.
        if FLAGS.glove_vocab == '':
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config, mode=mode, glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # Initialize all variables.
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            num_epochs = training_config.total_num_epochs
            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train / model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for book-keeping.
            epoch = 0
            best_val_acc = 0
            best_params = {}
            loss_history = []
            train_acc_history = []
            val_acc_history = []

            print("\n\nTotal training iter: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):
                # Run each training step.
                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob)
                loss_history.append(total_loss_value)

                # Print out the training loss.
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    print('(Iteration %d / %d) loss: %f, and time elapsed: %.2f minutes' %
                          (t + 1, num_iterations, float(loss_history[-1]),
                           (datetime.now() - time_now).seconds / 60.0))

                # Print out some image sample results.
                if FLAGS.sample_every > 0 and (t + 1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}//'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    # The output is size (32, 16).
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model, 1.0)
                    captions_pred = [unpack.reshape(-1, 1)
                                     for unpack in captions_pred]
                    captions_pred = np.concatenate(captions_pred, 1)
                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])
                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir, 'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # Save the model continuously to avoid interruption.
                if FLAGS.saveModel_every > 0 and (t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    checkpoint_name = savedModelName[:-5] + \
                        '_checkpoint{}.ckpt'.format(t + 1)
                    save_path = model['saver'].save(
                        sess, os.path.join(FLAGS.savedSession_dir, checkpoint_name))

            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            save_path = model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
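# With staircase=True, tf.train.exponential_decay keeps the learning rate
# piecewise constant, multiplying it by the decay factor once every
# decay_steps steps. A small NumPy sketch of the schedule (the numbers here
# are illustrative, not taken from training_config):
import numpy as np

initial_lr = 2e-3
decay_factor = 0.5
decay_steps = 1000

steps = np.array([0, 500, 1000, 2500, 5000])
# staircase: lr = initial_lr * decay_factor ** floor(step / decay_steps)
lrs = initial_lr * decay_factor ** np.floor(steps / decay_steps)
print(dict(zip(steps.tolist(), lrs.tolist())))
# {0: 0.002, 500: 0.002, 1000: 0.001, 2500: 0.0005, 5000: 6.25e-05}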
with g.as_default():
    # Build the model.
    model = build_model(model_config, mode, inference_batch=BATCH_SIZE_INFERENCE)

    # Run inference.
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        model['saver'].restore(sess, directory + "savedSession/model0.ckpt")
        print("Model restored! Last step run: ", sess.run(model['global_step']))

        for i in range(TOTAL_INFERENCE_STEP):
            # The output is size (32, 16).
            captions_pred, urls = _step_test(
                sess, data, BATCH_SIZE_INFERENCE, model, 1.0)
            captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
            captions_pred = np.concatenate(captions_pred, 1)
            captions_deco = decode_captions(captions_pred, data['idx_to_word'])

            for j in range(len(captions_deco)):
                img_name = directory + 'image_' + str(j) + '.jpg'
                img = image_from_url(urls[j])
                write_text_on_image(img, img_name, captions_deco[j])
def create_annotations(features, image_names, data, num_processes, saved_sess,
                       beam_size=3, voting_scheme="range", num_winners=1,
                       normalise_votes=False):
    # Build the model.
    model = build_model(model_config, mode, inference_batch=1)

    # Initialize the beam search caption generator.
    generator = CaptionGenerator(
        model, data['word_to_idx'],
        max_caption_length=model_config.padded_length - 1,
        beam_size=beam_size)

    # Run inference.
    init = tf.global_variables_initializer()
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=1.0 / (2 * num_processes))
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # with tf.Session() as sess:
        sess.run(init)
        model['saver'].restore(sess, saved_sess)

        # Predictions.
        beam_preds = run_inference(sess, features, generator)
        annotations = []
        for j, beam_captions in enumerate(beam_preds):
            beam_dec = []
            total_prob = 0
            for caption in beam_captions:
                sentence = ' '.join(
                    decode_captions(caption.sentence, data['idx_to_word']))
                prob = np.exp(caption.score)
                beam_dec.append({'caption': sentence, 'prob': prob})
                total_prob += prob
            # Debug: total probability mass of the beam.
            print(total_prob)

            voted_captions = rrv_captions_from_beam(
                beam_captions,
                num_winners=num_winners,
                normalise_votes=normalise_votes)
            voted_dec = []
            for voted_caption in voted_captions:
                vote_dec = decode_captions(voted_caption, data['idx_to_word'])
                vote_dec = ' '.join(vote_dec)
                voted_dec.append(vote_dec)

            image_name = image_names[j]
            annotation = {
                'image_id': extract_image_id(image_name),
                'captions': {
                    'beam': beam_dec,
                    'voted': voted_dec
                }
            }
            annotations.append(annotation)

    print("Created annotations for {} images".format(len(features)))
    return annotations
def train_model(model, config, data):
    # g = tf.Graph()
    # with g.as_default():

    ################ Define the optimizer. ################
    num_batches = config.total_instances / config.batch_size
    decay_steps = int(num_batches * config.num_epochs_per_decay)
    learning_rate = tf.constant(config.initial_learning_rate)
    learning_rate_decay_fn = None

    def _decay_fn(learning_rate, global_step):
        return tf.train.exponential_decay(learning_rate,
                                          global_step,
                                          decay_steps=decay_steps,
                                          decay_rate=0.5,
                                          staircase=True)

    learning_rate_decay_fn = _decay_fn

    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer='SGD',
        clip_gradients=config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)
    ########################################################

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    # Work around BLAS memory-dump failures by growing GPU memory on demand.
    config_ = tf.ConfigProto()
    config_.gpu_options.allow_growth = True

    with tf.Session(config=config_) as sess:
        sess.run(init)

        # If a checkpoint exists, restore it.
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpoints/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Successfully restored the checkpoint")

        rand_int = np.random.randint(1, 100)
        caption_in, caption_out, mask, image_features, urls = minibatch(
            data, rand_int, config.batch_size, config.total_instances)

        if not os.path.exists('test_caption'):
            os.makedirs('test_caption')
        # The output is size (32, 16).
        captions_pred = _run_validation(sess, caption_in, image_features,
                                        config.batch_size, model, config.input_len)
        captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        captions_pred = np.concatenate(captions_pred, 1)
        captions_deco = decode_captions(captions_pred, data['idx_to_word'])
        for j in range(len(captions_deco)):
            img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
            img = image_from_url(urls[j])
            write_text_on_image(img, img_name, captions_deco[j])
        print("saved predicted images into ./test_caption folder")

        # The commented-out block below is the original training loop
        # (100 epochs); uncomment it to train.
        # total_runs = int((config.total_instances / config.batch_size) * config.num_epochs)
        # initial_step = model.global_step.eval()
        # # Initialize the summary writer.
        # tf.summary.scalar("learning_rate", learning_rate)
        # a = tf.summary.merge_all()
        # writer = tf.summary.FileWriter('./graphs/singlelayer_lstm', sess.graph)
        # time_now = datetime.now()
        # for t in range(total_runs):
        #     caption_in, caption_out, mask, image_features, urls = minibatch(
        #         data, t, config.batch_size, config.total_instances)
        #     # Feed the data.
        #     feed_dict = {model.image_feature: image_features,
        #                  model.caption_in: caption_in,
        #                  model.caption_out: caption_out,
        #                  model.caption_mask: mask}
        #     merge_op, _, total_loss, b = sess.run(
        #         [model.summary_op, train_op, model.total_loss, a],
        #         feed_dict=feed_dict)
        #     writer.add_summary(merge_op, global_step=t)
        #     writer.add_summary(b, global_step=t)
        #     # Print loss info.
        #     if (t + 1) % 20 == 0:
        #         print('(Iteration %d / %d) loss: %f, and time elapsed: %.2f minutes' % (
        #             t + 1, total_runs, float(total_loss),
        #             (datetime.now() - time_now).seconds / 60.0))
        #     # Print images.
        #     if (t + 1) % 100 == 0:
        #         if not os.path.exists('test_caption'):
        #             os.makedirs('test_caption')
        #         captions_pred = _run_validation(sess, caption_in, image_features,
        #                                         1, model, config.input_len)
        #         captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        #         captions_pred = np.concatenate(captions_pred, 1)
        #         captions_deco = decode_captions(captions_pred, data['idx_to_word'])
        #         for j in range(len(captions_deco)):
        #             img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
        #             img = image_from_url(urls[j])
        #             write_text_on_image(img, img_name, captions_deco[j])
        #     # Save the model.
        #     if (t + 1) % 50 == 0 or t == (total_runs - 1):
        #         if not os.path.exists('checkpoints/singlelayer_lstm'):
        #             os.makedirs('checkpoints/singlelayer_lstm')
        #         saver.save(sess, 'checkpoints/singlelayer_lstm', t)

        # Code to visualize the embedding matrix.
        final_embed_matrix = sess.run(model.embed_map)
        # It has to be a tf.Variable; constants don't work here, and you
        # can't reuse model.embed_matrix.
        embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')

        # Add the embedding to the config file.
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        # Link this tensor to its metadata file, in this case the first 500
        # words of the vocabulary.
        # metadata_path = './processed/metadata.tsv'
        # if not os.path.exists(metadata_path):
        #     f = open(metadata_path, "w")
        #     f.close()
        embedding.metadata_path = os.path.join('processed', 'metadata.tsv')

        # Save a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)
def evaluate_model(model, med_data, idx_to_word, batch_size=1000, beam_size=None):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.
    """
    BLEUscores = {}
    if beam_size is None:
        # No beam search.
        for split in ['train', 'val']:
            minibatch = sample_coco_minibatch(med_data, split=split,
                                              batch_size=batch_size)
            gt_captions, features, urls = minibatch
            gt_captions = decode_captions(gt_captions, med_data['idx_to_word'])
            sample_captions = model.sample(features)
            sample_captions = decode_captions(sample_captions,
                                              med_data['idx_to_word'])
            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)
            BLEUscores[split] = total_score / len(sample_captions)
        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
    else:
        # With beam search.
        for split in ['train', 'val']:
            sample_captions = []  # empty list for the sampled captions
            gt_captions = []      # empty list for the GT captions
            urls = []
            for batch in range(batch_size):
                # Sample only one example at a time.
                minibatch = sample_coco_minibatch(med_data, split=split,
                                                  batch_size=1)
                gt_caption, features, url = minibatch
                gt_caption = decode_captions(gt_caption, med_data['idx_to_word'])
                _, sample_caption = model.beam_decode(features,
                                                      beam_size=beam_size)
                sample_caption = decode_captions(sample_caption,
                                                 med_data['idx_to_word'])
                sample_captions.append(str(sample_caption))
                gt_captions.append(str(gt_caption))
                urls.append(url)
            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)
            # Divide by the number of sampled captions.
            BLEUscores[split] = total_score / len(sample_captions)
        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
    return BLEUscores['val']
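# Because evaluate_model returns the validation BLEU, it is easy to sweep beam
# sizes. A hypothetical usage sketch (`model` and `med_data` as above; a small
# batch_size keeps the beam-search branch fast):
for bs in [None, 1, 3, 5]:
    val_bleu = evaluate_model(model, med_data, med_data['idx_to_word'],
                              batch_size=100, beam_size=bs)
    print('beam_size=%s -> val BLEU: %.4f' % (bs, val_bleu))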
def train(self):
    """
    Train the model and print out some useful information (loss, generated
    captions) for debugging.
    """
    n_examples = self.data['train_captions'].shape[0]
    n_iters_per_epoch = n_examples // self.batch_size

    # Get the data.
    features = self.data['train_features']
    captions = self.data['train_captions']

    # Build the train-model graph.
    loss, generated_captions = self.model.build_model()
    optimizer = self.optimizer(self.learning_rate).minimize(loss)

    # Build the test-model graph: (N, max_len, L), (N, max_len).
    alphas, sampled_captions = self.model.build_sampler()

    print("num epochs: %d" % self.n_epochs)
    print("iterations per epoch: %d" % n_iters_per_epoch)
    print("data size: %d" % n_examples)
    print("batch size: %d" % self.batch_size)

    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()
    saver = tf.train.Saver(max_to_keep=10)

    divider = '*-' * 58 + '*'
    for e in range(self.n_epochs):
        # Print the initial loss.
        if e == 0:
            captions_batch, features_batch, _ = sample_coco_minibatch(
                self.data, self.batch_size, split='train')
            feed_dict = {self.model.features: features_batch,
                         self.model.captions: captions_batch}
            gen_caps, l = sess.run([generated_captions, loss], feed_dict)
            self.loss_history.append(l)
            print("")
            print(divider)
            print("Initial Train Loss: %.5f" % l)
            decoded = decode_captions(gen_caps, self.model.idx_to_word)
            for j in range(3):
                print("Generated Caption: %s" % decoded[j])
            print(divider)
            print("")

        # The actual training step.
        for i in range(n_iters_per_epoch):
            captions_batch, features_batch, _ = sample_coco_minibatch(
                self.data, self.batch_size, split='train')
            feed_dict = {self.model.features: features_batch,
                         self.model.captions: captions_batch}
            sess.run(optimizer, feed_dict)
            # Save the loss history.
            l = sess.run(loss, feed_dict)
            self.loss_history.append(l)

        # Print info.
        if (e + 1) % self.print_every == 0:
            gen_caps = sess.run(generated_captions, feed_dict)
            print("")
            print(divider)
            print("Train Loss at Epoch %d: %.5f" % (e + 1, l))
            decoded = decode_captions(gen_caps, self.model.idx_to_word)
            for j in range(3):
                print("Generated Caption: %s" % decoded[j])
            print(divider)
            print("")

        # Save the model.
        if (e + 1) % self.save_every == 0:
            saver.save(sess, os.path.join(self.model_path, 'model'),
                       global_step=e + 1)
            print("model-%s saved." % (e + 1))

    # The actual test step: sample captions and visualize attention.
    _, features_batch, image_files = sample_coco_minibatch(
        self.data, self.batch_size, split='train')
    feed_dict = {self.model.features: features_batch}
    # alps: (N, max_len, L), sam_cap: (N, max_len)
    alps, sam_cap = sess.run([alphas, sampled_captions], feed_dict)

    # Decode the captions.
    decoded = decode_captions(sam_cap, self.model.idx_to_word)

    # Visualize 10 images and captions.
    for n in range(10):
        print("Sampled Caption: %s" % decoded[n])

        # Plot the original image.
        img_path = os.path.join(self.image_path, image_files[n])
        img = ndimage.imread(img_path)
        plt.subplot(4, 5, 1)
        plt.imshow(img)
        plt.axis('off')

        # Plot the image with attention weights overlaid.
        words = decoded[n].split(" ")
        for t in range(len(words)):
            if t > 18:
                break
            plt.subplot(4, 5, t + 2)
            plt.text(0, 1, words[t], color='black',
                     backgroundcolor='white', fontsize=12)
            plt.imshow(img)
            # Upsample the 14x14 attention map to the image resolution.
            alp_curr = alps[n, t, :].reshape(14, 14)
            alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                       upscale=16, sigma=20)
            plt.imshow(alp_img, alpha=0.8)
            plt.axis('off')
        plt.show()