def generate_image_index_to_reference_captions(base_dir="datasets/self_process"):
  data = load_coco_data(base_dir=base_dir, pca_features=False, is_caption_separated=True)

  gts_train = {}
  for cap_idx, img_idx in enumerate(data['train_image_idxs']):
    img_idx = str(img_idx)
    if img_idx not in gts_train:
      gts_train[img_idx] = []

    gts_train[img_idx].append({'caption': decode_captions(data['train_captions'][cap_idx][1:], data['idx_to_word'])})

  with open('train_img_idx_to_captions.json', 'w') as f:
    f.write(json.dumps(gts_train))


  gts_val = {}
  for cap_idx, img_idx in enumerate(data['val_image_idxs']):
    img_idx = str(img_idx)
    if img_idx not in gts_val:
      gts_val[img_idx] = []

    gts_val[img_idx].append({'caption': decode_captions(data['val_captions'][cap_idx][1:], data['idx_to_word'])})

  with open('val_img_idx_to_captions.json', 'w') as f:
    f.write(json.dumps(gts_val))
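# A minimal usage sketch (assumption: generate_image_index_to_reference_captions
# above has already been run, so the JSON files exist). Each file maps a
# stringified image index to a list of {'caption': ...} reference dicts.
import json

with open('train_img_idx_to_captions.json') as f:
    gts_train = json.load(f)
some_idx = next(iter(gts_train))
print(some_idx, [ref['caption'] for ref in gts_train[some_idx]])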
Example #2
def generate_image_index_to_reference_captions():
    data = load_coco_data()

    gts_train = {}
    for cap_idx, img_idx in enumerate(data['train_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_train:
            gts_train[img_idx] = []

        gts_train[img_idx].append({
            'caption':
            decode_captions(data['train_captions'][cap_idx][1:],
                            data['idx_to_word'])
        })

    with open('train_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_train).encode('ascii'))

    gts_val = {}
    for cap_idx, img_idx in enumerate(data['val_image_idxs']):
        img_idx = str(img_idx)
        if img_idx not in gts_val:
            gts_val[img_idx] = []

        gts_val[img_idx].append({
            'caption':
            decode_captions(data['val_captions'][cap_idx][1:],
                            data['idx_to_word'])
        })

    with open('val_img_idx_to_captions.json', 'wb') as f:
        f.write(json.dumps(gts_val).encode('ascii'))
Example #3
def main():
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # The dataset contains the features of images from the MSCOCO dataset
    # The data should be in the same folder as the code
    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
          cell_type='rnn',
          word_to_idx=small_data['word_to_idx'],
          input_dim=small_data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(small_rnn_model, small_data,
           update_rule='adam',
           num_epochs=50,
           batch_size=25,
           optim_config={
             'learning_rate': 5e-3,
           },
           lr_decay=0.95,
           verbose=True, print_every=10,
         )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some COCO image URLs may be deprecated; you may need to rerun the code
        # several times to fetch the sample images (see the retry sketch after this function).
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
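# Hedged sketch: a small retry helper around image_from_url for the deprecated or
# flaky COCO URLs mentioned in the loop above. The name image_from_url_with_retry
# and its parameters are illustrative (not part of the original utilities); it
# assumes image_from_url returns None or raises on failure.
import time

def image_from_url_with_retry(url, retries=3, delay=1.0):
    for _ in range(retries):
        try:
            img = image_from_url(url)
            if img is not None:
                return img
        except Exception:
            pass
        time.sleep(delay)
    return None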
def main(_):
    
    # load dictionary 
    data = {}
    with open(FLAGS.dict_file, 'r') as f:
        dict_data = json.load(f)
        for k, v in dict_data.items():
            data[k] = v
    data['idx_to_word'] = {int(k):v for k, v in data['idx_to_word'].items()}

    # extract all features 
    features, all_image_names = extract_features(FLAGS.test_dir)
    
    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():
        num_of_images = len(os.listdir(FLAGS.test_dir))
        print("Inferencing on {} images".format(num_of_images))
        
        # Build the model.
        model = build_model(model_config, mode, inference_batch=1)
        
        # Initialize beam search Caption Generator 
        generator = CaptionGenerator(model, data['word_to_idx'],
                                     max_caption_length=model_config.padded_length - 1)
        
        # run inference
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
        
            sess.run(init)
        
            model['saver'].restore(sess, FLAGS.saved_sess)
              
            print("Model restored! Last step run: ", sess.run(model['global_step']))
            
            # predictions 
            final_preds = run_inference(sess, features, generator, 1.0)
            captions_pred = [unpack.reshape(-1, 1) for unpack in final_preds]
            #captions_pred = np.concatenate(captions_pred, 1)
            captions_deco = []
            for cap in captions_pred:
                dec = decode_captions(cap.reshape(-1, 1), data['idx_to_word'])
                dec = ' '.join(dec)
                captions_deco.append(dec)
            
            # save the images with captions written on them
            if not os.path.exists(FLAGS.results_dir):
                os.makedirs(FLAGS.results_dir)
            for j in range(len(captions_deco)):
                this_image_name = all_image_names['file_name'].values[j]
                img_name = os.path.join(FLAGS.results_dir, this_image_name)
                img = imread(os.path.join(FLAGS.test_dir, this_image_name))
                write_text_on_image(img, img_name, captions_deco[j])
    print("\ndone.")
    def getAnnotatedImage(self, data, split):
        '''Samples an image and returns it with the ground-truth and a generated caption.'''
        minibatch = sample_coco_minibatch(data, batch_size=1, split=split)
        captions, features, urls = minibatch
        # sample some captions given image features
        gt_captions = decode_captions(captions, data['idx_to_word'])
        _, captions_out = self.beam_decode(features)
        #captions_out = self.sample(features)
        sample_captions = []
        sample_captions.append(
            decode_captions(captions_out, data['idx_to_word']))
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            img = image_from_url(url)
            img = np.asarray(img)
            try:
                img = np.swapaxes(img, 0, 2).transpose(0, 2, 1)
            except ValueError:
                img = np.random.rand(3, 256, 256)
            caption = ('%s \n %s \n GT:%s' %
                       (split, sample_caption, gt_caption))

        return img, caption
def evaluate_model(model, data):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.
    """
    BLEUscores = {}
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(data, split=split, batch_size=1000)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        total_score = 0.0
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            total_score += BLEU_score(gt_caption, sample_caption)

        BLEUscores[split] = total_score / len(sample_captions)

    for split in BLEUscores:
        print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
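# Hedged sketch of a unigram BLEU-style score, in case the BLEU_score helper used
# above is not available locally. This is just clipped unigram precision with a
# brevity penalty, not full corpus BLEU; the function name is illustrative.
import math
from collections import Counter

def unigram_bleu(reference, candidate):
    ref_tokens = reference.split()
    cand_tokens = candidate.split()
    if not cand_tokens or not ref_tokens:
        return 0.0
    ref_counts = Counter(ref_tokens)
    # clip each candidate word count by its count in the reference
    clipped = sum(min(count, ref_counts[word])
                  for word, count in Counter(cand_tokens).items())
    precision = clipped / len(cand_tokens)
    # brevity penalty for candidates shorter than the reference
    bp = 1.0 if len(cand_tokens) >= len(ref_tokens) else \
        math.exp(1.0 - len(ref_tokens) / len(cand_tokens))
    return bp * precision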
Example #7
         )

small_lstm_solver.train()

# Plot the training losses
plt.plot(small_lstm_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

#LSTM test-time sampling
for split in ['train', 'val']:
  minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
  gt_captions, features, urls = minibatch
  gt_captions = decode_captions(gt_captions, data['idx_to_word'])

  sample_captions = small_lstm_model.sample(features)
  sample_captions = decode_captions(sample_captions, data['idx_to_word'])

  for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
    plt.imshow(image_from_url(url))
    plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
    plt.axis('off')
    plt.show()

#train a good model

sdata = load_coco_data(max_train=10000)

lstm_model = CaptioningRNN(
Example #8
# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

# Sample a minibatch and show the images and captions
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
    caption_str = decode_captions(caption, data['idx_to_word'])
    plt.title(caption_str)
    plt.show()
"""
This file defines layer types that are commonly used for recurrent neural
networks.
"""


def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
    activation function.

    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N.
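# Hedged sketch of the single-timestep forward pass described in the docstring
# above (the snippet is cut off here). next_h has shape (N, H); the cache layout
# is an assumption for a matching backward pass, not the original implementation.
import numpy as np

def rnn_step_forward_sketch(x, prev_h, Wx, Wh, b):
    # affine transform of input and previous hidden state, squashed by tanh
    next_h = np.tanh(x.dot(Wx) + prev_h.dot(Wh) + b)
    cache = (x, prev_h, Wx, Wh, next_h)
    return next_h, cache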
Example #9
def main():
    # The dataset can be downloaded from https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # The dataset contains the features of images from the MSCOCO dataset
    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()
    # Plot the training losses
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
Example #10
def main(_):
    # load data
    data = load_coco_data(FLAGS.data_dir)

    # optionally force padded_length to the caption length minus one:
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():

        # Build the model. If FLAGS.glove_vocab is empty, the model is not
        # initialized with word vectors; otherwise it is initialized with GloVe vectors.
        if FLAGS.glove_vocab == '':
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config,
                                mode=mode,
                                glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # initialize all variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            num_epochs = training_config.total_num_epochs

            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train / model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for book-keeping
            epoch = 0
            best_val_acc = 0
            best_params = {}
            loss_history = []
            train_acc_history = []
            val_acc_history = []

            print("\n\nTotal training iter: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):

                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob
                                         )  # run each training step

                loss_history.append(total_loss_value)

                # Print out training loss
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    print(
                        '(Iteration %d / %d) loss: %f, and time elapsed: %.2f minutes'
                        % (t + 1, num_iterations, float(loss_history[-1]),
                           (datetime.now() - time_now).seconds / 60.0))

                # Print out some image sample results
                if FLAGS.sample_every > 0 and (t +
                                               1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}//'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model,
                        1.0)  # the output is size (32, 16)
                    captions_pred = [
                        unpack.reshape(-1, 1) for unpack in captions_pred
                    ]
                    captions_pred = np.concatenate(captions_pred, 1)

                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])

                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir,
                                                'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # save the model continuously to avoid interruption
                if FLAGS.saveModel_every > 0 and (
                        t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    checkpoint_name = savedModelName[:-5] + '_checkpoint{}.ckpt'.format(t + 1)
                    save_path = model['saver'].save(
                        sess,
                        os.path.join(FLAGS.savedSession_dir, checkpoint_name))

            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            save_path = model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
Example #11
with g.as_default():
    # Build the model.
    model = build_model(model_config,
                        mode,
                        inference_batch=BATCH_SIZE_INFERENCE)

    # run training
    init = tf.global_variables_initializer()
    with tf.Session() as sess:

        sess.run(init)

        model['saver'].restore(sess, directory + "savedSession/model0.ckpt")

        print("Model restured! Last step run: ",
              sess.run(model['global_step']))

        for i in range(TOTAL_INFERENCE_STEP):
            captions_pred, urls = _step_test(
                sess, data, BATCH_SIZE_INFERENCE, model,
                1.0)  # the output is size (32, 16)
            captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
            captions_pred = np.concatenate(captions_pred, 1)

            captions_deco = decode_captions(captions_pred, data['idx_to_word'])

            for j in range(len(captions_deco)):
                img_name = directory + 'image_' + str(j) + '.jpg'
                img = image_from_url(urls[j])
                write_text_on_image(img, img_name, captions_deco[j])
Example #12
def create_annotations(features,
                       image_names,
                       data,
                       num_processes,
                       saved_sess,
                       beam_size=3,
                       voting_scheme="range",
                       num_winners=1,
                       normalise_votes=False):
    # Build the model.
    model = build_model(model_config, mode, inference_batch=1)

    # Initialize beam search Caption Generator
    generator = CaptionGenerator(
        model,
        data['word_to_idx'],
        max_caption_length=model_config.padded_length - 1,
        beam_size=beam_size)
    # run inference
    init = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1 /
                                (2 * num_processes))

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        #with tf.Session() as sess:
        sess.run(init)
        model['saver'].restore(sess, saved_sess)

        # predictions
        beam_preds = run_inference(sess, features, generator)

        annotations = []

        for j, beam_captions in enumerate(beam_preds):
            beam_dec = []
            total_prob = 0
            for caption in beam_captions:
                sentence = ' '.join(
                    decode_captions(caption.sentence, data['idx_to_word']))
                prob = np.exp(caption.score)
                beam_dec.append({'caption': sentence, 'prob': prob})
                total_prob += prob
            print(total_prob)

            voted_captions = rrv_captions_from_beam(
                beam_captions,
                num_winners=num_winners,
                normalise_votes=normalise_votes)

            voted_dec = []
            for voted_caption in voted_captions:
                vote_dec = decode_captions(voted_caption, data['idx_to_word'])
                vote_dec = ' '.join(vote_dec)
                voted_dec.append(vote_dec)

            image_name = image_names[j]

            annotation = {
                'image_id': extract_image_id(image_name),
                'captions': {
                    'beam': beam_dec,
                    'voted': voted_dec
                }
            }
            annotations.append(annotation)

        print("Created annotations for {} images".format(len(features)))
        return annotations
def train_model(model, config, data):

    #g = tf.Graph()
    #with g.as_default():
    ################define optimizer########
    num_batches = config.total_instances / config.batch_size
    decay_steps = int(num_batches * config.num_epochs_per_decay)
    learning_rate = tf.constant(config.initial_learning_rate)

    learning_rate_decay_fn = None

    def _decay_fn(learning_rate, global_step):
        return tf.train.exponential_decay(learning_rate,
                                          global_step,
                                          decay_steps=decay_steps,
                                          decay_rate=0.5,
                                          staircase=True)

    learning_rate_decay_fn = _decay_fn
    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer='SGD',
        clip_gradients=config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)

    ##################
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    # allow GPU memory growth to avoid BLAS memory allocation failures
    config_ = tf.ConfigProto()
    config_.gpu_options.allow_growth = True

    with tf.Session(config=config_) as sess:
        sess.run(init)
        # if a checkpoint exists, restore it
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpoints/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("successfully restored the checkpoint")

        rand_int = np.random.randint(1, 100)
        caption_in, caption_out, mask, image_features, urls = minibatch(
            data, rand_int, config.batch_size, config.total_instances)

        if not os.path.exists('test_caption'):
            os.makedirs('test_caption')
        captions_pred = _run_validation(
            sess, caption_in, image_features, config.batch_size, model,
            config.input_len)  # the output is size (32, 16)
        captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        captions_pred = np.concatenate(captions_pred, 1)

        captions_deco = decode_captions(captions_pred, data['idx_to_word'])

        for j in range(len(captions_deco)):
            img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
            img = image_from_url(urls[j])
            write_text_on_image(img, img_name, captions_deco[j])
        print("saved predicted images into ./test_caption folder")
        # 100 epoch
        #         total_runs = int((config.total_instances/config.batch_size)*config.num_epochs)
        #         initial_step = model.global_step.eval()

        ### initialize summary writer
        #         tf.summary.scalar("learing_rate", learning_rate)
        #         a = tf.summary.merge_all()
        #         writer = tf.summary.FileWriter('./graphs/singlelayer_lstm', sess.graph)

        #         time_now = datetime.now()
        #         for t in range(total_runs):

        #             caption_in, caption_out, mask, image_features, urls = minibatch(data,t,config.batch_size, config.total_instances)

        #             # feed data
        #             feed_dict = {model.image_feature: image_features, model.caption_in: caption_in,
        #                         model.caption_out: caption_out, model.caption_mask: mask}
        #             merge_op, _, total_loss, b = sess.run([model.summary_op, train_op, model.total_loss, a],
        #                                            feed_dict = feed_dict)

        #             writer.add_summary(merge_op, global_step=t)
        #             writer.add_summary(b, global_step=t)

        #             # print loss infor
        #             if(t+1) % 20 == 0:
        #                 print('(Iteration %d / %d) loss: %f, and time eclipsed: %.2f minutes' % (
        #                     t + 1, total_runs, float(total_loss), (datetime.now() - time_now).seconds/60.0))

        #             #print image
        #             if(t+1)%100 == 0:
        #                 if not os.path.exists('test_caption'):
        #                     os.makedirs('test_caption')
        #                 captions_pred = _run_validation(sess, caption_in, image_features, 1, model, config.input_len) # the output is size (32, 16)
        #                 captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        #                 captions_pred = np.concatenate(captions_pred, 1)

        #                 captions_deco = decode_captions(captions_pred, data['idx_to_word'])

        #                 for j in range(len(captions_deco)):
        #                     img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
        #                     img = image_from_url(urls[j])
        #                     write_text_on_image(img, img_name, captions_deco[j])

        #             #save model
        #             if(t+1)%50 == 0 or t == (total_runs-1):
        #                 if not os.path.exists('checkpoints/singlelayer_lstm'):
        #                     os.makedirs('checkpoints/singlelayer_lstm')
        #                 saver.save(sess, 'checkpoints/singlelayer_lstm', t)

        # visualize embed matrix
        #code to visualize the embeddings. uncomment the below to visualize embeddings
        final_embed_matrix = sess.run(model.embed_map)

        # it has to be a Variable; constants don't work here, and model.embed_matrix can't be reused
        embedding_var = tf.Variable(final_embed_matrix[:1000],
                                    name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')

        # add embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # link this tensor to its metadata file, in this case the first 1000 words of vocab
        #         metadata_path = './processed/matadata.tsv'
        #         if not os.path.exists(metadata_path):
        #             f = open(metadata_path, "w")
        #             f.close()
        embedding.metadata_path = os.path.join('processed', 'metadata.tsv')

        # saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)
def evaluate_model(model,
                   med_data,
                   idx_to_word,
                   batch_size=1000,
                   beam_size=None):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.

    """
    BLEUscores = {}
    if beam_size is None:  # no beam search
        for split in ['train', 'val']:
            minibatch = sample_coco_minibatch(med_data,
                                              split=split,
                                              batch_size=batch_size)
            gt_captions, features, urls = minibatch
            gt_captions = decode_captions(gt_captions, med_data['idx_to_word'])

            sample_captions = model.sample(features)

            sample_captions = decode_captions(sample_captions,
                                              med_data['idx_to_word'])

            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)

            BLEUscores[split] = total_score / len(sample_captions)

        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
    else:  # with beam search
        for split in ['train', 'val']:
            sample_captions = []  # empty list for the sampled captions
            gt_captions = []  # empty list for GT
            urls = []
            for batch in range(batch_size):
                minibatch = sample_coco_minibatch(
                    med_data, split=split,
                    batch_size=1)  # each time only one sample
                gt_caption, features, url = minibatch
                gt_caption = decode_captions(gt_caption,
                                             med_data['idx_to_word'])

                _, sample_caption = model.beam_decode(features,
                                                      beam_size=beam_size)

                sample_caption = decode_captions(sample_caption,
                                                 med_data['idx_to_word'])

                sample_captions.append(str(sample_caption))
                gt_captions.append(str(gt_caption))
                urls.append(url)

            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)

            BLEUscores[split] = total_score / len(
                sample_captions)  # divide by the number of sampled captions
        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))

    return BLEUscores['val']
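# Illustrative usage of the evaluator above (variable names and argument values
# are assumptions): greedy sampling over 1000 examples per split, then beam
# search with beam_size=3 over a smaller batch, since beam decoding runs one
# example at a time.
# val_bleu = evaluate_model(model, med_data, med_data['idx_to_word'], batch_size=1000)
# val_bleu_beam = evaluate_model(model, med_data, med_data['idx_to_word'],
#                                batch_size=100, beam_size=3)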
    def train(self):
        """
        Train model and print out some useful information(loss, generated captions) for debugging.  
        """
        n_examples = self.data['train_captions'].shape[0]
        n_iters_per_epoch = n_examples // self.batch_size

        # get data
        features = self.data['train_features']
        captions = self.data['train_captions']

        # build train model graph
        loss, generated_captions = self.model.build_model()
        optimizer = self.optimizer(self.learning_rate).minimize(loss)

        # build test model graph
        alphas, sampled_captions = self.model.build_sampler(
        )  # (N, max_len, L), (N, max_len)

        print "num epochs: %d" % self.n_epochs
        print "iterations per epoch: %d" % n_iters_per_epoch
        print "data size: %d" % n_examples
        print "batch size: %d" % self.batch_size

        sess = tf.InteractiveSession()
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(max_to_keep=10)

        for e in range(self.n_epochs):
            # print initial loss
            if e == 0:
                captions_batch, features_batch, _ = sample_coco_minibatch(
                    self.data, self.batch_size, split='train')
                feed_dict = {
                    self.model.features: features_batch,
                    self.model.captions: captions_batch
                }
                gen_caps, l = sess.run([generated_captions, loss], feed_dict)
                self.loss_history.append(l)
                print ""
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print "Initial Train Loss: %.5f" % l
                decoded = decode_captions(gen_caps, self.model.idx_to_word)
                for j in range(3):
                    print "Generated Caption: %s" % decoded[j]
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print ""

            # actual training step
            for i in range(n_iters_per_epoch):
                captions_batch, features_batch, _ = sample_coco_minibatch(
                    self.data, self.batch_size, split='train')
                feed_dict = {
                    self.model.features: features_batch,
                    self.model.captions: captions_batch
                }
                sess.run(optimizer, feed_dict)

            # save loss history
            l = sess.run(loss, feed_dict)
            self.loss_history.append(l)

            # print info
            if (e + 1) % self.print_every == 0:
                gen_caps = sess.run(generated_captions, feed_dict)
                print ""
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print "Train Loss at Epoch %d: %.5f" % (e + 1, l)
                decoded = decode_captions(gen_caps, self.model.idx_to_word)
                for j in range(3):
                    print "Generated Caption: %s" % decoded[j]
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print ""

            # save model
            if (e + 1) % self.save_every == 0:
                saver.save(sess,
                           os.path.join(self.model_path, 'model'),
                           global_step=e + 1)
                print "model-%s saved." % (e + 1)

        # actual test step: sample captions and visualize attention
        _, features_batch, image_files = sample_coco_minibatch(self.data,
                                                               self.batch_size,
                                                               split='train')
        feed_dict = {self.model.features: features_batch}
        alps, sam_cap = sess.run([alphas, sampled_captions],
                                 feed_dict)  # (N, max_len, L), (N, max_len)

        # decode captions
        decoded = decode_captions(sam_cap, self.model.idx_to_word)

        # visualize 10 images and captions
        for n in range(10):
            print "Sampled Caption: %s" % decoded[n]

            # plot original image
            img_path = os.path.join(self.image_path, image_files[n])
            img = ndimage.imread(img_path)
            plt.subplot(4, 5, 1)
            plt.imshow(img)
            plt.axis('off')

            # plot image with attention weights
            words = decoded[n].split(" ")
            for t in range(len(words)):
                if t > 18:
                    break
                plt.subplot(4, 5, t + 2)
                plt.text(0,
                         1,
                         words[t],
                         color='black',
                         backgroundcolor='white',
                         fontsize=12)
                plt.imshow(img)
                alp_curr = alps[n, t, :].reshape(14, 14)
                alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                           upscale=16,
                                                           sigma=20)
                plt.imshow(alp_img, alpha=0.8)
                plt.axis('off')
            plt.show()