Example #1
def main():
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # The dataset contains the features of images in the MSCOCO dataset
    # The data should be in the same folder as the code
    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
          cell_type='rnn',
          word_to_idx=small_data['word_to_idx'],
          input_dim=small_data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(small_rnn_model, small_data,
           update_rule='adam',
           num_epochs=50,
           batch_size=25,
           optim_config={
             'learning_rate': 5e-3,
           },
           lr_decay=0.95,
           verbose=True, print_every=10,
         )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    # Save the figure before show(); saving after the window is closed can produce a blank image
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some image URLs may no longer be available; you may need to rerun
        # this several times to successfully fetch the sample images.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
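The dictionary returned by coco_utils.load_coco_data is used above through the keys word_to_idx, idx_to_word, and train_features. A minimal sketch for inspecting its contents before training, assuming coco_utils and numpy are importable as in the snippet:

import numpy as np
import coco_utils

data = coco_utils.load_coco_data(max_train=50)
for key, value in sorted(data.items()):
    if isinstance(value, np.ndarray):
        # array entries such as train_features: print shape and dtype
        print(key, type(value), value.shape, value.dtype)
    else:
        # vocabulary mappings such as word_to_idx / idx_to_word: print length
        print(key, type(value), len(value))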
Example #2
    def show_image_by_image_idxs(self, coco_data, img_idxs):
        """Show the images corresponding to the given COCO image indices."""
        urls = coco_data.get_urls_by_image_index(img_idxs)
        for url in urls:
            plt.imshow(image_from_url(url))
            plt.axis('off')
            plt.show()
    def getAnnotatedImage(self, data, split):
        ''' Samples one image from the given split and returns it with the ground-truth and generated caption. '''
        minibatch = sample_coco_minibatch(data, batch_size=1, split=split)
        captions, features, urls = minibatch
        # sample some captions given image features
        gt_captions = decode_captions(captions, data['idx_to_word'])
        _, captions_out = self.beam_decode(features)
        #captions_out = self.sample(features)
        sample_captions = decode_captions(captions_out, data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            img = image_from_url(url)
            img = np.asarray(img)
            try:
                # Convert the image from HWC to CHW layout
                img = np.swapaxes(img, 0, 2).transpose(0, 2, 1)
            except ValueError:
                # Fall back to a random placeholder image (e.g. when the download failed)
                img = np.random.rand(3, 256, 256)
            caption = ('%s \n %s \n GT:%s' %
                       (split, sample_caption, gt_caption))

        return img, caption
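The swapaxes/transpose pair in getAnnotatedImage converts the fetched image from HWC to CHW layout. A quick standalone check of that axis manipulation (the shape here is arbitrary; this is only an illustration):

import numpy as np

img = np.random.rand(256, 320, 3)                  # H x W x C
chw = np.swapaxes(img, 0, 2).transpose(0, 2, 1)    # -> C x H x W
assert chw.shape == (3, 256, 320)
# equivalent to a single transpose that moves the channel axis to the front
assert np.array_equal(chw, img.transpose(2, 0, 1))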
Example #4
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

#LSTM test-time sampling
for split in ['train', 'val']:
  minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
  gt_captions, features, urls = minibatch
  gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

  sample_captions = small_lstm_model.sample(features)
  sample_captions = decode_captions(sample_captions, small_data['idx_to_word'])

  for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
    plt.imshow(image_from_url(url))
    plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
    plt.axis('off')
    plt.show()

#train a good model

sdata = load_coco_data(max_train=10000)

lstm_model = CaptioningRNN(
          cell_type='lstm',
          word_to_idx=sdata['word_to_idx'],
          input_dim=sdata['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
          dtype=np.float32,
)
Example #5
def main():
    # The dataset can be downloaded from https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # The dataset contains the features of images in the MSCOCO dataset
    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()
    # Plot the training losses
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
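Both loops above call sample to generate captions at test time. Below is a minimal standalone sketch of what greedy sampling for a vanilla RNN captioner typically looks like, assuming the image features are projected to the initial hidden state with a learned matrix and the next word is chosen by argmax at each step; all weight names and the start_idx argument are hypothetical, not actual CaptioningRNN attributes:

import numpy as np

def greedy_sample_sketch(features, W_proj, W_embed, Wx, Wh, b, W_vocab, b_vocab,
                         start_idx, max_length=16):
    # features: (N, D) image features; W_proj maps them to the initial hidden state
    N = features.shape[0]
    captions = np.zeros((N, max_length), dtype=np.int32)
    h = features.dot(W_proj)                       # initial hidden state, shape (N, H)
    word = np.full(N, start_idx, dtype=np.int32)   # every caption starts from <START>
    for t in range(max_length):
        x = W_embed[word]                          # embed the previously generated word
        h = np.tanh(x.dot(Wx) + h.dot(Wh) + b)     # one vanilla RNN step
        scores = h.dot(W_vocab) + b_vocab          # scores over the vocabulary
        word = scores.argmax(axis=1)               # greedy choice of the next word
        captions[:, t] = word
    # a full implementation would also stop early once the <END> token is produced
    return captions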
Example #6
def main(_):
    # load data
    data = load_coco_data(FLAGS.data_dir)

    # Optionally force padded_length to the caption length minus 1:
    # model_config.padded_length = len(data['train_captions'][0]) - 1

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():

        # Build the model. If FLAGS.glove_vocab is empty, the model is not
        # initialized with pretrained word vectors; otherwise it is initialized
        # with the GloVe vectors loaded below.
        if FLAGS.glove_vocab == '':
            model = build_model(model_config, mode=mode)
        else:
            glove_vocab = np.load(FLAGS.glove_vocab)
            model = build_model(model_config,
                                mode=mode,
                                glove_vocab=glove_vocab)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                     model_config.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        # initialize all variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)

            num_epochs = training_config.total_num_epochs

            num_train = data['train_captions'].shape[0]
            iterations_per_epoch = max(num_train / model_config.batch_size, 1)
            num_iterations = int(num_epochs * iterations_per_epoch)

            # Set up some variables for book-keeping
            epoch = 0
            best_val_acc = 0
            best_params = {}
            loss_history = []
            train_acc_history = []
            val_acc_history = []

            print("\n\nTotal training iter: ", num_iterations, "\n\n")
            time_now = datetime.now()
            for t in range(num_iterations):

                total_loss_value = _step(sess, data, train_op, model,
                                         model_config.lstm_dropout_keep_prob
                                         )  # run each training step

                loss_history.append(total_loss_value)

                # Print out training loss
                if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                    print(
                        '(Iteration %d / %d) loss: %f, and time elapsed: %.2f minutes'
                        % (t + 1, num_iterations, float(loss_history[-1]),
                           (datetime.now() - time_now).seconds / 60.0))

                # Print out some image sample results
                if FLAGS.sample_every > 0 and (t +
                                               1) % FLAGS.sample_every == 0:
                    temp_dir = os.path.join(FLAGS.sample_dir,
                                            'temp_dir_{}'.format(t + 1))
                    if not os.path.exists(temp_dir):
                        os.makedirs(temp_dir)
                    captions_pred, urls = _run_validation(
                        sess, data, model_config.batch_size, model,
                        1.0)  # the output is size (32, 16)
                    captions_pred = [
                        unpack.reshape(-1, 1) for unpack in captions_pred
                    ]
                    captions_pred = np.concatenate(captions_pred, 1)

                    captions_deco = decode_captions(captions_pred,
                                                    data['idx_to_word'])

                    for j in range(len(captions_deco)):
                        img_name = os.path.join(temp_dir,
                                                'image_{}.jpg'.format(j))
                        img = image_from_url(urls[j])
                        write_text_on_image(img, img_name, captions_deco[j])

                # save the model continuously to avoid interruption
                if FLAGS.saveModel_every > 0 and (
                        t + 1) % FLAGS.saveModel_every == 0:
                    if not os.path.exists(FLAGS.savedSession_dir):
                        os.makedirs(FLAGS.savedSession_dir)
                    checkpoint_name = (savedModelName[:-5] +
                                       '_checkpoint{}.ckpt'.format(t + 1))
                    save_path = model['saver'].save(
                        sess,
                        os.path.join(FLAGS.savedSession_dir, checkpoint_name))

            if not os.path.exists(FLAGS.savedSession_dir):
                os.makedirs(FLAGS.savedSession_dir)
            save_path = model['saver'].save(
                sess, os.path.join(FLAGS.savedSession_dir, savedModelName))
            print("done. Model saved at: ",
                  os.path.join(FLAGS.savedSession_dir, savedModelName))
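The learning-rate schedule set up in _learning_rate_decay_fn uses tf.train.exponential_decay with staircase=True, which multiplies the initial rate by the decay factor once every decay_steps steps. A small plain-Python sketch of that formula (the numbers below are only illustrative):

import math

def staircase_exponential_decay(initial_lr, global_step, decay_steps, decay_rate):
    # decayed_lr = initial_lr * decay_rate ** floor(global_step / decay_steps)
    return initial_lr * decay_rate ** math.floor(global_step / decay_steps)

for step in (0, 999, 1000, 2500):
    print(step, staircase_exponential_decay(0.01, step, decay_steps=1000, decay_rate=0.5))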
Example #7
with g.as_default():
    # Build the model.
    model = build_model(model_config,
                        mode,
                        inference_batch=BATCH_SIZE_INFERENCE)

    # run training
    init = tf.global_variables_initializer()
    with tf.Session() as sess:

        sess.run(init)

        model['saver'].restore(sess, directory + "savedSession/model0.ckpt")

        print("Model restured! Last step run: ",
              sess.run(model['global_step']))

        for i in range(TOTAL_INFERENCE_STEP):
            captions_pred, urls = _step_test(
                sess, data, BATCH_SIZE_INFERENCE, model,
                1.0)  # the output is size (32, 16)
            captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
            captions_pred = np.concatenate(captions_pred, 1)

            captions_deco = decode_captions(captions_pred, data['idx_to_word'])

            for j in range(len(captions_deco)):
                img_name = directory + 'image_' + str(j) + '.jpg'
                img = image_from_url(urls[j])
                write_text_on_image(img, img_name, captions_deco[j])
def train_model(model, config, data):

    #g = tf.Graph()
    #with g.as_default():
    # Define the optimizer
    num_batches = config.total_instances / config.batch_size
    decay_steps = int(num_batches * config.num_epochs_per_decay)
    learning_rate = tf.constant(config.initial_learning_rate)

    learning_rate_decay_fn = None

    def _decay_fn(learning_rate, global_step):
        return tf.train.exponential_decay(learning_rate,
                                          global_step,
                                          decay_steps=decay_steps,
                                          decay_rate=0.5,
                                          staircase=True)

    learning_rate_decay_fn = _decay_fn
    train_op = tf.contrib.layers.optimize_loss(
        loss=model.total_loss,
        global_step=model.global_step,
        learning_rate=learning_rate,
        optimizer='SGD',
        clip_gradients=config.clip_gradients,
        learning_rate_decay_fn=learning_rate_decay_fn)

    ##################
    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    # Allow GPU memory growth to avoid BLAS memory allocation failures
    config_ = tf.ConfigProto()
    config_.gpu_options.allow_growth = True

    with tf.Session(config=config_) as sess:
        sess.run(init)
        # if checkpoint exist, restore
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpoints/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Successfully restored the checkpoint")

        rand_int = np.random.randint(1, 100)
        caption_in, caption_out, mask, image_features, urls = minibatch(
            data, rand_int, config.batch_size, config.total_instances)

        if not os.path.exists('test_caption'):
            os.makedirs('test_caption')
        captions_pred = _run_validation(
            sess, caption_in, image_features, config.batch_size, model,
            config.input_len)  # the output is size (32, 16)
        captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        captions_pred = np.concatenate(captions_pred, 1)

        captions_deco = decode_captions(captions_pred, data['idx_to_word'])

        for j in range(len(captions_deco)):
            img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
            img = image_from_url(urls[j])
            write_text_on_image(img, img_name, captions_deco[j])
        print("saved predicted images into ./test_caption folder")
        # 100 epoch
        #         total_runs = int((config.total_instances/config.batch_size)*config.num_epochs)
        #         initial_step = model.global_step.eval()

        ### initialize summary writer
        #         tf.summary.scalar("learing_rate", learning_rate)
        #         a = tf.summary.merge_all()
        #         writer = tf.summary.FileWriter('./graphs/singlelayer_lstm', sess.graph)

        #         time_now = datetime.now()
        #         for t in range(total_runs):

        #             caption_in, caption_out, mask, image_features, urls = minibatch(data,t,config.batch_size, config.total_instances)

        #             # feed data
        #             feed_dict = {model.image_feature: image_features, model.caption_in: caption_in,
        #                         model.caption_out: caption_out, model.caption_mask: mask}
        #             merge_op, _, total_loss, b = sess.run([model.summary_op, train_op, model.total_loss, a],
        #                                            feed_dict = feed_dict)

        #             writer.add_summary(merge_op, global_step=t)
        #             writer.add_summary(b, global_step=t)

        #             # print loss infor
        #             if(t+1) % 20 == 0:
        #                 print('(Iteration %d / %d) loss: %f, and time eclipsed: %.2f minutes' % (
        #                     t + 1, total_runs, float(total_loss), (datetime.now() - time_now).seconds/60.0))

        #             #print image
        #             if(t+1)%100 == 0:
        #                 if not os.path.exists('test_caption'):
        #                     os.makedirs('test_caption')
        #                 captions_pred = _run_validation(sess, caption_in, image_features, 1, model, config.input_len) # the output is size (32, 16)
        #                 captions_pred = [unpack.reshape(-1, 1) for unpack in captions_pred]
        #                 captions_pred = np.concatenate(captions_pred, 1)

        #                 captions_deco = decode_captions(captions_pred, data['idx_to_word'])

        #                 for j in range(len(captions_deco)):
        #                     img_name = os.path.join('test_caption', 'image_{}.jpg'.format(j))
        #                     img = image_from_url(urls[j])
        #                     write_text_on_image(img, img_name, captions_deco[j])

        #             #save model
        #             if(t+1)%50 == 0 or t == (total_runs-1):
        #                 if not os.path.exists('checkpoints/singlelayer_lstm'):
        #                     os.makedirs('checkpoints/singlelayer_lstm')
        #                 saver.save(sess, 'checkpoints/singlelayer_lstm', t)

        # Visualize the embedding matrix with the TensorBoard projector
        final_embed_matrix = sess.run(model.embed_map)

        # It has to be a tf.Variable; constants don't work here, and model.embed_matrix cannot be reused directly.
        embedding_var = tf.Variable(final_embed_matrix[:1000],
                                    name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')

        # add embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        # Link this tensor to its metadata file, in this case the first 1000 words of the vocabulary
        #         metadata_path = './processed/metadata.tsv'
        #         if not os.path.exists(metadata_path):
        #             f = open(metadata_path, "w")
        #             f.close()
        embedding.metadata_path = os.path.join('processed', 'metadata.tsv')

        # saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)
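The projector config above points embedding.metadata_path at processed/metadata.tsv, which TensorBoard expects to contain one label per row of embedding_var. A sketch of writing that file for the first 1000 vocabulary entries, assuming data['idx_to_word'] maps indices to words as in the other examples:

import os

os.makedirs('processed', exist_ok=True)
with open(os.path.join('processed', 'metadata.tsv'), 'w') as f:
    # single-column metadata files need no header row; one word per line,
    # in the same order as the rows of embedding_var
    for word in data['idx_to_word'][:1000]:
        f.write(str(word) + '\n')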