Example #1
def _step(sess, data, train_op, model, keep_prob):
    """
    Make a single gradient update for batch data. 
    """
    # Make a minibatch of training data
    minibatch = sample_coco_minibatch(data,
                                      batch_size=model_config.batch_size,
                                      split='train')
    captions, features, urls = minibatch

    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    mask = (captions_out != model_config._null)

    _, total_loss_value = sess.run(
        [train_op, model['total_loss']],
        feed_dict={
            model['image_feature']: features,
            model['input_seqs']: captions_in,
            model['target_seqs']: captions_out,
            model['input_mask']: mask,
            model['keep_prob']: keep_prob
        })

    return total_loss_value
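For context, a minimal driver loop for _step might look like the following sketch; sess, data, train_op, and the model tensor dictionary are assumed to have been built elsewhere, and the iteration count is illustrative:

# Hypothetical training loop around _step; all names besides _step are assumed.
num_iterations = 1000
loss_history = []
for it in range(num_iterations):
    loss = _step(sess, data, train_op, model, keep_prob=0.5)
    loss_history.append(loss)
    if it % 100 == 0:
        print('iteration %d: loss %.4f' % (it, loss))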
Example #2
def main():
    # The dataset (987M) can be downloaded from
    # https://drive.google.com/file/d/1Wgeq3NZ4R1letnZEKLo-DTSSgcTsgkmq/view?usp=sharing
    # It contains the features of images in the MSCOCO dataset and should be
    # placed in the same folder as the code.
    # Load COCO data from disk; this returns a dictionary
    small_data = coco_utils.load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
          cell_type='rnn',
          word_to_idx=small_data['word_to_idx'],
          input_dim=small_data['train_features'].shape[1],
          hidden_dim=512,
          wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(small_rnn_model, small_data,
           update_rule='adam',
           num_epochs=50,
           batch_size=25,
           optim_config={
             'learning_rate': 5e-3,
           },
           lr_decay=0.95,
           verbose=True, print_every=10,
         )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_rnn.png')
    plt.show()
    plt.close()

    for split in ['train', 'val']:
        # Some image URLs may no longer be valid; you may need to rerun this
        # code several times to fetch the sample images successfully.
        minibatch = coco_utils.sample_coco_minibatch(
            small_data, split=split, batch_size=2, seed=0)
        gt_captions, features, urls = minibatch
        gt_captions = coco_utils.decode_captions(gt_captions,
                                                 small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = coco_utils.decode_captions(sample_captions,
                                                     small_data['idx_to_word'])

        for i, (gt_caption, sample_caption, url) in enumerate(zip(gt_captions, sample_captions, urls)):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.savefig('%s_rnn_%d.png' % (split, i))
            plt.show()
            plt.close()
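The snippet defines main() but never invokes it; the conventional entry point would be:

if __name__ == '__main__':
    main()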
Example #3
    def train_val_step(self, data, batch_size, optimizer, train_mode=True):
        optimizer.zero_grad()
        if train_mode:
            minibatch = sample_coco_minibatch(data,
                                              batch_size=batch_size,
                                              split='train')
        else:
            minibatch = sample_coco_minibatch(data,
                                              batch_size=batch_size,
                                              split='val')
        captions, features, urls = minibatch
        captions = torch.LongTensor(captions).to(self.device)
        features = torch.from_numpy(features).to(self.device)
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]
        Y_hat = self.forward(features, captions_in)
        loss = self.loss(Y_hat, captions_out)
        if train_mode:
            loss.backward()
            optimizer.step()
        return loss
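A minimal sketch of a training driver for train_val_step, assuming model is an instance of the class above and data is a loaded COCO data dictionary (the optimizer choice and loop sizes are illustrative assumptions):

import torch

# Hypothetical driver; model, data, and the loop sizes are assumed.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
for epoch in range(50):
    for it in range(100):
        train_loss = model.train_val_step(data, batch_size=25,
                                          optimizer=optimizer, train_mode=True)
    with torch.no_grad():  # skip graph construction for the validation pass
        val_loss = model.train_val_step(data, batch_size=25,
                                        optimizer=optimizer, train_mode=False)
    print('epoch %d: train %.4f, val %.4f'
          % (epoch, train_loss.item(), val_loss.item()))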
Example #4
    def _step(self):
        # Make a minibatch of training data
        minibatch = sample_coco_minibatch(self.data,
                                          batch_size=self.batch_size,
                                          split='train')
        captions, features, urls = minibatch

        # Compute loss and gradient
        loss, grads = self.model.loss(features, captions)
        self.loss_history.append(loss)

        # Perform a parameter update
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config
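Here self.update_rule follows the signature update_rule(w, dw, config) -> (next_w, next_config); a minimal SGD rule matching that contract (a sketch, not necessarily this repo's own implementation) is:

def sgd(w, dw, config=None):
    """Vanilla SGD matching the update_rule(w, dw, config) contract."""
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    w -= config['learning_rate'] * dw
    return w, config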
Example #5
def _step_test(sess, data, batch_size, model, keep_prob):
    """
    Make a single gradient update for batch data. 
    """
    # Make a minibatch of training data
    minibatch = sample_coco_minibatch(data, batch_size=batch_size, split='val')
    captions, features, urls = minibatch

    # use the first token of each ground-truth caption as the initial input
    captions_in = captions[:, 0].reshape(-1, 1)

    state = None
    final_preds = []
    current_pred = captions_in
    mask = np.zeros((batch_size, model_config.padded_length))
    mask[:, 0] = 1

    # get initial state using image feature
    feed_dict = {
        model['image_feature']: features,
        model['keep_prob']: keep_prob
    }
    state = sess.run(model['initial_state'], feed_dict=feed_dict)

    # start to generate sentences
    for t in range(model_config.padded_length):
        feed_dict = {
            model['input_seqs']: current_pred,
            model['initial_state']: state,
            model['input_mask']: mask,
            model['keep_prob']: keep_prob
        }

        current_pred, state = sess.run([model['preds'], model['final_state']],
                                       feed_dict=feed_dict)

        current_pred = current_pred.reshape(-1, 1)

        final_preds.append(current_pred)

    return final_preds, urls
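The returned per-timestep predictions can then be stacked and decoded into sentences; a hedged sketch assuming the usual decode_captions helper and data dictionary:

# Hypothetical post-processing of _step_test's output: stack the per-timestep
# (batch_size, 1) predictions into a (batch_size, padded_length) array.
final_preds, urls = _step_test(sess, data, batch_size, model, keep_prob=1.0)
preds = np.hstack(final_preds)
sentences = decode_captions(preds, data['idx_to_word'])
for url, sentence in zip(urls, sentences):
    print(url, '->', sentence)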
Example #6
    def _step(self):
        """
        Make a single gradient update. This is called by train() and should not
        be called manually.
        """
        # Make a minibatch of training data
        minibatch = coco_utils.sample_coco_minibatch(
            self.data, batch_size=self.batch_size, split='train')
        captions, features, urls = minibatch

        # Compute loss and gradient
        loss, grads = self.model.loss(features, captions)
        self.loss_history.append(loss)

        # Perform a parameter update
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config
Example #7
    def getAnnotatedImage(self, data, split):
        '''Samples an image and returns it with ground-truth and generated captions.'''
        minibatch = sample_coco_minibatch(data, batch_size=1, split=split)
        captions, features, urls = minibatch
        # sample some captions given image features
        gt_captions = decode_captions(captions, data['idx_to_word'])
        _, captions_out = self.beam_decode(features)
        #captions_out = self.sample(features)
        sample_captions = decode_captions(captions_out, data['idx_to_word'])
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            img = image_from_url(url)
            img = np.asarray(img)
            try:
                # convert from (H, W, C) to (C, H, W)
                img = np.swapaxes(img, 0, 2).transpose(0, 2, 1)
            except ValueError:
                # fall back to random noise if the image could not be fetched
                img = np.random.rand(3, 256, 256)
            caption = ('%s \n %s \n GT:%s' %
                       (split, sample_caption, gt_caption))

        return img, caption
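A hypothetical usage of getAnnotatedImage (model and data are assumed); note the image comes back in (C, H, W) order, so it is transposed back for imshow:

import matplotlib.pyplot as plt

img, caption = model.getAnnotatedImage(data, split='val')
plt.imshow(img.transpose(1, 2, 0))  # back to (H, W, C) for display
plt.title(caption)
plt.axis('off')
plt.show()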
Example #8
def evaluate_model(model, data):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.
    """
    BLEUscores = {}
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(data, split=split, batch_size=1000)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        total_score = 0.0
        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            total_score += BLEU_score(gt_caption, sample_caption)

        BLEUscores[split] = total_score / len(sample_captions)

    for split in BLEUscores:
        print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
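This snippet assumes a BLEU_score helper; a minimal unigram version with clipped precision and a brevity penalty (one plausible implementation, not necessarily the original helper) is:

from collections import Counter
import numpy as np

def BLEU_score(gt_caption, sample_caption):
    """Unigram BLEU: clipped precision times a brevity penalty."""
    ref = gt_caption.split()
    hyp = sample_caption.split()
    if not hyp:
        return 0.0
    ref_counts = Counter(ref)
    clipped = sum(min(c, ref_counts[w]) for w, c in Counter(hyp).items())
    precision = clipped / len(hyp)
    brevity = 1.0 if len(hyp) >= len(ref) else np.exp(1.0 - len(ref) / len(hyp))
    return brevity * precision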
Example #9
           lr_decay=0.995,
           verbose=True, print_every=10,
         )

small_lstm_solver.train()

# Plot the training losses
plt.plot(small_lstm_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

# LSTM test-time sampling
for split in ['train', 'val']:
  minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
  gt_captions, features, urls = minibatch
  gt_captions = decode_captions(gt_captions, data['idx_to_word'])

  sample_captions = small_lstm_model.sample(features)
  sample_captions = decode_captions(sample_captions, data['idx_to_word'])

  for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
    plt.imshow(image_from_url(url))
    plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
    plt.axis('off')
    plt.show()

# Train a good model

sdata = load_coco_data(max_train=10000)
Example #10
def rel_error(x, y):  # header reconstructed: the standard cs231n rel_error helper
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))


data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

# Sample a minibatch and show the images and captions
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
    plt.imshow(image_from_url(url))
    plt.axis('off')
    caption_str = decode_captions(caption, data['idx_to_word'])
    plt.title(caption_str)
    plt.show()
"""
This file defines layer types that are commonly used for recurrent neural
networks.
"""


def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
    activation function.
    """
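    # The snippet is truncated here. A hedged completion, assuming the standard
    # tanh recurrence next_h = tanh(x.Wx + prev_h.Wh + b) with a cache for backprop
    # (not necessarily this repo's exact code):
    next_h = np.tanh(x.dot(Wx) + prev_h.dot(Wh) + b)
    cache = (x, prev_h, Wx, Wh, next_h)
    return next_h, cache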
Example #11
def main():
    # The dataset can be downloaded from https://drive.google.com/drive/folders/1zCq7kS9OXc2mgaOzDimAwiBblECWeBtO?usp=sharing
    # It contains the features of images in the MSCOCO dataset.
    # Load COCO data from disk; this returns a dictionary
    small_data = load_coco_data(max_train=50)

    # Experiment with vanilla RNN
    small_rnn_model = CaptioningRNN(
        cell_type='rnn',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
    )

    small_rnn_solver = CaptioningSolver(
        small_rnn_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.95,
        verbose=True,
        print_every=10,
    )

    small_rnn_solver.train()

    # Plot the training losses
    plt.plot(small_rnn_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_rnn_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()

    ##################################################################################################

    # Experiment with LSTM
    small_lstm_model = CaptioningRNN(
        cell_type='lstm',
        word_to_idx=small_data['word_to_idx'],
        input_dim=small_data['train_features'].shape[1],
        hidden_dim=512,
        wordvec_dim=256,
        dtype=np.float32,
    )
    small_lstm_solver = CaptioningSolver(
        small_lstm_model,
        small_data,
        update_rule='adam',
        num_epochs=50,
        batch_size=25,
        optim_config={
            'learning_rate': 5e-3,
        },
        lr_decay=0.995,
        verbose=True,
        print_every=10,
    )

    small_lstm_solver.train()
    # Plot the training losses
    plt.plot(small_lstm_solver.loss_history)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.show()

    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(small_data,
                                          split=split,
                                          batch_size=2)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, small_data['idx_to_word'])

        sample_captions = small_lstm_model.sample(features)
        sample_captions = decode_captions(sample_captions,
                                          small_data['idx_to_word'])

        for gt_caption, sample_caption, url in zip(gt_captions,
                                                   sample_captions, urls):
            plt.imshow(image_from_url(url))
            plt.title('%s\n%s\nGT:%s' % (split, sample_caption, gt_caption))
            plt.axis('off')
            plt.show()
Example #12
plt.plot(small_rnn_solver.loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Training loss history')
plt.show()

for k, v in data.items():
  if type(v) == np.ndarray:
    print(k, type(v), v.shape, v.dtype)
  else:
    print(k, type(v), len(v))
    
# Look at the data
batch_size = 3

captions, features, urls = sample_coco_minibatch(data, batch_size=batch_size)
for i, (caption, url) in enumerate(zip(captions, urls)):
  plt.imshow(image_from_url(url))
  plt.axis('off')
  caption_str = decode_captions(caption, data['idx_to_word'])
  plt.title(caption_str)
  plt.show()

# Test-time sampling

for split in ['train', 'val']:
  minibatch = sample_coco_minibatch(small_data, split=split, batch_size=2)
  gt_captions, features, urls = minibatch
  gt_captions = decode_captions(gt_captions, data['idx_to_word'])

  sample_captions = small_rnn_model.sample(features)
Example #13
def evaluate_model(model,
                   med_data,
                   idx_to_word,
                   batch_size=1000,
                   beam_size=None):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.

    """
    BLEUscores = {}
    if beam_size is None:  # no beam search
        for split in ['train', 'val']:
            minibatch = sample_coco_minibatch(med_data,
                                              split=split,
                                              batch_size=batch_size)
            gt_captions, features, urls = minibatch
            gt_captions = decode_captions(gt_captions, med_data['idx_to_word'])

            sample_captions = model.sample(features)

            sample_captions = decode_captions(sample_captions,
                                              med_data['idx_to_word'])

            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)

            BLEUscores[split] = total_score / len(sample_captions)

        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))
    else:  # with beam search
        for split in ['train', 'val']:
            sample_captions = []  # accumulate sampled captions
            gt_captions = []  # accumulate ground-truth captions
            urls = []
            for batch in range(batch_size):
                minibatch = sample_coco_minibatch(
                    med_data, split=split,
                    batch_size=1)  # each time only one sample
                gt_caption, features, url = minibatch
                gt_caption = decode_captions(gt_caption,
                                             med_data['idx_to_word'])

                _, sample_caption = model.beam_decode(features,
                                                      beam_size=beam_size)

                sample_caption = decode_captions(sample_caption,
                                                 med_data['idx_to_word'])

                sample_captions.append(str(sample_caption))
                gt_captions.append(str(gt_caption))
                urls.append(url)

            total_score = 0.0
            for gt_caption, sample_caption, url in zip(gt_captions,
                                                       sample_captions, urls):
                total_score += BLEU_score(gt_caption, sample_caption)

            BLEUscores[split] = total_score / len(
                sample_captions)  # average over the number of captions
        for split in BLEUscores:
            print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))

    return BLEUscores['val']
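A hedged usage example, assuming model and med_data exist as above:

# Compare greedy sampling against beam search on the same data (illustrative).
val_bleu_greedy = evaluate_model(model, med_data, med_data['idx_to_word'])
val_bleu_beam = evaluate_model(model, med_data, med_data['idx_to_word'],
                               batch_size=100, beam_size=3)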
Example #14
    def train(self):
        """
        Train model and print out some useful information(loss, generated captions) for debugging.  
        """
        n_examples = self.data['train_captions'].shape[0]
        n_iters_per_epoch = n_examples // self.batch_size

        # get data
        features = self.data['train_features']
        captions = self.data['train_captions']

        # build train model graph
        loss, generated_captions = self.model.build_model()
        optimizer = self.optimizer(self.learning_rate).minimize(loss)

        # build test model graph
        alphas, sampled_captions = self.model.build_sampler()  # (N, max_len, L), (N, max_len)

        print "num epochs: %d" % self.n_epochs
        print "iterations per epoch: %d" % n_iters_per_epoch
        print "data size: %d" % n_examples
        print "batch size: %d" % self.batch_size

        sess = tf.InteractiveSession()
        tf.global_variables_initializer().run()  # initialize_all_variables() is deprecated
        saver = tf.train.Saver(max_to_keep=10)

        for e in range(self.n_epochs):
            # print initial loss
            if e == 0:
                captions_batch, features_batch, _ = sample_coco_minibatch(
                    self.data, self.batch_size, split='train')
                feed_dict = {
                    self.model.features: features_batch,
                    self.model.captions: captions_batch
                }
                gen_caps, l = sess.run([generated_captions, loss], feed_dict)
                self.loss_history.append(l)
                print ""
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print "Initial Train Loss: %.5f" % l
                decoded = decode_captions(gen_caps, self.model.idx_to_word)
                for j in range(3):
                    print "Generated Caption: %s" % decoded[j]
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print ""

            # actual training step
            for i in range(n_iters_per_epoch):
                captions_batch, features_batch, _ = sample_coco_minibatch(
                    self.data, self.batch_size, split='train')
                feed_dict = {
                    self.model.features: features_batch,
                    self.model.captions: captions_batch
                }
                sess.run(optimizer, feed_dict)

            # save loss history
            l = sess.run(loss, feed_dict)
            self.loss_history.append(l)

            # print info
            if (e + 1) % self.print_every == 0:
                gen_caps = sess.run(generated_captions, feed_dict)
                print ""
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print "Train Loss at Epoch %d: %.5f" % (e + 1, l)
                decoded = decode_captions(gen_caps, self.model.idx_to_word)
                for j in range(3):
                    print "Generated Caption: %s" % decoded[j]
                print "*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*"
                print ""

            # save model
            if (e + 1) % self.save_every == 0:
                saver.save(sess,
                           os.path.join(self.model_path, 'model'),
                           global_step=e + 1)
                print "model-%s saved." % (e + 1)

        # actual test step: sample captions and visualize attention
        _, features_batch, image_files = sample_coco_minibatch(self.data,
                                                               self.batch_size,
                                                               split='train')
        feed_dict = {self.model.features: features_batch}
        alps, sam_cap = sess.run([alphas, sampled_captions],
                                 feed_dict)  # (N, max_len, L), (N, max_len)

        # decode captions
        decoded = decode_captions(sam_cap, self.model.idx_to_word)

        # visualize 10 images and captions
        for n in range(10):
            print "Sampled Caption: %s" % decoded[n]

            # plot original image
            img_path = os.path.join(self.image_path, image_files[n])
            img = plt.imread(img_path)  # scipy.ndimage.imread was removed in SciPy 1.2
            plt.subplot(4, 5, 1)
            plt.imshow(img)
            plt.axis('off')

            # plot image with attention weights
            words = decoded[n].split(" ")
            for t in range(len(words)):
                if t > 18:
                    break
                plt.subplot(4, 5, t + 2)
                plt.text(0,
                         1,
                         words[t],
                         color='black',
                         backgroundcolor='white',
                         fontsize=12)
                plt.imshow(img)
                alp_curr = alps[n, t, :].reshape(14, 14)
                alp_img = skimage.transform.pyramid_expand(alp_curr,
                                                           upscale=16,
                                                           sigma=20)
                plt.imshow(alp_img, alpha=0.8)
                plt.axis('off')
            plt.show()