def test(self,
             save_sampled_captions=True,
             evaluate_score=True,
             generate_demo_sample=False):
        self.atten_model.eval()
        self.atten_model.load_state_dict(torch.load(self.args.save))
        self.atten_model.cuda()

        if save_sampled_captions:
            features = self.test_data['features']
            n_examples = features.shape[0]
            all_sam_cap = np.ndarray((n_examples, 20))
            test_times = int(np.ceil(float(n_examples) / self.args.batch_size))
            for t in range(test_times):
                features_batch = Variable(
                    torch.from_numpy(
                        features[t * self.args.batch_size:(t + 1) *
                                 self.args.batch_size])).cuda()
                _, _, sampled_captions = self.atten_model.build_sample(
                    features_batch)
                all_sam_cap[t * self.args.batch_size:(t + 1) *
                            self.args.batch_size] = np.array(
                                sampled_captions.data)
            decoded = decode_captions(all_sam_cap, self.idx_to_word)
            save_pickle(decoded, self.args.test_samples)
            print 'test all successful'

        if evaluate_score:
            ref = load_pickle('./data/test/test.references.pkl')
            try:
                evaluate(ref, decoded)
            except NameError:
                # 'decoded' only exists if save_sampled_captions ran above
                decoded = load_pickle(self.args.test_samples)
                evaluate(ref, decoded)

        if generate_demo_sample:
            features = self.args.demo_feat
            features_batch = Variable(torch.from_numpy(features)).cuda()
            _, _, sampled_captions = self.atten_model.build_sample(
                features_batch)
            decoded = decode_captions(sampled_captions, self.idx_to_word)
            print decoded
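All of these snippets lean on a decode_captions helper from their parent repos. As a point of reference, a minimal sketch of what it plausibly does, assuming idx_to_word maps integer ids to tokens and the vocabulary uses <NULL>/<START>/<END> sentinels (the exact sentinel names are an assumption):

import numpy as np

def decode_captions(captions, idx_to_word):
    # Sketch only: turn an (N, T) array of word ids into N plain sentences,
    # stopping at <END> and dropping the padding/start sentinels.
    captions = np.atleast_2d(captions)
    decoded = []
    for row in captions:
        words = []
        for idx in row:
            word = idx_to_word[int(idx)]
            if word == '<END>':
                break
            if word not in ('<NULL>', '<START>'):
                words.append(word)
        decoded.append(' '.join(words))
    return decoded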
Example #2
    def evaluate_on_split(self,
                          sess,
                          generated_captions,
                          summary_writer,
                          epoch,
                          tags,
                          split='train'):
        caps = self.data.captions[split]
        ids = self.data.video_ids[split]
        unique_ids = list(set(ids))
        num_iter = int(ceil(len(unique_ids) / float(self.batch_size)))
        while len(unique_ids) < num_iter * self.batch_size:
            unique_ids += unique_ids
        unique_ids = unique_ids[:num_iter * self.batch_size]
        all_gen_cap = np.ndarray((len(unique_ids), self.max_words),
                                 dtype=np.int)
        for i in range(num_iter):
            features_batch = [
                self.data.feature(vid)
                for vid in unique_ids[i * self.batch_size:(i + 1) *
                                      self.batch_size]
            ]
            # if len(features_batch) < self.batch_size:
            #     l = len(features_batch)
            #     features_batch += [self.data.feature(vid) for vid in unique_ids[:self.batch_size - l]]
            features_batch = np.asarray(features_batch)
            feed_dict = {self.features: features_batch}
            gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
            all_gen_cap[i * self.batch_size:(i + 1) *
                        self.batch_size] = gen_cap
        all_decoded = decode_captions(all_gen_cap, self.data.vocab.idx2word)
        # create cand dict
        cand = {}
        for vid, sentence in zip(unique_ids, all_decoded):
            cand[vid] = [sentence]
        # create ref dict
        ref = {}
        for vid in unique_ids:
            ref[vid] = decode_captions(caps[ids == vid][:, 1:],
                                       self.data.vocab.idx2word)
        with open('result/cand_%s_%d.txt' % (split, epoch), 'w') as file:
            file.write(str(cand))
        with open('result/ref_%s_%d.txt' % (split, epoch), 'w') as file:
            file.write(str(ref))
        # evaluate
        scores = evaluate(ref=ref, cand=cand, get_scores=True)
        for tag in tags:
            summary = tf.Summary()
            summary.value.add(tag=split + tag, simple_value=scores[tag])
            summary_writer.add_summary(summary, epoch)
        return scores
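The while loop above is a padding trick: it duplicates the id list until it covers a whole number of batches, then trims, so every sess.run sees a full batch (duplicate ids simply get scored twice). The same idea as a standalone helper (pad_to_full_batches is a name of my choosing):

from math import ceil

def pad_to_full_batches(items, batch_size):
    # Duplicate the list until it spans num_iter full batches, then trim.
    num_iter = int(ceil(len(items) / float(batch_size)))
    padded = list(items)
    while len(padded) < num_iter * batch_size:
        padded += padded
    return padded[:num_iter * batch_size], num_iter

ids, n = pad_to_full_batches(['v1', 'v2', 'v3'], 2)  # 4 ids, 2 batches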
Example #3
    def run_eval(self, candidate):
        with open(candidate, "r") as f:
            cand = json.load(f)  # correct format

        # keys without image features have to be filtered out - this is the easiest way without checking image vectors
        for key in self.refdict4eval.keys():
            if key not in cand:
                del self.refdict4eval[key]

        assert len(self.refdict4eval.keys()) == len(cand.keys())

        with open(self.reference_dict_path, 'w') as f:
            json.dump(self.refdict4eval, f)

        return bleu.evaluate(self.reference_dict_path, candidate, True)
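One portability note on run_eval: it deletes from self.refdict4eval while iterating over .keys(), which Python 2 tolerates (keys() returns a list) but Python 3 rejects with a RuntimeError. A version of the same filter that works in both, assuming the same dict-of-references shape:

def keep_shared_keys(refs, cand):
    # Drop reference entries that have no candidate caption.
    for key in list(refs):  # snapshot the keys before mutating the dict
        if key not in cand:
            del refs[key]
    # holds when every candidate id also appears in the references
    assert len(refs) == len(cand)
    return refs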
Example #4
def validation(vocab, val_loader, encoder, decoder, beam_width):
    encoder.eval()
    decoder.eval()
    output_captions = dict()  # Map(ID -> List(sentences))
    ref_captions = dict()  # Map(ID -> List(sentences))

    # Iterate over validation data set
    with torch.no_grad():
        for i, (image, captions) in enumerate(val_loader):
            image = image.to(device)
            # captions = captions.to(device)??
            feature = encoder(image)

            output_caption = decoder.sample(feature, beam_width=beam_width)

            # exclude <pad> <start> <end>
            output_without_nonstring = []
            for idx in output_caption:
                if idx == 2 or idx == 19:
                    break
                elif idx <= 3:
                    continue
                else:
                    output_without_nonstring.append(vocab.vec2word(idx))

            output_captions[i] = [" ".join(output_without_nonstring)]
            ref_captions[i] = [
                ref_caption[0].lower() for ref_caption in captions
            ]
            if i % log_step == 0:
                print('Validation Step [{}/{}]'.format(i, len(val_loader)))
                print(output_captions[i])
                print(ref_captions[i])

    bleu_score = evaluate(ref_captions, output_captions)
    print(bleu_score)
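The id filtering above hard-codes the vocabulary layout: ids 2 and 19 are treated as end-of-sentence markers and everything at or below 3 as reserved special tokens (<pad>, <start>, ...). The same logic with the constants pulled out (a sketch; the defaults mirror the numbers used above, and vocab.vec2word is the mapping the snippet already uses):

def strip_special_tokens(ids, vocab, end_ids=(2, 19), num_special=4):
    # Stop at an end token, skip reserved ids, map the rest to words.
    words = []
    for idx in ids:
        if idx in end_ids:
            break
        if idx < num_special:
            continue
        words.append(vocab.vec2word(idx))
    return words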
Example #5
    def train(self):
        data_save_path = self.data_save_path
        n_examples = self.data['captions'].shape[0]
        n_iters_per_epoch = int(np.floor(float(n_examples) / self.batch_size))
        features = self.data['features']
        captions = self.data['captions'][:,:21]

        image_idxs = self.data['image_idxs']
        val_features = self.val_data['features']
        n_iters_val = int(np.ceil(float(val_features.shape[0]) / self.batch_size))

        with tf.variable_scope(tf.get_variable_scope()):
            loss = self.model.build_model()
            tf.get_variable_scope().reuse_variables()
            _, _, generated_captions = self.model.build_sampler(max_len=16)

        with tf.variable_scope(tf.get_variable_scope()):
            optimizer = self.optimizer(learning_rate=self.learning_rate)
            grads = tf.gradients(loss, tf.trainable_variables())
            grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)


        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            tf.summary.histogram(var.op.name + '/gradient', grad)

        summary_op = tf.summary.merge_all()

        print "The number of epoch: %d" % self.n_epochs
        print "Data size: %d" % n_examples
        print "Batch size: %d" % self.batch_size
        print "Iterations per epoch: %d" % n_iters_per_epoch

        config = tf.ConfigProto(allow_soft_placement=True)
        # config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            tf.global_variables_initializer().run()
            summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=40)

            if self.pretrained_model is not None:
                print "Start training with pretrained Model.."
                saver.restore(sess, self.pretrained_model)

            prev_loss = -1
            curr_loss = 0
            start_t = time.time()

            for e in range(self.n_epochs):
                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]

                for i in range(n_iters_per_epoch):

                    captions_batch = captions[i * self.batch_size:(i + 1) * self.batch_size]
                    image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) * self.batch_size]
                    features_batch = features[image_idxs_batch]

                    feed_dict = {self.model.features: features_batch,
                                 self.model.captions: captions_batch}
                    _, l = sess.run([train_op, loss], feed_dict)
                    curr_loss += l

                    if (i + 1) % self.print_every == 0:

                        ground_truths = captions[image_idxs == image_idxs_batch[0], 4:]
                        decoded = decode_captions(ground_truths, self.model.idx_to_word)
                        for j, gt in enumerate(decoded):
                            print "Ground truth %d: %s" % (j + 1, gt)
                        gen_caps = sess.run(generated_captions, feed_dict)
                        decoded = decode_captions(gen_caps, self.model.idx_to_word)
                        print "Generated caption: %s\n" % decoded[0]

                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                if self.print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 20))
                    pos = [1]   # sentiment flags passed to write_bleu below
                    neg = [-1]

                    val_features[:, :, 2048:2052] = [1, 0, 0, 1]

                    for i in range(n_iters_val):
                        features_batch = val_features[i * self.batch_size:(i + 1) * self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) * self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, os.path.join(data_save_path, "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path, split='val', get_scores=True)

                    write_bleu(scores=scores, path=self.model_path, epoch=e, senti=pos)

                    val_features[:, :, 2048:2052] = [0, 0, 1, 2]
                    for i in range(n_iters_val):
                        features_batch = val_features[i * self.batch_size:(i + 1) * self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) * self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, os.path.join(data_save_path, "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path, split='val', get_scores=True)

                    write_bleu(scores=scores, path=self.model_path, epoch=e, senti=neg)

                if (e + 1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=e + 1)
                    print "model-%s saved." % (e + 1)
Example #6
    def train(self, data, support_data, val_data, epochs, pretrain_epochs, batch_size = 100, print_bleu = True):
        n_example = data['captions'].shape[0]
        if n_example%batch_size == 0:
            n_batch = int(n_example/batch_size)
        else:
            n_batch = int(n_example/batch_size)+1

        features = data['features']
        captions = data['captions']
        image_idx = data['image_idxs']

        val_features = val_data['features']
        n_val_example = val_features.shape[0]
        if n_val_example%batch_size==0:
            n_val_batch = int(n_val_example/batch_size)
        else:
            n_val_batch = int(n_val_example/batch_size)+1

        pretrain_opt = self.optimizer.minimize(self.model.pretrain_loss)
        train_opt = self.optimizer.minimize(self.model.loss)
        loss = self.model.loss
        pretrain_loss = self.model.pretrain_loss
        alphas, sample_caption = self.model.build_sampler()
        init = tf.global_variables_initializer()

        print('='*80)
        print('The number of epoch: %d'%epochs)
        print('Iteration per epoch: %d'%n_batch)
        print('The batch size: %d'%batch_size)
        print('The number of training example: %d'%n_example)
        print('The number of validation example: %d'%n_val_example)
        print('='*80)

        # config = tf.ConfigProto(allow_soft_placement = True)
        # config.gpu_options.allow_growth = True
        with tf.Session() as sess:
            print('model is initialized.')
            sess.run(init)
            saver = tf.train.Saver(max_to_keep=40)

            print('Start to pretrain...')
            start_time = time.time()
            for pre_ep in xrange(pretrain_epochs):
                pre_ep_start = time.time()
                rand_idxs = np.random.permutation(n_example)
                #captions = captions[rand_idxs]
                #support_data = support_data[rand_idxs]
                #image_idx = image_idx[rand_idxs]
                pretrain_cost = 0.
                for itr in tqdm(xrange(n_batch),desc='Pretrain Epoch:%d'%(pre_ep+1)):
                    start = itr*batch_size
                    if (itr+1)*batch_size>n_example:
                        end = n_example
                    else:
                        end = (itr+1)*batch_size
                    rand_idxs_batch = rand_idxs[start:end]
                    caption_batch = captions[rand_idxs_batch]
                    support_data_batch = support_data[rand_idxs_batch]
                    image_idx_batch = image_idx[rand_idxs_batch]
                    features_batch = features[image_idx_batch]
                                        
                    feed_dict={
                        self.model.img_feature:features_batch,
                        self.model.support_context:support_data_batch,
                        self.model.captions:caption_batch
                    }
                    _,pre_loss_batch = sess.run([pretrain_opt,pretrain_loss],feed_dict=feed_dict)
                    pretrain_cost += pre_loss_batch/n_batch
                pre_ep_end = time.time()
                pre_ep_sec = pre_ep_end-pre_ep_start
                pre_ep_min = int(pre_ep_sec/60)
                pre_ep_sec = pre_ep_sec%60

                print('='*80)
                print('Pretrain Epoch: %d'%(pre_ep+1))
                print('Pretrain loss: %.4f'%pretrain_cost)
                print('Elapsed time %d:%02d' % (pre_ep_min, pre_ep_sec))
                print('='*80)


            print('Start to train...')
            for ep in xrange(epochs):
                train_cost=0.
                ep_start = time.time()
                rand_idxs = np.random.permutation(n_example)
                #captions = captions[rand_idxs]
                #support_data = support_data[rand_idxs]
                #image_idx = image_idx[rand_idxs]

                for itr in tqdm(xrange(n_batch),desc='Epoch:%d'%(ep+1)):
                    start = itr*batch_size
                    if (itr+1)*batch_size>n_example:
                        end = n_example
                    else:
                        end = (itr+1)*batch_size
                    rand_idxs_batch = rand_idxs[start:end]
                    caption_batch = captions[rand_idxs_batch]
                    image_idx_batch = image_idx[rand_idxs_batch]
                    features_batch = features[image_idx_batch]
                    support_data_batch = support_data[rand_idxs_batch]

                    feed_dict = {
                        self.model.img_feature:features_batch,
                        self.model.support_context:support_data_batch,
                        self.model.captions:caption_batch
                    }
                    _,loss_batch = sess.run([train_opt,loss],feed_dict=feed_dict)
                    train_cost += loss_batch/n_batch
                ep_end = time.time()
                ep_sec = ep_end-ep_start
                ep_min = int(ep_sec/60)
                ep_sec = ep_sec%60

                saver.save(sess,'./model_ckpt/model',global_step=ep+1)

                print('='*80)
                print('Epoch: %d'%(ep+1))
                print('Training loss: %.4f'%train_cost)
                print('Elapsed time %d:%02d' % (ep_min, ep_sec))
                print('model-%d is saved'%(ep+1))
                print('='*80)

                if print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 16))
                    for i in xrange(n_val_batch):
                        start = i*batch_size
                        if (i+1)*batch_size>n_val_example:
                            end = n_val_example
                        else:
                            end = (i+1)*batch_size
                        features_batch = val_features[start:end]
                        feed_dict = {self.model.img_feature: features_batch}
                        gen_cap = sess.run(sample_caption, feed_dict=feed_dict)  
                        all_gen_cap[start:end] = gen_cap
                    
                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data', split='val', get_scores=True)
                    write_bleu(scores=scores, path='./model_ckpt', epoch=ep)
            
            end_time = time.time()
            total_sec = end_time-start_time
            total_hr = int(total_sec/3600)
            total_min = int((total_sec%3600)/60)
            total_sec = total_sec%60
            print('\n')
            print('Total elapsed time %d:%02d:%02d' % (total_hr, total_min, total_sec))
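The if/else used twice above to compute n_batch and n_val_batch is just ceil division, which can be written in one expression:

def num_batches(n_examples, batch_size):
    # Ceil division: one extra batch when n_examples is not an exact multiple.
    return (n_examples + batch_size - 1) // batch_size

assert num_batches(100, 32) == 4
assert num_batches(96, 32) == 3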
Example #7
    def train(self):
        # train/val dataset
        n_examples = self.data['captions'].shape[0]
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
        features = self.data['features']
        captions = self.data['captions']
        image_idxs = self.data['image_idxs']
        val_features = self.val_data['features']
        n_iters_val = int(
            np.ceil(float(val_features.shape[0]) / self.batch_size))

        # build graphs for training model and sampling captions
        loss = self.model.build_model()
        #        tf.get_variable_scope().reuse_variables()
        #        _, _, generated_captions = self.model.build_sampler(max_len=20)

        # train op
        #with tf.name_scope('optimizer'):
        with tf.variable_scope(tf.get_variable_scope()) as scope:
            with tf.name_scope('optimizer'):
                optimizer = self.optimizer(learning_rate=self.learning_rate)
                grads = tf.gradients(loss, tf.trainable_variables())
                grads_and_vars = list(zip(grads, tf.trainable_variables()))
                train_op = optimizer.apply_gradients(
                    grads_and_vars=grads_and_vars)

        tf.get_variable_scope().reuse_variables()
        _, _, generated_captions = self.model.build_sampler(max_len=20)

        # summary op
        # tf.scalar_summary('batch_loss', loss)
        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            #tf.histogram_summary(var.op.name, var)
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            #tf.histogram_summary(var.op.name+'/gradient', grad)
            tf.summary.histogram(var.op.name + '/gradient', grad)

        #summary_op = tf.merge_all_summaries()
        summary_op = tf.summary.merge_all()

        print "The number of epoch: %d" % self.n_epochs
        print "Data size: %d" % n_examples
        print "Batch size: %d" % self.batch_size
        print "Iterations per epoch: %d" % n_iters_per_epoch

        config = tf.ConfigProto(allow_soft_placement=True)
        #config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            tf.global_variables_initializer().run()
            #summary_writer = tf.train.SummaryWriter(self.log_path, graph=tf.get_default_graph())
            summary_writer = tf.summary.FileWriter(
                self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=40)

            if self.pretrained_model is not None:
                print "Start training with pretrained Model.."
                saver.restore(sess, self.pretrained_model)

            prev_loss = -1
            curr_loss = 0
            start_t = time.time()

            for e in range(self.n_epochs):
                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]

                for i in range(n_iters_per_epoch):
                    captions_batch = captions[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) *
                                                  self.batch_size]
                    features_batch = features[image_idxs_batch]
                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.captions: captions_batch
                    }
                    _, l = sess.run([train_op, loss], feed_dict)
                    curr_loss += l

                    # write summary for tensorboard visualization
                    if i % 10 == 0:
                        summary = sess.run(summary_op, feed_dict)
                        summary_writer.add_summary(summary,
                                                   e * n_iters_per_epoch + i)

                    if (i + 1) % self.print_every == 0:
                        print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" % (
                            e + 1, i + 1, l)
                        ground_truths = captions[image_idxs ==
                                                 image_idxs_batch[0]]
                        decoded = decode_captions(ground_truths,
                                                  self.model.idx_to_word)
                        for j, gt in enumerate(decoded):
                            print "Ground truth %d: %s" % (j + 1, gt)
                        gen_caps = sess.run(generated_captions, feed_dict)
                        decoded = decode_captions(gen_caps,
                                                  self.model.idx_to_word)
                        print "Generated caption: %s\n" % decoded[0]

                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                # print out BLEU scores and file write
                if self.print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 20))
                    for i in range(n_iters_val):
                        features_batch = val_features[i *
                                                      self.batch_size:(i + 1) *
                                                      self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions,
                                           feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) *
                                    self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap,
                                                  self.model.idx_to_word)
                    save_pickle(all_decoded,
                                "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data',
                                      split='val',
                                      get_scores=True)
                    write_bleu(scores=scores, path=self.model_path, epoch=e)

                # save model's parameters
                if (e + 1) % self.save_every == 0:
                    saver.save(sess,
                               os.path.join(self.model_path, 'model'),
                               global_step=e + 1)
                    print "model-%s saved." % (e + 1)
Example #8
    def train(self):
        # train/val dataset
        # Changed this because I keep less features than captions, see prepro
        # n_examples = self.data['captions'].shape[0]
        n_examples = self.data['features'].shape[0]
        n_iters_per_epoch = int(np.ceil(float(n_examples)/self.batch_size))
        features = self.data['features']
        captions = self.data['captions']
        image_idxs = self.data['image_idxs']
        val_features = self.val_data['features']
        n_iters_val = int(np.ceil(float(val_features.shape[0])/self.batch_size))

        # build graphs for training model and sampling captions
        # This scope fixed things!!
        with tf.variable_scope(tf.get_variable_scope()):
            loss = self.model.build_model()
            tf.get_variable_scope().reuse_variables()
            _, _, generated_captions = self.model.build_sampler(max_len=20)

        # train op
        with tf.variable_scope(tf.get_variable_scope(), reuse=False):
            optimizer = self.optimizer(learning_rate=self.learning_rate)
            grads = tf.gradients(loss, tf.trainable_variables())
            grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)

        # summary op
        # tf.scalar_summary('batch_loss', loss)
        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            #tf.histogram_summary(var.op.name, var)
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            #tf.histogram_summary(var.op.name+'/gradient', grad)
            tf.summary.histogram(var.op.name+'/gradient', grad)

        #summary_op = tf.merge_all_summaries()
        summary_op = tf.summary.merge_all()

        print "The number of epoch: %d" %self.n_epochs
        print "Data size: %d" %n_examples
        print "Batch size: %d" %self.batch_size
        print "Iterations per epoch: %d" %n_iters_per_epoch

        config = tf.ConfigProto(allow_soft_placement = True)
        #config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            tf.global_variables_initializer().run()
            #summary_writer = tf.train.SummaryWriter(self.log_path, graph=tf.get_default_graph())
            summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=40)

            if self.pretrained_model is not None:
                print "Start training with pretrained Model.."
                saver.restore(sess, self.pretrained_model)

            prev_loss = -1
            curr_loss = 0
            start_t = time.time()

            for e in range(self.n_epochs):
                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]

                for i in range(n_iters_per_epoch):
                    captions_batch = captions[i*self.batch_size:(i+1)*self.batch_size]
                    image_idxs_batch = image_idxs[i*self.batch_size:(i+1)*self.batch_size]
                    features_batch = features[image_idxs_batch]
                    feed_dict = {self.model.features: features_batch, self.model.captions: captions_batch}
                    _, l = sess.run([train_op, loss], feed_dict)
                    curr_loss += l

                    # write summary for tensorboard visualization
                    if i % 10 == 0:
                        summary = sess.run(summary_op, feed_dict)
                        summary_writer.add_summary(summary, e*n_iters_per_epoch + i)

                    if (i+1) % self.print_every == 0:
                        print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" %(e+1, i+1, l)
                        ground_truths = captions[image_idxs == image_idxs_batch[0]]
                        decoded = decode_captions(ground_truths, self.model.idx_to_word)
                        for j, gt in enumerate(decoded):
                            print "Ground truth %d: %s" %(j+1, gt)
                        gen_caps = sess.run(generated_captions, feed_dict)
                        decoded = decode_captions(gen_caps, self.model.idx_to_word)
                        print "Generated caption: %s\n" %decoded[0]

                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                # print out BLEU scores and file write
                if self.print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 20))
                    for i in range(n_iters_val):
                        features_batch = val_features[i*self.batch_size:(i+1)*self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        all_gen_cap[i*self.batch_size:(i+1)*self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data', split='val', get_scores=True)
                    write_bleu(scores=scores, path=self.model_path, epoch=e)

                # save model's parameters
                if (e+1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=e+1)
                    print "model-%s saved." %(e+1)
Example #9
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                # print out BLEU scores and file write
                if self.print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 20))
                    for i in range(n_iters_val):
                        features_batch = val_features[i*self.batch_size:(i+1)*self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
                        all_gen_cap[i*self.batch_size:(i+1)*self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data', split='val', get_scores=True)
                    write_bleu(scores=scores, path=self.model_path, epoch=e)

                # save model's parameters
                if (e+1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=e+1)
                    print "model-%s saved." %(e+1)


    def test(self, data, split='train', attention_visualization=True, save_sampled_captions=True):
        '''
        Args:
            - data: dictionary with the following keys:
            - features: Feature vectors of shape (5000, 196, 512)
            - file_names: Image file names of shape (5000, )
            - captions: Captions of shape (24210, 17)
            - image_idxs: Indices for mapping caption to image of shape (24210, )
            - features_to_captions: Mapping feature to captions (5000, 4~5)
            - split: 'train', 'val' or 'test'
            - attention_visualization: If True, visualize attention weights with images for each sampled word. (IPython notebook)
            - save_sampled_captions: If True, save sampled captions to pkl file for computing BLEU scores.
        '''

        features = data['features']
        n_examples = features.shape[0]
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
        # build a graph to sample captions
        alphas, betas, sampled_captions = self.model.build_sampler(max_len=20)  # (N, max_len, L), (N, max_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, self.test_model)
            features_batch, image_files = sample_coco_minibatch_inference(data, self.batch_size)
            feed_dict = {self.model.features: features_batch}
            alps, bts, sam_cap = sess.run([alphas, betas, sampled_captions], feed_dict)  # (N, max_len, L), (N, max_len)
            decoded = decode_captions(sam_cap, self.model.idx_to_word)

            if self.print_bleu:
                all_gen_cap = np.ndarray((features.shape[0], 20))
                for i in range(n_iters_per_epoch):
                    features_batch = features[i * self.batch_size:(i + 1) * self.batch_size]
                    feed_dict = {self.model.features: features_batch}
                    gen_cap = sess.run(sampled_captions, feed_dict=feed_dict)
                    all_gen_cap[i * self.batch_size:(i + 1) * self.batch_size] = gen_cap

                all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                scores = evaluate(data_path='./data', split='val', get_scores=True)


            if attention_visualization:
                for n in range(10):
                    print "Sampled Caption: %s" % decoded[n]

                    # Plot original image
                    img = ndimage.imread(image_files[n])
                    plt.clf()
                    plt.subplot(4, 5, 1)
                    plt.imshow(img)
                    plt.axis('off')

                    # Plot images with attention weights
                    words = decoded[n].split(" ")
                    for t in range(len(words)):
                        if t > 18:
                            break
                        plt.subplot(4, 5, t + 2)
                        plt.text(0, 1, '%s(%.2f)' % (words[t], bts[n, t]), color='black', backgroundcolor='white',
                                 fontsize=8)
                        plt.imshow(img)
                        alp_curr = alps[n, t, :].reshape(14, 14)
                        alp_img = skimage.transform.pyramid_expand(alp_curr, upscale=16, sigma=20)
                        plt.imshow(alp_img, alpha=0.85)
                        plt.axis('off')
                    plt.savefig(str(n) + 'test.pdf')

            if save_sampled_captions:
                all_sam_cap = np.ndarray((features.shape[0], 20))
                num_iter = int(np.ceil(float(features.shape[0]) / self.batch_size))
                for i in range(num_iter):
                    features_batch = features[i * self.batch_size:(i + 1) * self.batch_size]
                    feed_dict = {self.model.features: features_batch}
                    all_sam_cap[i * self.batch_size:(i + 1) * self.batch_size] = sess.run(sampled_captions, feed_dict)
                all_decoded = decode_captions(all_sam_cap, self.model.idx_to_word)
                save_pickle(all_decoded, "./data/%s/%s.candidate.captions.pkl" % (split, split))
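The attention visualization above reshapes each 196-dim alpha vector to the 14x14 feature grid and upsamples it to image resolution: skimage's pyramid_expand with upscale=16 turns 14x14 into 224x224, and sigma=20 blurs it into a smooth heat map. The resize step on its own:

import numpy as np
import skimage.transform

alpha = np.random.rand(196)          # one attention map over L = 14 * 14 regions
alp_img = skimage.transform.pyramid_expand(
    alpha.reshape(14, 14), upscale=16, sigma=20)
print(alp_img.shape)                 # (224, 224)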
    def train(self):

        ##########################################################
        # features are loaded inside each epoch, in chunks, to   #
        # avoid holding the whole training set in memory at once #
        ##########################################################

        # train/val dataset
        n_examples = self.data['captions'].shape[0]
        n_iters_per_epoch = int(np.ceil(float(n_examples)/self.batch_size))
        # features = self.data['features']
        captions = self.data['captions']
        image_idxs = self.data['image_idxs']
        caption_idxs = {}
        for i in range(len(image_idxs)):
            if image_idxs[i] not in caption_idxs:
                caption_idxs[image_idxs[i]] = [i]
            else:
                caption_idxs[image_idxs[i]].append(i)

        # val_features = self.val_data['features']
        val_features = load_val_data()
        n_iters_val = int(np.ceil(float(val_features.shape[0])/self.batch_size))


        # build graphs for training model and sampling captions
        loss = self.model.build_model()
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            _, _, generated_captions = self.model.build_sampler(max_len=20)

        # train op
        with tf.name_scope('optimizer'):
            optimizer = self.optimizer(learning_rate=self.learning_rate)
            grads = tf.gradients(loss, tf.trainable_variables())
            grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)
           
        # summary op   
        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        # for grad, var in grads_and_vars:
        #     tf.summary.histogram(var.op.name+'/gradient', grad)
        
        summary_op = tf.summary.merge_all() 

        config = tf.ConfigProto(allow_soft_placement = True)
        #config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            tf.global_variables_initializer().run()
            summary_writer = tf.summary.FileWriter(self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=40)

            if self.pretrained_model is not None:
                print "Start training with pretrained Model.."
                saver.restore(sess, self.pretrained_model)

            prev_loss = -1
            curr_loss = 0
            start_t = time.time()

            print "The number of epoch: %d" %self.n_epochs
            print "Batch size: %d" %self.batch_size

            for e in range(self.n_epochs):
                # load the features in 9 chunks to keep memory usage bounded
                cur_iteration = 0
                for data_cnt in range(9):
                    print "----------------------------------------------------"
                    print "Loading data (part %d / 9) " %(int(data_cnt)+1)

                    features = hickle.load(os.path.join('./data/train', 'train.features%d.hkl' % data_cnt))
                    
                    total_num = features.shape[0]
                    print "Load success (data size: %d) " %total_num
                    print "Iterations: %d" %n_iters_per_epoch
                    print "----------------------------------------------------"

                    index_st = data_cnt * 10000
                    index_ed = index_st + total_num
                    part_features = features
                    part_captions = []
                    part_image_idxs = []

                    for idx in range(total_num):
                        for caption_idx in caption_idxs[index_st + idx]:
                            part_captions.append(captions[caption_idx])
                            part_image_idxs.append(idx)
                    part_captions = np.asarray(part_captions)
                    part_image_idxs = np.asarray(part_image_idxs)
                    part_iters = int(np.ceil(float(part_captions.shape[0])/self.batch_size))
                    

                    rand_idxs = np.random.permutation(part_captions.shape[0])
                    part_captions = part_captions[rand_idxs]
                    part_image_idxs = part_image_idxs[rand_idxs]

                    for i in range(part_iters):
                        captions_batch = part_captions[i*self.batch_size:(i+1)*self.batch_size]
                        part_image_idxs_batch = part_image_idxs[i*self.batch_size:(i+1)*self.batch_size]
                        features_batch = part_features[part_image_idxs_batch]
                        feed_dict = {self.model.features: features_batch, self.model.captions: captions_batch}
                        _, l = sess.run([train_op, loss], feed_dict)
                        curr_loss += l

                        # write summary for tensorboard visualization
                        if cur_iteration % 10 == 0:
                            summary = sess.run(summary_op, feed_dict)
                            summary_writer.add_summary(summary, e*n_iters_per_epoch + cur_iteration)

                        if (cur_iteration+1) % self.print_every == 0:
                            print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" %(e+1, cur_iteration+1, l)
                            ground_truths = part_captions[part_image_idxs == part_image_idxs_batch[0]]
                            decoded = decode_captions(ground_truths, self.model.idx_to_word)
                            for j, gt in enumerate(decoded):
                                print "Ground truth %d: %s" %(j+1, gt)                    
                            gen_caps = sess.run(generated_captions, feed_dict)
                            decoded = decode_captions(gen_caps, self.model.idx_to_word)
                            print "Generated caption: %s\n" %decoded[0]
                        cur_iteration = cur_iteration + 1

                    print "Current( epoch %d / part %d )  loss: %f" %(e+1, data_cnt+1, curr_loss)

                print "----------------------------------------------------"
                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                print "----------------------------------------------------"
                prev_loss = curr_loss
                curr_loss = 0
                
                # print out BLEU scores and file write
                if self.print_bleu:
                    all_gen_cap = np.ndarray((val_features.shape[0], 20))
                    for i in range(n_iters_val):
                        features_batch = val_features[i*self.batch_size:(i+1)*self.batch_size]
                        feed_dict = {self.model.features: features_batch}
                        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)  
                        all_gen_cap[i*self.batch_size:(i+1)*self.batch_size] = gen_cap
                    
                    all_decoded = decode_captions(all_gen_cap, self.model.idx_to_word)
                    save_pickle(all_decoded, "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data', split='val', get_scores=True)
                    write_bleu(scores=scores, path=self.model_path, epoch=e)

                # save model's parameters
                if (e+1) % self.save_every == 0:
                    saver.save(sess, os.path.join(self.model_path, 'model'), global_step=e+1)
                    print "model-%s saved." %(e+1)
Example #12
    def test(self, split='train', save_sampled_captions=True):
        '''
        Args:
            - data: dictionary with the following keys:
            - features: Feature vectors of shape (5000, 196, 512)
            - file_names: Image file names of shape (5000, )
            - captions: Captions of shape (24210, 17)
            - image_idxs: Indices for mapping caption to image of shape (24210, )
            - features_to_captions: Mapping feature to captions (5000, 4~5)
            - split: 'train', 'val' or 'test'
            - attention_visualization: If True, visualize attention weights with images for each sampled word. (IPython notebook)
            - save_sampled_captions: If True, save sampled captions to pkl file for computing BLEU scores.
        '''

        caps = self.data.captions[split]
        ids = self.data.video_ids[split]
        unique_ids = list(set(ids))
        n_examples = len(unique_ids)
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
        # build a graph to sample captions
        alphas, betas, sampled_captions = self.model.build_sampler(
            max_len=self.max_words)  # (N, max_len, L), (N, max_len)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        all_decoded = []
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, self.test_model)
            for i in range(n_iters_per_epoch):
                ids_batch = unique_ids[i * self.batch_size:(i + 1) *
                                       self.batch_size]
                features_batch = [self.data.feature(vid) for vid in ids_batch]
                features_batch = np.asarray(features_batch)
                feed_dict = {self.model.features: features_batch}
                alps, bts, sam_cap = sess.run(
                    [alphas, betas, sampled_captions],
                    feed_dict)  # (N, max_len, L), (N, max_len)
                decoded = decode_captions(sam_cap, self.data.vocab.idx2word)
                all_decoded.extend(decoded)

        # generate ref and cand
        ref = {}
        cand = {}
        for vid, dec in zip(unique_ids, all_decoded):
            gts = decode_captions(caps[ids == vid][:, 1:],
                                  self.data.vocab.idx2word)
            ref[vid] = gts
            cand[vid] = [dec]
        # print ground truths and generated sentences
        for vid in unique_ids:
            print '---' * 10
            for i, gt in enumerate(ref[vid]):
                print i + 1, ':', gt
            print 'generated :', cand[vid][0]
        scores = evaluate(ref, cand, get_scores=True)
        tags = [
            'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr',
            'ROUGE_L'
        ]
        for tag in tags:
            print tag, ':', scores[tag]
        print split, len(unique_ids), len(all_decoded)
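evaluate(ref, cand, get_scores=True) here follows the usual COCO-caption convention: both dicts map an id to a list of sentences, with the candidate list holding the single generated caption. For illustration (made-up sentences):

ref = {
    'video1': ['a man is cooking', 'someone prepares food'],
    'video2': ['a dog runs through a park'],
}
cand = {
    'video1': ['a man cooks food'],
    'video2': ['a dog is running'],
}
# scores = evaluate(ref, cand, get_scores=True)
# scores['Bleu_4'], scores['METEOR'], scores['CIDEr'], ...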
Example #13
    def test(self,
             data,
             split='train',
             attention_visualization=False,
             save_sampled_captions=False,
             senti=[0]):

        max_len_captions = 20

        features = data['features'].reshape(-1, 49, 2048)
        captions = data['captions']

        if senti == [1]:
            data_save_path = "../data/positive"
        else:
            data_save_path = "../data/negative"

        n_examples = features.shape[0]
        n_iters_per_epoch = int(np.floor(float(n_examples) / self.batch_size))

        alphas, betas, sampled_captions = self.model.build_sampler(
            max_len=max_len_captions)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            saver = tf.train.Saver()
            saver.restore(sess, self.test_model)

            if self.print_bleu:
                all_gen_cap = np.ndarray((features.shape[0], max_len_captions))

                for i in range(n_iters_per_epoch):
                    features_batch = features[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    captions_batch = captions[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.whole_samples:
                        captions_batch[:, 4:self.model.T],
                        self.model.nsample: 0,
                        self.model.mode_sampling: 1,
                        self.model.captions: captions_batch
                    }

                    gen_cap = sess.run(sampled_captions, feed_dict=feed_dict)
                    all_gen_cap[i * self.batch_size:(i + 1) *
                                self.batch_size] = gen_cap

                all_decoded = decode_captions(all_gen_cap,
                                              self.model.idx_to_word)

                save_pickle(
                    all_decoded,
                    os.path.join(data_save_path,
                                 'test/test.candidate.captions.pkl'))
                scores = evaluate(data_path=data_save_path,
                                  split=split,
                                  get_scores=True)

            if save_sampled_captions:
                all_sam_cap = np.ndarray((features.shape[0], max_len_captions))
                num_iter = int(
                    np.floor(float(features.shape[0]) / self.batch_size))
                for i in range(num_iter):
                    features_batch = features[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    feed_dict = {self.model.features: features_batch}
                    all_sam_cap[i * self.batch_size:(i + 1) *
                                self.batch_size] = sess.run(
                                    sampled_captions, feed_dict)

                all_decoded = decode_captions(all_sam_cap,
                                              self.model.idx_to_word)
                save_pickle(
                    all_decoded,
                    "./data/%s/%s.candidate.captions.pkl" % (split, split))
Example #14
    def train(self):

        data_save_path = self.data_path

        sentiment_i = np.where(self.data['captions'][:, 3] != 0)[0]
        captions = self.data['captions'][sentiment_i, :21]
        n_examples = captions.shape[0]
        n_iters_per_epoch = int(np.floor(float(n_examples) / self.batch_size))
        image_idxs = self.data['image_idxs'][sentiment_i]

        features = self.data['features'].reshape(-1, 49, 2048)

        val_features = self.val_data['features'].reshape(-1, 49, 2048)

        n_iters_val = int(
            np.ceil(float(val_features.shape[0]) / self.batch_size))

        with tf.variable_scope(tf.get_variable_scope()):
            loss = self.model.build_model()
            tf.get_variable_scope().reuse_variables()
            _, _, generated_captions = self.model.build_sampler(
                max_len=self.model.T - 4)

        with tf.variable_scope(tf.get_variable_scope()):
            optimizer = self.optimizer(learning_rate=self.learning_rate)
            params = [
                param for param in tf.trainable_variables()
                if not ('discriminator' in param.name)
            ]
            grads = tf.gradients(loss, params)
            grads_and_vars = list(zip(grads, params))  # params, not tf.trainable_variables()
            train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)

        tf.summary.scalar('batch_loss', loss)
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads_and_vars:
            print var.op.name  # log which variables get gradient summaries
            tf.summary.histogram(var.op.name + '/gradient', grad)

        summary_op = tf.summary.merge_all()

        print "The number of epoch: %d" % self.n_epochs
        print "Data size: %d" % n_examples
        print "Batch size: %d" % self.batch_size
        print "Iterations per epoch: %d" % n_iters_per_epoch

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True

        dis_embedding_dim = 256
        dis_filter_sizes = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, self.model.T - 4
        ]
        dis_num_filters = [
            100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160
        ]
        dis_l2_reg_lambda = 0.2

        discriminator = Discriminator(sequence_length=self.model.T - 4,
                                      num_classes=2,
                                      vocab_size=self.model.V,
                                      embedding_size=dis_embedding_dim,
                                      filter_sizes=dis_filter_sizes,
                                      num_filters=dis_num_filters,
                                      l2_reg_lambda=dis_l2_reg_lambda)

        rollout = ROLLOUT(self.model, 0.8)

        dis_data_loader = Dis_dataloader(self.dis_batch_size)

        rewards = np.zeros((self.batch_size, self.model.T - 4),
                           dtype=np.float32)

        dis_results_file = open(
            os.path.join(self.model_path, 'dis_results_file_4.txt'), 'w')

        with tf.Session(config=config) as sess:

            tf.global_variables_initializer().run()
            summary_writer = tf.summary.FileWriter(
                self.log_path, graph=tf.get_default_graph())
            saver = tf.train.Saver(max_to_keep=40)

            if self.pretrained_model is not None:
                print "Start training with pretrained Model.."
                saver.restore(sess, self.pretrained_model)

            prev_loss = -1
            curr_loss = 0
            start_t = time.time()

            print 'Start pre-training...'

            for e in range(0):  #self.n_epochs):

                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]

                for i in range(n_iters_per_epoch):

                    captions_batch = captions[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) *
                                                  self.batch_size]
                    features_batch = features[image_idxs_batch]

                    feed_dict = {
                        self.model.whole_samples:
                        captions_batch[:, 4:self.model.T],
                        self.model.rewards: rewards,
                        self.model.features: features_batch,
                        self.model.captions: captions_batch,
                        self.model.mode_learning: 1
                    }

                    _, l = sess.run([train_op, loss], feed_dict)

                    curr_loss += l

                    if (i + 1) % self.print_every == 0:

                        ground_truths = captions[image_idxs ==
                                                 image_idxs_batch[0], 4:]
                        decoded = decode_captions(ground_truths,
                                                  self.model.idx_to_word)
                        for j, gt in enumerate(decoded):
                            print "Ground truth %d: %s" % (j + 1, gt)
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }

                        gen_caps = sess.run(generated_captions, feed_dict)
                        decoded = decode_captions(gen_caps,
                                                  self.model.idx_to_word)
                        print "Generated caption: %s\n" % decoded[0]

                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                # First batch of captions, presumably just a placeholder feed
                # for the sampler during validation.
                captions_batch = captions[:self.batch_size]
                if self.print_bleu:

                    all_gen_cap = np.ndarray(
                        (val_features.shape[0], self.model.T - 4))
                    pos = [1]
                    neg = [-1]

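                    # The last four feature dimensions appear to carry the
                    # sentiment control code; this pattern presumably selects
                    # the positive setting for validation.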
                    val_features[:, :, 2048:2052] = [0, 1, 0, 1]

                    for i in range(n_iters_val):
                        features_batch = val_features[i *
                                                      self.batch_size:(i + 1) *
                                                      self.batch_size]
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }
                        gen_cap = sess.run(generated_captions,
                                           feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) *
                                    self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap,
                                                  self.model.idx_to_word)
                    save_pickle(
                        all_decoded,
                        os.path.join(data_save_path,
                                     "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path,
                                      split='val',
                                      get_scores=True)

                    print "scores_pos==================", scores
                    write_bleu(scores=scores,
                               path=self.model_path,
                               epoch=e,
                               senti=pos)

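                    # Switch to the (assumed) negative-sentiment control code
                    # and evaluate again.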
                    val_features[:, :, 2048:2052] = [0, 0, 1, 2]

                    for i in range(n_iters_val):
                        features_batch = val_features[i *
                                                      self.batch_size:(i + 1) *
                                                      self.batch_size]
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }
                        gen_cap = sess.run(generated_captions,
                                           feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) *
                                    self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap,
                                                  self.model.idx_to_word)
                    save_pickle(
                        all_decoded,
                        os.path.join(data_save_path,
                                     "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path,
                                      split='val',
                                      get_scores=True)
                    print "scores_neg==================", scores
                    write_bleu(scores=scores,
                               path=self.model_path,
                               epoch=e,
                               senti=neg)

                if (e + 1) % self.save_every == 0:
                    saver.save(sess,
                               os.path.join(self.model_path, 'model'),
                               global_step=e + 1)
                    print "model-%s saved." % (e + 1)

            print 'Start pre-training discriminator...'
            for e in range(0):  # discriminator pre-training disabled; change 0 back to self.n_epochs to enable

                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]
                dis_loss = 0
                for i in range(n_iters_per_epoch):

                    captions_batch = captions[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) *
                                                  self.batch_size]
                    features_batch = features[image_idxs_batch]

                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.whole_samples:
                        captions_batch[:, 4:self.model.T],
                        self.model.nsample: 0,
                        self.model.mode_sampling: 1,
                        self.model.captions: captions_batch
                    }

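                    # Three discriminator updates per batch: generated captions
                    # serve as negatives, the ground-truth slice as positives.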
                    for d_step in range(3):
                        negative_file = sess.run(generated_captions,
                                                 feed_dict=feed_dict)
                        positive_file = captions_batch[:, 4:self.model.T]
                        dis_data_loader.load_train_data(
                            positive_file, negative_file)
                        for it in xrange(dis_data_loader.num_batch):
                            x_batch, y_batch = dis_data_loader.next_batch()
                            feed = {
                                discriminator.input_x:
                                x_batch,
                                discriminator.input_y:
                                y_batch,
                                discriminator.dropout_keep_prob:
                                self.dis_dropout_keep_prob
                            }
                            dis_l = sess.run(discriminator.loss, feed)
                            dis_loss = dis_loss + dis_l
                            _ = sess.run(discriminator.train_op, feed)
                            _ = sess.run(discriminator.params_clip, feed)

                dis_results_file.write('The loss in epoch %i is %f\n' %
                                       (e + 1, dis_loss))
                dis_results_file.flush()

                saver.save(sess,
                           os.path.join(self.model_path, 'model_and_dis'),
                           global_step=e + 1)

            print '#########################################################################'
            print 'Start Adversarial Training...'
            for e in range(self.n_epochs):

                rand_idxs = np.random.permutation(n_examples)
                captions = captions[rand_idxs]
                image_idxs = image_idxs[rand_idxs]

                for i in range(n_iters_per_epoch):

                    captions_batch = captions[i * self.batch_size:(i + 1) *
                                              self.batch_size]
                    image_idxs_batch = image_idxs[i * self.batch_size:(i + 1) *
                                                  self.batch_size]
                    features_batch = features[image_idxs_batch]

                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.whole_samples:
                        captions_batch[:, 4:self.model.T],
                        self.model.nsample: 0,
                        self.model.mode_sampling: 1,
                        self.model.captions: captions_batch
                    }
                    samples_whole = sess.run(generated_captions,
                                             feed_dict=feed_dict)

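                    # Estimate per-token rewards by completing partial samples
                    # rollout_num times and scoring them with the discriminator.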
                    rewards = rollout.get_reward(sess, samples_whole,
                                                 generated_captions,
                                                 self.rollout_num,
                                                 discriminator, features_batch,
                                                 captions_batch)

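                    # Policy-gradient generator update (mode_learning == 2):
                    # sampled captions weighted by their rollout rewards.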
                    feed_dict = {
                        self.model.whole_samples: samples_whole,
                        self.model.rewards: rewards,
                        self.model.features: features_batch,
                        self.model.captions: captions_batch,
                        self.model.mode_learning: 2
                    }
                    _, l_reward = sess.run([train_op, loss],
                                           feed_dict=feed_dict)
                    curr_loss += l_reward

                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.whole_samples:
                        captions_batch[:, 4:self.model.T],
                        self.model.nsample: 0,
                        self.model.mode_sampling: 1,
                        self.model.captions: captions_batch
                    }

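                    # Refresh the discriminator on new negatives from the
                    # updated generator.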
                    for d_step in range(3):
                        negative_file = sess.run(generated_captions,
                                                 feed_dict=feed_dict)
                        positive_file = captions_batch[:, 4:self.model.T]
                        dis_data_loader.load_train_data(
                            positive_file, negative_file)
                        for it in xrange(dis_data_loader.num_batch):
                            x_batch, y_batch = dis_data_loader.next_batch()
                            feed = {
                                discriminator.input_x:
                                x_batch,
                                discriminator.input_y:
                                y_batch,
                                discriminator.dropout_keep_prob:
                                self.dis_dropout_keep_prob
                            }
                            _ = sess.run(discriminator.train_op, feed)
                            _ = sess.run(discriminator.params_clip, feed)

                    if (i + 1) % self.print_every == 0:

                        ground_truths = captions[image_idxs ==
                                                 image_idxs_batch[0], 4:]
                        decoded = decode_captions(ground_truths,
                                                  self.model.idx_to_word)
                        for j, gt in enumerate(decoded):
                            print "Ground truth %d: %s" % (j + 1, gt)
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }
                        gen_caps = sess.run(generated_captions, feed_dict)
                        decoded = decode_captions(gen_caps,
                                                  self.model.idx_to_word)
                        print "Generated caption: %s\n" % decoded[0]

                print "Previous epoch loss: ", prev_loss
                print "Current epoch loss: ", curr_loss
                print "Elapsed time: ", time.time() - start_t
                prev_loss = curr_loss
                curr_loss = 0

                # Again a placeholder captions feed for validation sampling.
                captions_batch = captions[:self.batch_size]
                if self.print_bleu:
                    all_gen_cap = np.ndarray(
                        (val_features.shape[0], self.model.T - 4))

                    pos = [1]
                    neg = [-1]

                    # Positive-sentiment control code again (same assumed
                    # encoding as in pre-training).
                    val_features[:, :, 2048:2052] = [0, 1, 0, 1]

                    for i in range(n_iters_val):
                        features_batch = val_features[i *
                                                      self.batch_size:(i + 1) *
                                                      self.batch_size]
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }
                        gen_cap = sess.run(generated_captions,
                                           feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) *
                                    self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap,
                                                  self.model.idx_to_word)
                    save_pickle(
                        all_decoded,
                        os.path.join(data_save_path,
                                     "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path,
                                      split='val',
                                      get_scores=True)

                    print "scores_pos==================", scores

                    write_bleu(scores=scores,
                               path=self.model_path,
                               epoch=e,
                               senti=pos)

                    # Negative-sentiment control code (assumed encoding).
                    val_features[:, :, 2048:2052] = [0, 0, 1, 2]

                    for i in range(n_iters_val):

                        features_batch = val_features[i *
                                                      self.batch_size:(i + 1) *
                                                      self.batch_size]
                        feed_dict = {
                            self.model.features:
                            features_batch,
                            self.model.whole_samples:
                            captions_batch[:, 4:self.model.T],
                            self.model.nsample:
                            0,
                            self.model.mode_sampling:
                            1,
                            self.model.captions:
                            captions_batch
                        }

                        gen_cap = sess.run(generated_captions,
                                           feed_dict=feed_dict)
                        all_gen_cap[i * self.batch_size:(i + 1) *
                                    self.batch_size] = gen_cap

                    all_decoded = decode_captions(all_gen_cap,
                                                  self.model.idx_to_word)
                    save_pickle(
                        all_decoded,
                        os.path.join(data_save_path,
                                     "val/val.candidate.captions.pkl"))
                    scores = evaluate(data_path=data_save_path,
                                      split='val',
                                      get_scores=True)

                    print "scores_neg==================", scores

                    write_bleu(scores=scores,
                               path=self.model_path,
                               epoch=e,
                               senti=neg)

                if (e + 1) % self.save_every == 0:
                    saver.save(sess,
                               os.path.join(self.model_path, 'model_adv'),
                               global_step=e + 1)
                    print "model-%s saved." % (e + 1)

    def train(self):
        # train/val dataset
        n_examples = self.data['captions'].shape[0]
        n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
        features = self.data['features']
        captions = self.data['captions']
        image_idxs = self.data['image_idxs']
        val_features = self.val_data['features']
        val_captions = self.val_data['captions']
        n_iters_val = int(
            np.ceil(float(val_features.shape[0]) / self.batch_size))

        # build graphs for training model and sampling captions
        loss = self.model.build_model()
        tf.get_variable_scope().reuse_variables()
        # Note: this rebinds `loss` to the sampler's per-token loss, which the
        # reward-weighted objective below consumes.
        alphas, betas, sampled_captions, loss = self.model.build_multinomial_sampler()

        _, _, greedy_caption = self.model.build_sampler(max_len=20)

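        # Self-critical setup: `rewards` holds scores of multinomial samples,
        # `base_line` the score of the greedy decode; their difference weights
        # the policy gradient.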
        rewards = tf.placeholder(tf.float32, [None])
        base_line = tf.placeholder(tf.float32, [None])

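        # grad_mask zeroes the loss at padded positions so only real words
        # contribute; 16 is presumably the sampler's maximum caption length.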
        grad_mask = tf.placeholder(tf.int32, [None, 16])
        t1 = tf.expand_dims(grad_mask, 1)
        t1_mul = tf.to_float(tf.transpose(t1, [0, 2, 1]))

        # train op
        with tf.name_scope('optimizer'):

            optimizer = self.optimizer(learning_rate=self.learning_rate)
            norm = tf.reduce_sum(t1_mul)
            mask_loss = loss * t1_mul
            # Reward-advantage-weighted, mask-normalized loss. (tf.mul was
            # removed in TF 1.0; tf.multiply is its replacement.)
            sum_loss = tf.reduce_sum(
                tf.transpose(
                    tf.multiply(tf.transpose(mask_loss, [2, 1, 0]),
                                (rewards - base_line)), [2, 1, 0])) / norm

            # sum_loss = tf.reduce_sum(
            #     tf.transpose(tf.mul(tf.transpose(mask_loss, [2, 1, 0]), rewards - base_line), [2, 1, 0]), 1)
            grads_rl = tf.gradients(sum_loss,
                                    tf.trainable_variables(),
                                    aggregation_method=tf.AggregationMethod.
                                    EXPERIMENTAL_ACCUMULATE_N)
            grads_and_vars = list(zip(grads_rl, tf.trainable_variables()))

            # grads = tf.gradients(loss, tf.trainable_variables())
            # grads_and_vars = list(zip(grads, tf.trainable_variables()))
            train_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)

        # summary op

        print "Number of epochs: %d" % self.n_epochs
        print "Data size: %d" % n_examples
        print "Batch size: %d" % self.batch_size
        print "Iterations per epoch: %d" % n_iters_per_epoch

        config = tf.ConfigProto(allow_soft_placement=True)
        # config.gpu_options.per_process_gpu_memory_fraction=0.9
        config.gpu_options.allow_growth = True
        config.gpu_options.allocator_type = 'BFC'
        with tf.Session(config=config) as sess:

            saver = tf.train.Saver()
            saver.restore(sess, self.test_model)

            start_t = time.time()

            for e in range(self.n_epochs):
                rand_idxs = np.random.permutation(n_examples)
                captions = np.array(captions[rand_idxs])
                image_idxs = np.array(image_idxs[rand_idxs])
                b_for_eval = []

                for i in range(n_iters_per_epoch):
                    captions_batch = np.array(
                        captions[i * self.batch_size:(i + 1) *
                                 self.batch_size])
                    image_idxs_batch = np.array(
                        image_idxs[i * self.batch_size:(i + 1) *
                                   self.batch_size])
                    features_batch = np.array(features[image_idxs_batch])
                    ground_truths = [
                        captions[image_idxs == image_idxs_batch[j]]
                        for j in range(len(image_idxs_batch))
                    ]
                    ref_decoded = [
                        decode_captions(ground_truths[j],
                                        self.model.idx_to_word)
                        for j in range(len(ground_truths))
                    ]

                    feed_dict = {
                        self.model.features: features_batch,
                        self.model.captions: captions_batch
                    }

                    samples, greedy_words = sess.run(
                        [sampled_captions, greedy_caption], feed_dict)
                    masks, all_decoded = decode_captions_for_blue(
                        samples, self.model.idx_to_word)
                    _, greedy_decoded = decode_captions_for_blue(
                        greedy_words, self.model.idx_to_word)

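                    # Score each sampled caption against its references for the
                    # reward r; the greedy caption's score b is the baseline.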
                    r = [
                        evaluate_captions([k], [v])
                        for k, v in zip(ref_decoded, all_decoded)
                    ]
                    b = [
                        evaluate_captions([k], [v])
                        for k, v in zip(ref_decoded, greedy_decoded)
                    ]

                    b_for_eval.extend(b)
                    feed_dict = {
                        grad_mask: masks,
                        rewards: r,
                        base_line: b,
                        self.model.features: features_batch,
                        self.model.captions: captions_batch
                    }
                    _ = sess.run([train_op], feed_dict)
                print "Mean greedy (baseline) reward this epoch: " + str(np.mean(np.array(b_for_eval)))
                # print out BLEU scores and file write
                print "Elapsed time: ", time.time() - start_t
                if self.print_bleu:
                    # b and r below reflect only the last training batch.
                    print "mean baseline reward b: " + str(np.mean(np.array(b)))
                    print "mean sampled reward r: " + str(np.mean(np.array(r)))
                    # NOTE: the width here should presumably match the greedy
                    # sampler's max_len (20) set in build_sampler above.
                    all_gen_cap = np.ndarray((val_features.shape[0], 128))
                    for k in range(n_iters_val):
                        features_batch = val_features[k *
                                                      self.batch_size:(k + 1) *
                                                      self.batch_size]
                        captions_words_batch = np.array(
                            val_captions[k * self.batch_size:(k + 1) *
                                         self.batch_size])

                        feed_dict = {
                            self.model.features: features_batch,
                            self.model.captions: captions_words_batch
                        }
                        gen_cap = sess.run(greedy_caption, feed_dict=feed_dict)
                        all_gen_cap[k * self.batch_size:(k + 1) *
                                    self.batch_size] = gen_cap
                    masks, all_decoded = decode_captions_for_blue(
                        all_gen_cap, self.model.idx_to_word)
                    for s in range(5):
                        print all_decoded[-s - 1]
                    save_pickle(all_decoded,
                                "./data/val/val.candidate.captions.pkl")
                    scores = evaluate(data_path='./data',
                                      split='val',
                                      get_scores=True)
                    write_bleu(scores=scores, path=self.model_path, epoch=e)

                # save model's parameters
                if (e + 1) % self.save_every == 0:
                    saver.save(sess,
                               os.path.join(self.model_path, 'model'),
                               global_step=e + 1)
                    print "model-%s saved." % (e + 1)