def evaluate_on_split(self, sess, generated_captions, summary_writer, epoch, tags, split='train'):
    caps = self.data.captions[split]
    ids = self.data.video_ids[split]
    unique_ids = list(set(ids))
    num_iter = int(ceil(len(unique_ids) / float(self.batch_size)))
    while len(unique_ids) < num_iter * self.batch_size:
        unique_ids += unique_ids
    unique_ids = unique_ids[:num_iter * self.batch_size]
    all_gen_cap = np.ndarray((len(unique_ids), self.max_words), dtype=np.int)
    for i in range(num_iter):
        features_batch = [
            self.data.feature(vid)
            for vid in unique_ids[i * self.batch_size:(i + 1) * self.batch_size]
        ]
        # if len(features_batch) < self.batch_size:
        #     l = len(features_batch)
        #     features_batch += [self.data.feature(vid) for vid in unique_ids[:self.batch_size - l]]
        features_batch = np.asarray(features_batch)
        feed_dict = {self.features: features_batch}
        gen_cap = sess.run(generated_captions, feed_dict=feed_dict)
        all_gen_cap[i * self.batch_size:(i + 1) * self.batch_size] = gen_cap
    all_decoded = decode_captions(all_gen_cap, self.data.vocab.idx2word)

    # create cand dict
    cand = {}
    for vid, sentence in zip(unique_ids, all_decoded):
        cand[vid] = [sentence]

    # create ref dict
    ref = {}
    for vid in unique_ids:
        ref[vid] = decode_captions(caps[ids == vid][:, 1:], self.data.vocab.idx2word)

    with open('result/cand_%s_%d.txt' % (split, epoch), 'w') as file:
        file.write(str(cand))
    with open('result/ref_%s_%d.txt' % (split, epoch), 'w') as file:
        file.write(str(ref))

    # evaluate
    scores = evaluate(ref=ref, cand=cand, get_scores=True)
    for tag in tags:
        summary = tf.Summary()
        summary.value.add(tag=split + tag, simple_value=scores[tag])
        summary_writer.add_summary(summary, epoch)
    return scores
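
# A minimal standalone sketch (numpy only; 'pad_to_batches' is a hypothetical
# helper, not part of this repo) of the id-padding trick evaluate_on_split uses:
# the id list is repeated until it fills an integer number of batches, and the
# duplicates are harmless because cand/ref are dictionaries keyed by video id.
import numpy as np

def pad_to_batches(items, batch_size):
    num_iter = int(np.ceil(len(items) / float(batch_size)))
    padded = list(items)
    while len(padded) < num_iter * batch_size:
        padded += padded  # repeat the list until it is long enough
    return padded[:num_iter * batch_size], num_iter

# e.g. pad_to_batches(['v1', 'v2', 'v3'], 2) -> (['v1', 'v2', 'v3', 'v1'], 2)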
def val(self, epoch):
    self.atten_model.eval()
    # prepare val data: randomly choose a batch, compute the loss and generate sentences
    features_batch, image_files, cur_captions = sample_minibatch(
        self.val_data, self.args.batch_size)
    features = Variable(torch.from_numpy(features_batch)).cuda()
    caption_in = cur_captions[:, :16]
    caption_out = cur_captions[:, 1:]
    captions_batch_in = Variable(
        torch.from_numpy(caption_in).type(torch.LongTensor)).cuda()
    captions_batch_out = torch.from_numpy(caption_out)
    # mask out PAD (index 0) positions so they do not contribute to the loss
    mask = torch.ne(captions_batch_out, 0)
    mask = Variable(mask.type(torch.cuda.FloatTensor))
    captions_batch_out = Variable(captions_batch_out.type(torch.LongTensor)).cuda()
    loss = self.atten_model(captions_batch_in, captions_batch_out, features, mask)
    alpha_all, betas, sample_caption = self.atten_model.build_sample(features)
    decoded = decode_captions(np.squeeze(np.array(sample_caption.data)),
                              self.idx_to_word)
    alpha_all = np.squeeze(np.array(alpha_all.data))
    betas = np.squeeze(np.array(betas.data))
    cur_decoded = decode_captions(np.stack(cur_captions), self.idx_to_word)
    # dump qualitative samples every 10% of training (guard against epochs < 10)
    if epoch % max(1, int(self.args.epochs * 0.1)) == 0:
        file_decoded = {
            image_files[i]: (decoded[i], cur_decoded[i], alpha_all[i], betas[i])
            for i in range(self.args.batch_size)
        }
        val_samples_path = os.path.join(
            self.args.val_samples, 'val-' + str(epoch) + '-samples.pkl')
        save_pickle(file_decoded, val_samples_path)
    val_loss = torch.sum(loss) / self.args.batch_size
    # Save the model if the validation loss is the best we've seen so far.
    if not self.best_val_loss or val_loss.data[0] < self.best_val_loss:
        torch.save(self.atten_model.state_dict(), self.args.save)
        self.best_val_loss = val_loss.data[0]
        save_pickle(self.best_val_loss, self.args.loss_log)
        print 'best model saved'
    elif epoch != 0 and epoch % 100 == 0:
        # halve the learning rate if validation loss has stopped improving
        self.args.lr /= 2.0
    return self.args.lr, self.best_val_loss, decoded
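
# Minimal sketch of the input/target split and padding mask built in val(),
# assuming index 0 is PAD (which the mask computation above implies); toy values.
import numpy as np

captions = np.array([[1, 4, 7, 2, 0, 0]])  # START w1 w2 EOS PAD PAD
caption_in = captions[:, :-1]              # decoder input:  START w1 w2 EOS PAD
caption_out = captions[:, 1:]              # decoder target: w1 w2 EOS PAD PAD
mask = np.not_equal(caption_out, 0).astype(np.float32)  # 1 for real tokens, 0 for PAD
# the loss is summed over masked positions only, so PAD never contributes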
def test(self, save_sampled_captions=True, evaluate_score=True, generate_demo_sample=False):
    self.atten_model.eval()
    self.atten_model.load_state_dict(torch.load(self.args.save))
    self.atten_model.cuda()
    if save_sampled_captions:
        features = self.test_data['features']
        n_examples = features.shape[0]
        all_sam_cap = np.ndarray((n_examples, 20), dtype=np.int)
        test_times = int(np.ceil(float(n_examples) / self.args.batch_size))
        for t in range(test_times):
            features_batch = Variable(
                torch.from_numpy(
                    features[t * self.args.batch_size:(t + 1) * self.args.batch_size])).cuda()
            _, _, sampled_captions = self.atten_model.build_sample(features_batch)
            all_sam_cap[t * self.args.batch_size:(t + 1) * self.args.batch_size] = \
                np.array(sampled_captions.data)
        decoded = decode_captions(all_sam_cap, self.idx_to_word)
        save_pickle(decoded, self.args.test_samples)
        print 'test all successful'
    if evaluate_score:
        ref = load_pickle('./data/test/test.references.pkl')
        if not save_sampled_captions:
            # fall back to the captions saved by a previous run
            decoded = load_pickle(self.args.test_samples)
        try:
            evaluate(ref, decoded)
        except KeyboardInterrupt:
            decoded = load_pickle(self.args.test_samples)
            evaluate(ref, decoded)
    if generate_demo_sample:
        features = self.args.demo_feat
        features_batch = Variable(torch.from_numpy(features)).cuda()
        _, _, sampled_captions = self.atten_model.build_sample(features_batch)
        # convert the Variable to an array before decoding, as above
        decoded = decode_captions(np.array(sampled_captions.data), self.idx_to_word)
        print decoded
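
# Sketch of why the slice assignment above also handles the final partial
# batch: numpy shrinks both sides of the assignment to the remaining rows
# (toy sizes, not the repo's real data).
import numpy as np

n_examples, batch_size, max_len = 5, 2, 20
all_sam_cap = np.zeros((n_examples, max_len), dtype=np.int32)
test_times = int(np.ceil(float(n_examples) / batch_size))  # -> 3 iterations
for t in range(test_times):
    rows = min(batch_size, n_examples - t * batch_size)
    batch = np.ones((rows, max_len), dtype=np.int32)
    all_sam_cap[t * batch_size:(t + 1) * batch_size] = batch  # last slice has 1 row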
def main(argv):
    assert FLAGS.train_dir is not None, "train_dir is required"
    assert FLAGS.resnet_ckpt is not None, "resnet_ckpt is required"

    # data
    print('loading data...')
    (train_stems_list, train_stem_attrs_list, train_images,
     train_image2stem, train_stem2image) = utils.load_coco_data(config.data_root, 'train')
    (val_stems_list, val_stem_attrs_list, val_images,
     val_image2stem, val_stem2image) = utils.load_coco_data(config.data_root, 'val')

    # handle directories
    train_dir = os.path.join(config.model_root, FLAGS.train_dir)
    if not tf.gfile.IsDirectory(train_dir):
        tf.logging.info("Creating training directory: %s", train_dir)
        tf.gfile.MakeDirs(train_dir)
    log_dir = os.path.join(train_dir, 'log')
    if not tf.gfile.IsDirectory(log_dir):
        tf.logging.info("Creating log directory for training: %s", log_dir)
        tf.gfile.MakeDirs(log_dir)
    checkpoint = None
    if FLAGS.checkpoint is not None:
        checkpoint = os.path.join(config.model_root, FLAGS.checkpoint)
        assert os.path.exists(checkpoint), "checkpoint must exist if given."

    # model
    print('building model.')
    model = HierarchicalModel(config, mode=ModeKeys.TRAIN)
    loss = model.build()
    generator = CaptionGenerator(model, model.level1_word2ix, None,
                                 beam_size_1level=3, beam_size_2level=None,
                                 encourage_1level=0.0, encourage_2level=None,
                                 level2=False)

    # train_op
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='level1')
            if config.train_resnet:
                optim_vars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet')
            # gradients: drop None entries, then clip element-wise
            level1_grads = tf.gradients(loss, optim_vars)
            grads_and_vars = [(g, v) for g, v in zip(level1_grads, optim_vars)
                              if g is not None]
            grads_and_vars = [(tf.clip_by_value(grad, -0.1, 0.1), var)
                              for grad, var in grads_and_vars]
            # todo: check the batch-norm moving average/var here
            # if config.train_resnet:
            #     optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='resnet')
            #     resnet_grads = tf.gradients(model.resnet.features, optim_vars)
            #     resnet_pairs = [(i, j) for i, j in zip(resnet_grads, optim_vars) if i is not None]
            #     grads_and_vars.extend(resnet_pairs)
            batchnorm_updates = tf.get_collection('resnet_update_ops')
            batchnorm_updates_op = tf.group(*batchnorm_updates)
            apply_gradient_op = optimizer.apply_gradients(grads_and_vars=grads_and_vars)
            train_op = tf.group(apply_gradient_op, batchnorm_updates_op)

    # summary op
    tf.summary.scalar('batch_loss', loss)
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
    # for grad, var in grads_and_vars:
    #     tf.summary.histogram(var.op.name + '/gradient', grad)
    summary_op = tf.summary.merge_all()

    # stats
    n_examples = len(train_stems_list)
    n_examples_val = len(val_stems_list)
    n_iters_per_epoch = int(np.ceil(float(n_examples) / config.batch_size))
    n_iters_val = int(np.ceil(float(n_examples_val) / config.batch_size))
    print("The number of epochs: %d" % config.n_epochs)
    print("Data size: %d" % n_examples)
    print("Batch size: %d" % config.batch_size)
    print("Iterations per epoch: %d" % n_iters_per_epoch)

    # tf session
    config_ = tf.ConfigProto(allow_soft_placement=True)
    config_.gpu_options.per_process_gpu_memory_fraction = 0.6
    config_.gpu_options.allow_growth = True
    with tf.Session(config=config_) as sess:
        tf.global_variables_initializer().run()
        summary_writer = tf.summary.FileWriter(log_dir, graph=tf.get_default_graph())
        saver = tf.train.Saver(max_to_keep=40)

        # pretrained checkpoint
        if checkpoint is not None:
            print("Start training with checkpoint..")
            saver.restore(sess, checkpoint)

        # dynamic stats
        prev_loss_epo = np.inf
        curr_loss_epo = 0
        best_loss_val = np.inf
        curr_loss_val = 0
        i_global = 0
        start_t = time.time()

        for epo in range(config.n_epochs):
            # stochastic batching
            rand_idxs = list(np.random.permutation(n_examples))
            for it in range(n_iters_per_epoch):
                # next batch
                rand_idx = sorted(rand_idxs[it * config.batch_size:(it + 1) * config.batch_size])
                stems_batch, mask_batch = utils.list2batch(
                    [train_stems_list[i] for i in rand_idx])
                img_idx = train_stem2image[rand_idx]
                img_batch = utils.crop_image(train_images[img_idx], True)
                # print(decode_captions(captions_batch, model.level1_model.idx_to_word))
                feed_dict = {
                    model.level1_model.captions: stems_batch,
                    model.level1_model.mask: mask_batch,
                    model.level1_model.resnet.images: img_batch,
                    model.level1_model.resnet.is_training: config.train_resnet,
                    model.level1_model.keep_prob: 0.5
                }
                _, l = sess.run([train_op, loss], feed_dict)
                curr_loss_epo += l
                # print 'batch norm beta:', sess.run(test1)[:10]
                # print 'batch norm gamma:', sess.run(test2)[:10]

                # global iteration count
                i_global += 1

                # write summary for tensorboard visualization
                if it % config.log_freq == 0:
                    summary = sess.run(summary_op, feed_dict)
                    summary_writer.add_summary(summary, epo * n_iters_per_epoch + it)

                # periodic display
                if it % config.print_freq == 0:
                    print("\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f"
                          % (epo + 1, it + 1, l))
                    ground_truths = stems_batch[0]
                    decoded = utils.decode_captions(ground_truths,
                                                    model.level1_model.idx_to_word)
                    for j, gt in enumerate(decoded):
                        print("Ground truth %d: %s" % (j + 1, gt))
                    predicted = generator.beam_search(sess, img_batch[0:1, :, :, :])
                    print("Generated caption: %s\n" % predicted)

                # auto save
                if i_global % config.save_freq == 0:
                    saver.save(sess,
                               os.path.join(train_dir, 'model_level1_auto_save'),
                               global_step=i_global)
                    print("model-auto-%s saved." % i_global)

                # validate
                if i_global % config.valid_freq == 0:
                    curr_loss_val = 0
                    if config.print_bleu:
                        # TODO: some preparation for saving search results.
                        # all_gen_cap = np.ndarray((n_examples_val, 16))
                        pass
                    for it_val in range(n_iters_val):
                        idx_val = np.arange(it_val * config.batch_size,
                                            (it_val + 1) * config.batch_size)
                        stems_batch_val, mask_batch_val = utils.list2batch(
                            [val_stems_list[i] for i in idx_val])
                        img_idx_val = val_stem2image[idx_val]
                        img_batch_val = utils.crop_image(val_images[img_idx_val], False)
                        feed_dict_val = {
                            model.level1_model.captions: stems_batch_val,
                            model.level1_model.mask: mask_batch_val,
                            model.level1_model.resnet.images: img_batch_val,
                            model.level1_model.resnet.is_training: False,
                            model.level1_model.keep_prob: 1.0
                        }
                        curr_loss_val += sess.run(loss, feed_dict_val)
                        if config.print_bleu:
                            # TODO: beam search and evaluate BLEU.
                            pass
                    curr_loss_val /= n_iters_val
                    if curr_loss_val < best_loss_val:
                        # better model
                        best_loss_val = curr_loss_val
                        saver.save(sess,
                                   os.path.join(train_dir, 'model_level1_val'),
                                   global_step=i_global)
                        print('model-val-%s saved.' % i_global)
                    else:
                        # TODO: early stop checking.
                        pass
            # end of iteration loop

            curr_loss_epo /= n_iters_per_epoch
            # epoch summary
            print("Previous epoch loss: ", prev_loss_epo)
            print("Current epoch loss: ", curr_loss_epo)
            print("Elapsed time: ", time.time() - start_t)
            prev_loss_epo = curr_loss_epo
            curr_loss_epo = 0

            # save the model's parameters
            saver.save(sess,
                       os.path.join(train_dir, 'model_level1_epo'),
                       global_step=epo + 1)
            print("model-epo-%s saved." % (epo + 1))
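
# A toy TF1 sketch of the gradient handling in main(): drop None gradients,
# clip element-wise to [-0.1, 0.1], then apply. Standalone toy variables,
# not the repo's actual graph.
import tensorflow as tf

x = tf.Variable([1.0, -3.0])
toy_loss = tf.reduce_sum(tf.square(x))
opt = tf.train.AdamOptimizer(1e-3)
grads = tf.gradients(toy_loss, [x])                        # entries may be None
pairs = [(g, v) for g, v in zip(grads, [x]) if g is not None]
pairs = [(tf.clip_by_value(g, -0.1, 0.1), v) for g, v in pairs]
toy_train_op = opt.apply_gradients(pairs)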
def train(self):
    # train/val dataset
    train_caps, train_lengths, train_ids = self.data.captions['train'], \
        self.data.lengths['train'], self.data.video_ids['train']
    n_examples = len(train_caps)
    n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))
    tags = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'ROUGE_L']

    # build graphs for the training model and for sampling captions
    with tf.Graph().as_default():
        with tf.device('/cpu:0'):
            with tf.variable_scope(tf.get_variable_scope()) as vscope:
                tower_loss = []
                tower_grad = []
                tower_generated_cap = []

                # placeholders shared by all towers
                self.features = tf.placeholder(tf.float32, [None, self.L, self.D])
                self.captions = tf.placeholder(tf.int32, [None, self.max_words + 2])

                # create train_op, loss_op and generated_captions_op on each gpu
                for i in range(self.num_gpus):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('tower_%d' % i) as scope:
                            # slice out this tower's share of the batch
                            _feat_batch = self.features[
                                self.batch_size // self.num_gpus * i:
                                self.batch_size // self.num_gpus * (i + 1), :, :]
                            _cap_batch = self.captions[
                                self.batch_size // self.num_gpus * i:
                                self.batch_size // self.num_gpus * (i + 1), :]
                            # compute loss
                            one_loss = self.model.build_model(_feat_batch, _cap_batch)
                            tower_loss.append(one_loss)
                            # reuse variables across towers
                            tf.get_variable_scope().reuse_variables()
                            alphas, betas, generated_cap = self.model.build_sampler(
                                _feat_batch, max_len=self.max_words)
                            tower_generated_cap.append(generated_cap)
                            # compute gradients on this tower
                            var_list = tf.trainable_variables()
                            grad = tf.gradients(one_loss, var_list)
                            tower_grad.append(grad)

                # multi-gpu loss operation: average loss across towers
                loss_op = self.average_loss(tower_loss)
                # caption operation
                generated_caption_op = tf.concat(tower_generated_cap, 0)
                # average gradients across towers
                average_grad = self.average_gradients(tower_grad)

                # initialize optimizer with a piecewise-constant learning rate
                global_step = tf.Variable(0, trainable=False)
                increase_global_step_op = tf.assign(global_step, global_step + 1)
                boundaries = [10]
                values = [self.learning_rate, 0.1 * self.learning_rate]
                learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
                optimizer = self.optimizer(learning_rate=learning_rate,
                                           beta1=0.1, beta2=0.001)
                # train operation: apply the averaged gradients
                train_op = optimizer.apply_gradients(
                    zip(average_grad, tf.trainable_variables()))

                # summary ops
                tf.summary.scalar('learning_rate', learning_rate)
                tf.summary.scalar('batch_loss', loss_op)
                for var in tf.trainable_variables():
                    tf.summary.histogram(var.op.name, var)
                for grad, var in zip(average_grad, tf.trainable_variables()):
                    tf.summary.histogram(var.op.name + '/gradient', grad)
                summary_op = tf.summary.merge_all()

                # create session
                sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
                summary_writer = tf.summary.FileWriter(self.log_path, sess.graph)
                saver = tf.train.Saver(tf.global_variables())
                # initialize variables
                sess.run([tf.global_variables_initializer(),
                          tf.local_variables_initializer()])

                for epoch in range(self.n_epochs):
                    # shuffle train data
                    rand_idxs = np.random.permutation(n_examples)
                    train_caps = train_caps[rand_idxs]
                    train_ids = train_ids[rand_idxs]
                    train_lengths = train_lengths[rand_idxs]
                    for it in range(n_iters_per_epoch):
                        captions_batch = train_caps[it * self.batch_size:
                                                    (it + 1) * self.batch_size]
                        image_idxs_batch = train_ids[it * self.batch_size:
                                                     (it + 1) * self.batch_size]
                        # pad the last partial batch by wrapping around
                        if len(captions_batch) < self.batch_size:
                            l = len(captions_batch)
                            captions_batch = np.concatenate(
                                (captions_batch, train_caps[:self.batch_size - l]),
                                axis=0)
                            image_idxs_batch = np.concatenate(
                                (image_idxs_batch, train_ids[:self.batch_size - l]),
                                axis=0)
                        features_batch = [self.data.feature(vid)
                                          for vid in image_idxs_batch]
                        feed_dict = {
                            self.features: features_batch,
                            self.captions: captions_batch
                        }
                        _, loss, summary_str = sess.run(
                            (train_op, loss_op, summary_op), feed_dict=feed_dict)
                        # print epoch, it, loss
                        summary_writer.add_summary(summary_str,
                                                   epoch * n_iters_per_epoch + it)
                        if (it + 1) % self.print_every == 0:
                            print "\nTrain loss at epoch %d & iteration %d (mini-batch): %.5f" % (
                                epoch + 1, it + 1, loss)
                            ground_truths = train_caps[train_ids == image_idxs_batch[0]]
                            decoded = decode_captions(ground_truths[:, 1:],
                                                      self.data.vocab.idx2word)
                            for j, gt in enumerate(decoded):
                                print "Ground truth %d: %s" % (j + 1, gt.encode('utf-8'))
                            gen_caps = sess.run(generated_caption_op, feed_dict)
                            decoded = decode_captions(gen_caps, self.data.vocab.idx2word)
                            print "Generated caption: %s\n" % decoded[0]

                    # evaluate on all splits and log the scores
                    self.evaluate_on_split(sess=sess,
                                           generated_captions=generated_caption_op,
                                           summary_writer=summary_writer,
                                           epoch=epoch, tags=tags, split='train')
                    scores = self.evaluate_on_split(sess=sess,
                                                    generated_captions=generated_caption_op,
                                                    summary_writer=summary_writer,
                                                    epoch=epoch, tags=tags, split='val')
                    write_bleu(scores=scores, path=self.model_path, epoch=epoch)
                    self.evaluate_on_split(sess=sess,
                                           generated_captions=generated_caption_op,
                                           summary_writer=summary_writer,
                                           epoch=epoch, tags=tags, split='test')

                    # save model
                    saver.save(sess, os.path.join(self.model_path, 'model'),
                               global_step=epoch + 1)
                    print "model-%s saved." % (epoch + 1)
                    # increase global step, which is used to decay the learning rate
                    sess.run(increase_global_step_op)
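
# A plausible sketch of what self.average_gradients does with the per-tower
# gradient lists collected above (the repo's real helper may differ): each
# tower's grads are aligned with tf.trainable_variables(), so averaging is
# element-wise across towers.
import tensorflow as tf

def average_gradients_sketch(tower_grads):
    # tower_grads: list (one entry per GPU) of lists of gradient tensors
    averaged = []
    for grads in zip(*tower_grads):  # grads for one variable across all towers
        valid = [g for g in grads if g is not None]
        averaged.append(tf.add_n(valid) / float(len(valid)))
    return averaged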
def test(self, split='train', save_sampled_captions=True):
    '''
    Args:
        - data: dictionary with the following keys:
            - features: feature vectors of shape (5000, 196, 512)
            - file_names: image file names of shape (5000,)
            - captions: captions of shape (24210, 17)
            - image_idxs: indices mapping caption to image, of shape (24210,)
            - features_to_captions: mapping from feature to captions (5000, 4~5)
        - split: 'train', 'val' or 'test'
        - save_sampled_captions: if True, save sampled captions to a pkl file
          for computing BLEU scores.
    '''
    caps = self.data.captions[split]
    ids = self.data.video_ids[split]
    unique_ids = list(set(ids))
    n_examples = len(unique_ids)
    n_iters_per_epoch = int(np.ceil(float(n_examples) / self.batch_size))

    # build a graph to sample captions
    alphas, betas, sampled_captions = self.model.build_sampler(
        max_len=self.max_words)  # (N, max_len, L), (N, max_len)

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    all_decoded = []
    with tf.Session(config=config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, self.test_model)
        for i in range(n_iters_per_epoch):
            ids_batch = unique_ids[i * self.batch_size:(i + 1) * self.batch_size]
            features_batch = [self.data.feature(vid) for vid in ids_batch]
            features_batch = np.asarray(features_batch)
            feed_dict = {self.model.features: features_batch}
            alps, bts, sam_cap = sess.run(
                [alphas, betas, sampled_captions],
                feed_dict)  # (N, max_len, L), (N, max_len)
            decoded = decode_captions(sam_cap, self.data.vocab.idx2word)
            all_decoded.extend(decoded)

    # generate ref and cand dicts keyed by video id
    ref = {}
    cand = {}
    for vid, dec in zip(unique_ids, all_decoded):
        gts = decode_captions(caps[ids == vid][:, 1:], self.data.vocab.idx2word)
        ref[vid] = gts
        cand[vid] = [dec]

    # print ground truths and generated sentences
    for vid in unique_ids:
        print '---' * 10
        for i, gt in enumerate(ref[vid]):
            print i + 1, ':', gt
        print 'generated :', cand[vid][0]

    scores = evaluate(ref, cand, get_scores=True)
    tags = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'ROUGE_L']
    for tag in tags:
        print tag, ':', scores[tag]
    print split, len(unique_ids), len(all_decoded)
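
# Shape of the ref/cand dicts consumed by evaluate(), built as above
# (toy values; the real keys are video ids):
ref = {'vid1': ['a man is cooking', 'a person cooks food']}  # all ground truths
cand = {'vid1': ['a man is cooking food']}                   # one generated caption
# evaluate(ref=ref, cand=cand, get_scores=True) then returns a dict keyed by
# metric names such as 'Bleu_4' and 'CIDEr'.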
def beam_search(self, sess, img):
    """
    Params:
        :sess: tf session
        :img: image of shape (1, width, height, channels)
    Returns:
        the top-ranked decoded (literal) sentence when level-2 is enabled;
        otherwise the top-ranked sentence of word indices (not decoded).
    """
    resnet = self.model.resnet
    level1 = self.model.level1_model

    # feed the image into resnet and get image features
    image_features = sess.run(resnet.features, feed_dict={resnet.images: img})

    # level1 (skeleton)
    # initialize for beam search
    (init_c, init_h, features_encode, features_proj) = sess.run(
        [level1.init_c, level1.init_h, level1.features_encode, level1.features_proj],
        feed_dict={level1.image_features: image_features})
    initial_beam = Caption(sentence=[self.vocab_1level['START']],
                           c=init_c, h=init_h, logprob=0.0, score=0.0,
                           embeds=[], contexts=[], hiddens=[])
    partial_captions = TopN(self.beam_size_1level)
    partial_captions.push(initial_beam)
    complete_captions = TopN(self.beam_size_1level)

    # run beam search
    for t in range(self.max_caption_length_1level):
        partial_captions_list = partial_captions.extract()
        partial_captions.reset()
        input_feed = np.array([c.sentence[-1] for c in partial_captions_list])
        h_feed = np.reshape(np.array([c.h for c in partial_captions_list]),
                            (-1, level1.dim_hid))
        c_feed = np.reshape(np.array([c.c for c in partial_captions_list]),
                            (-1, level1.dim_hid))
        (c, h, log_softmax, alpha, context) = sess.run(
            [level1.c, level1.h, level1.log_softmax, level1.alpha, level1.context4next],
            feed_dict={level1.c_feed: c_feed,
                       level1.h_feed: h_feed,
                       level1.in_word: input_feed,
                       level1.image_features: image_features})
        for i, partial_caption in enumerate(partial_captions_list):
            word_probabilities = log_softmax[i]
            # add a log-prob bonus to all non-special words
            word_probabilities[2:] += self.encourage_1level
            # for this partial caption, get the beam_size most probable next words
            words_and_probs = list(enumerate(word_probabilities))
            words_and_probs.pop(level1._start)  # exclude START
            words_and_probs.sort(key=lambda x: -x[1])
            words_and_probs = words_and_probs[:self.beam_size_1level]
            # each next word gives a new partial caption
            for w, logp in words_and_probs:
                if self.level2:
                    embed = sess.run(level1.embed4next,
                                     feed_dict={level1.word_feed: np.array([w])})
                else:
                    embed = None
                sentence = partial_caption.sentence + [w]
                logprob = partial_caption.logprob + logp
                score = logprob
                if w == level1.word_to_idx['EOS']:
                    if self.length_normalization_factor > 0:
                        score /= len(sentence) ** self.length_normalization_factor
                    beam = Caption(sentence, c[i], h[i], logprob, score,
                                   partial_caption.embeds,
                                   partial_caption.contexts,
                                   partial_caption.hiddens)
                    complete_captions.push(beam)
                else:
                    beam = Caption(sentence, c[i], h[i], logprob, score,
                                   partial_caption.embeds + [embed],
                                   partial_caption.contexts + [context[i]],
                                   partial_caption.hiddens + [h[i]])
                    partial_captions.push(beam)
        if partial_captions.size() == 0:
            # we have run out of partial candidates; happens when beam_size = 1
            break
    if not complete_captions.size():
        complete_captions = partial_captions
    level1_top_captions = complete_captions.extract(sort=True)

    full_sentence = []
    # level2 can be excluded for analysis
    if self.level2:
        level2 = self.model.level2_model
        # level2 (attributes)
        for caption in level1_top_captions:
            # for each caption (only one sentence)
            sentence_level1 = caption.sentence
            embeds, contexts, hiddens = caption.embeds, caption.contexts, caption.hiddens
            # only take the best skeleton generated from level1,
            # split into a word sequence (be careful!)
            sent_level1 = utils.decode_captions(
                np.squeeze(np.asarray(sentence_level1)), level1.idx_to_word)[0]
            words_level1 = sent_level1.split(' ')
            attrs_level2 = []
            # iterate over the whole sentence word by word
            for t_level1 in range(len(embeds)):
                # initialize for beam search
                embed = np.reshape(embeds[t_level1], (1, -1))
                context = np.reshape(contexts[t_level1], (1, -1))
                hidden = np.reshape(hiddens[t_level1], (1, -1))
                (init_c, init_h) = sess.run(
                    [level2.init_c, level2.init_h],
                    feed_dict={level2.embedding: embed,
                               level2.context: context,
                               level2.hidden: hidden})
                initial_beam = Caption(sentence=[self.vocab_2level['START']],
                                       c=init_c, h=init_h, logprob=0.0, score=0.0,
                                       info=False)
                partial_captions = TopN(self.beam_size_2level)
                partial_captions.push(initial_beam)
                complete_captions = TopN(self.beam_size_2level)
                # run beam search
                for t in range(self.max_caption_length_2level):
                    partial_captions_list = partial_captions.extract()
                    partial_captions.reset()
                    input_feed = np.array([c.sentence[-1] for c in partial_captions_list])
                    h_feed = np.reshape(np.array([c.h for c in partial_captions_list]),
                                        (-1, level2.dim_hid))
                    c_feed = np.reshape(np.array([c.c for c in partial_captions_list]),
                                        (-1, level2.dim_hid))
                    (c, h, log_softmax) = sess.run(
                        [level2.c, level2.h, level2.log_softmax],
                        feed_dict={level2.c_feed: c_feed,
                                   level2.h_feed: h_feed,
                                   level2.in_word: input_feed})
                    for i, partial_caption in enumerate(partial_captions_list):
                        word_probabilities = log_softmax[i]
                        word_probabilities[2:] += self.encourage_2level
                        words_and_probs = list(enumerate(word_probabilities))
                        words_and_probs.pop(level2._start)  # exclude START
                        words_and_probs.sort(key=lambda x: -x[1])
                        words_and_probs = words_and_probs[:self.beam_size_2level]
                        for w, logp in words_and_probs:
                            sentence = partial_caption.sentence + [w]
                            logprob = partial_caption.logprob + logp
                            score = logprob
                            if w == level2.word_to_idx['EOS']:
                                if self.length_normalization_factor > 0:
                                    score /= len(sentence) ** self.length_normalization_factor
                                beam = Caption(sentence, c[i], h[i], logprob, score,
                                               info=False)
                                complete_captions.push(beam)
                            else:
                                beam = Caption(sentence, c[i], h[i], logprob, score,
                                               info=False)
                                partial_captions.push(beam)
                    if partial_captions.size() == 0:
                        break
                if not complete_captions.size():
                    complete_captions = partial_captions
                # exclude START; only the top-ranked attr is used.
                # attr ~ list([str <x1>])
                attr = utils.decode_captions(
                    np.squeeze(np.asarray(
                        complete_captions.extract(sort=True)[0].sentence))[1:],
                    level2.idx_to_word)
                # append str to list
                attrs_level2.extend(attr)
            # interleave: an attribute (if any) precedes its skeleton word
            full_sentence.append(' '.join(
                [i + ' ' + j if i != '' else j
                 for (j, i) in zip(words_level1, attrs_level2)]))
    else:
        # exclude START
        full_sentence = [i.sentence[1:] for i in level1_top_captions]
        full_sentence = utils.decode_captions(np.asarray(full_sentence),
                                              level1.idx_to_word)
    # only return the top-ranked stem with attributes
    return full_sentence[0]
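
# Minimal sketch of the attribute/skeleton merge at the end of beam_search,
# assuming '' marks "no attribute for this slot" (toy values):
words_level1 = ['man', 'riding', 'horse']
attrs_level2 = ['young', '', 'brown']
merged = ' '.join([a + ' ' + w if a != '' else w
                   for (w, a) in zip(words_level1, attrs_level2)])
# -> 'young man riding brown horse'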