def eval_split(model_cnn, model, filepaths, ix_to_word, eval_kwargs={}):
    """Caption a single batch of image files and return the sentences.

    Args:
        model_cnn: Callable image encoder. Returns fc features alone when
            ``models.is_only_fc_feat(caption_model)`` is true, otherwise an
            ``(fc_feats, att_feats)`` pair.
        model: Caption model whose ``sample(...)`` returns a ``(seq, _)`` pair.
        filepaths: Image locations handed to ``get_batch``.
        ix_to_word: Vocabulary passed to ``utils.decode_sequence``.
        eval_kwargs: Read-only options: ``beam_size`` (default 1),
            ``caption_model`` (default ``''``) and ``batch_size`` (default 1).
            ``verbose_eval`` is accepted but has no effect in this function;
            every decoded sentence is printed.

    Returns:
        list: The decoded sentences with all whitespace removed.
    """
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    batch = get_batch(filepaths, batch_size)

    # Move the raw image array to the GPU, normalise it, and wrap it so that
    # no gradients are tracked.
    pixels = torch.from_numpy(batch['images']).cuda()
    pixels = Variable(utils.prepro_norm(pixels, False), requires_grad=False)

    sample_opts = {'beam_size': beam_size}
    if models.is_only_fc_feat(caption_model):
        fc_feats = model_cnn(pixels)
        seq, _ = model.sample(fc_feats, sample_opts)
    else:
        fc_feats, att_feats = model_cnn(pixels)
        seq, _ = model.sample(fc_feats, att_feats, sample_opts)

    predictions = []
    for caption in utils.decode_sequence(ix_to_word, seq):
        print(caption)
        # Drop every whitespace character before storing the sentence.
        predictions.append(''.join(caption.split()))

    return predictions
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    """Caption every image served by ``loader`` and collect the predictions.

    Args:
        model_cnn: Callable image encoder. Returns fc features alone when
            ``models.is_only_fc_feat(caption_model)`` is true, otherwise an
            ``(fc_feats, att_feats)`` pair.
        model: Caption model whose ``sample(...)`` returns a ``(seq, _)`` pair.
        loader: Data source providing ``reset_iterator``, ``get_vocab`` and
            ``get_batch``. Each batch must carry ``'images'``, ``'infos'`` and
            ``'bounds'`` entries.
        eval_kwargs: Read-only options: ``verbose_eval`` (default True),
            ``beam_size`` (default 1), ``caption_model`` (default ``''``) and
            ``batch_size`` (default 1).

    Returns:
        list: One ``{'image_id': int, 'caption': str}`` dict per image. The
        id is the third ``'_'``-separated field of the info ``'id'`` string.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    split = ''
    loader.reset_iterator(split)
    seen = 0
    results = []
    vocab = loader.get_vocab()

    while True:
        batch = loader.get_batch(split, batch_size)
        seen += batch_size

        pixels = torch.from_numpy(batch['images']).cuda()
        pixels = Variable(utils.prepro_norm(pixels, False),
                          requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(pixels)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(pixels)
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        captions = utils.decode_sequence(vocab, seq)
        for idx, caption in enumerate(captions):
            raw_id = batch['infos'][idx]['id']
            record = {'image_id': int(raw_id.split('_')[2]),
                      'caption': caption}
            results.append(record)
            if verbose_eval:
                print('image %s: %s' % (record['image_id'], record['caption']))

        bounds = batch['bounds']
        position = bounds['it_pos_now']
        total = bounds['it_max']

        # The final batch may run past the end of the split; discard the
        # surplus entries so at most ``total`` predictions remain.
        for _ in range(seen - total):
            results.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d' %
                  (position - 1, total))

        if bounds['wrapped']:
            break

    return results
Exemple #3
0
def train_cnn(model_cnn, images, bus, fc_expander, att_expander, bu_expander,
              use_reinforce):
    """Run the CNN on ``images`` and return the features the caption model needs.

    Which features are produced depends on the module-level
    ``opt.caption_model``:

    * fc-only models: ``model_cnn`` returns fc features.
    * att-only models: ``model_cnn`` returns att features.
    * sub-region-bu models: ``model_cnn`` returns ``(fc, att, bu)``.
    * everything else: ``model_cnn`` returns ``(fc, att)``.

    For models where ``models.has_bu`` is true, the externally supplied
    ``bus`` is used as the bu features.

    Args:
        model_cnn: Callable image encoder.
        images: Input passed straight to ``model_cnn``.
        bus: Externally supplied bu features (used only when
            ``models.has_bu(opt.caption_model)``).
        fc_expander: Callable applied to fc features when expanding.
        att_expander: Callable applied to att features when expanding.
        bu_expander: Callable applied to bu features when expanding.
        use_reinforce: When true, features are never expanded.

    Returns:
        tuple: ``(fc_feats, att_feats, bu_feats)``; entries the model does not
        use are ``None``.
    """
    fc_feats = None
    att_feats = None
    bu_feats = None

    # train cnn
    if models.is_only_fc_feat(opt.caption_model):
        fc_feats = model_cnn(images)
    elif models.is_only_att_feat(opt.caption_model):
        att_feats = model_cnn(images)
    elif models.has_sub_region_bu(opt.caption_model):
        fc_feats, att_feats, bu_feats = model_cnn(images)
    else:
        fc_feats, att_feats = model_cnn(images)

    if models.has_bu(opt.caption_model):
        # Fix: previously ``bus`` was stored in a dead local and only reached
        # the caller when expansion happened, so ``bu_feats`` came back as
        # ``None`` for seq_per_img == 1 or reinforce training.
        bu_feats = bus

    # Expansion applies only outside reinforce training and only when more
    # than one sequence per image is configured.
    if opt.seq_per_img > 1 and not use_reinforce:
        if fc_feats is not None:
            fc_feats = fc_expander(fc_feats)
        if att_feats is not None:
            att_feats = att_expander(att_feats)
        if bu_feats is not None:
            bu_feats = bu_expander(bu_feats)

    return fc_feats, att_feats, bu_feats
Exemple #4
0
def compute_output(caption_model, beam_size, model, fc_feats, att_feats,
                   bu_feats):
    """Call ``model.sample`` with the feature set ``caption_model`` expects.

    Args:
        caption_model: Model identifier inspected by the ``models`` predicates.
        beam_size: Forwarded to ``model.sample`` as ``{'beam_size': ...}``.
        model: Caption model exposing ``sample``.
        fc_feats: fc features (ignored by att-only models).
        att_feats: att features (ignored by fc-only models).
        bu_feats: bu features (only passed to bu / sub-region-bu /
            prob-weight-mul-out models).

    Returns:
        Whatever ``model.sample`` returns, unmodified.
    """
    if models.is_only_fc_feat(caption_model):
        feats = (fc_feats,)
    elif models.is_only_att_feat(caption_model):
        feats = (att_feats,)
    elif (models.has_bu(caption_model)
          or models.has_sub_region_bu(caption_model)
          or models.is_prob_weight_mul_out(caption_model)):
        feats = (fc_feats, att_feats, bu_feats)
    else:
        feats = (fc_feats, att_feats)

    # The options dict is always the last positional argument.
    call_args = feats + ({'beam_size': beam_size},)
    return model.sample(*call_args)
def train_normal(params, opt):
    """Run one supervised forward/backward step and return the loss.

    Args:
        params: Dict with ``'model'``, ``'fc_feats'``, ``'att_feats'``,
            ``'labels'``, ``'targets'``, ``'masks'`` and ``'crit'``;
            ``'bu_feats'`` is additionally read for models where
            ``models.has_bu`` is true.
        opt: Options object providing ``caption_model`` and ``verbose``.

    Returns:
        tuple: ``(train_loss, reward_mean)`` where ``reward_mean`` is always 0.
    """
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    targets = params['targets']
    masks = params['masks']
    crit = params['crit']

    # forward
    start = time.time()
    if models.is_transformer(opt.caption_model):
        output = model(att_feats, targets, masks)
    elif models.is_ctransformer(opt.caption_model):
        output = model(fc_feats, att_feats, targets, masks)
    elif models.is_only_fc_feat(opt.caption_model):
        output = model(fc_feats, labels)
    elif models.is_only_att_feat(opt.caption_model):
        output = model(att_feats, labels)
    elif models.has_bu(opt.caption_model):
        bu_feats = params['bu_feats']
        output = model(fc_feats, att_feats, bu_feats, labels)
    else:
        output = model(fc_feats, att_feats, labels)

    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))

    # compute the loss
    start = time.time()

    # Prob-weight models return a sequence; only its first element is fed to
    # the criterion.
    if models.is_prob_weight(opt.caption_model):
        output = output[0]

    loss = crit(output, labels, masks)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))

    # backward
    start = time.time()
    loss.backward()
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))

    # PyTorch >= 0.4 produces 0-dim loss tensors for which ``loss.data[0]`` is
    # invalid; ``item()`` is the supported accessor. Older releases lack
    # ``item()``, so fall back to the legacy indexing there.
    train_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]
    reward_mean = 0

    return train_loss, reward_mean
def train_mix(params, iteration, opt):
    """Alternate between policy-gradient and supervised training steps.

    Odd iterations delegate to ``crit_pg.forward_backward``; even iterations
    compute the ``crit`` loss and back-propagate it.

    Args:
        params: Dict with ``'model'``, ``'fc_feats'``, ``'att_feats'``,
            ``'labels'``, ``'masks'``, ``'vocab'``, ``'crit_pg'`` and
            ``'crit'``.
        iteration: Current iteration number; its parity selects the objective.
        opt: Options object providing ``caption_model`` and ``verbose``.

    Returns:
        tuple: ``(train_loss, reward_mean)``; ``reward_mean`` is 0 on
        supervised (even) iterations.
    """
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    masks = params['masks']
    vocab = params['vocab']
    crit_pg = params['crit_pg']
    crit = params['crit']

    # forward
    # Fix: the forward pass used to run only on even iterations, so the
    # policy-gradient criterion received ``output=None`` on odd ones.
    start = time.time()
    if models.is_only_fc_feat(opt.caption_model):
        output = model(fc_feats, labels)
    else:
        output = model(fc_feats, att_feats, labels)

    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))

    if iteration % 2 == 1:
        # The criterion performs its own backward pass.
        train_loss, reward_mean = crit_pg.forward_backward(
            output, labels, masks, vocab)
        return train_loss, reward_mean

    # compute the loss
    start = time.time()
    loss = crit(output, labels, masks)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))

    # backward
    start = time.time()
    loss.backward()
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))

    # PyTorch >= 0.4 produces 0-dim loss tensors for which ``loss.data[0]`` is
    # invalid; fall back to the legacy indexing only when ``item()`` is absent.
    train_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]
    reward_mean = 0
    return train_loss, reward_mean
Exemple #7
0
def compute_cnn_feats(caption_model, model_cnn, images):
    """Run ``model_cnn`` and unpack its output according to ``caption_model``.

    Args:
        caption_model: Model identifier inspected by the ``models`` predicates.
        model_cnn: Callable image encoder.
        images: Input passed straight to ``model_cnn``.

    Returns:
        tuple: ``(fc_feats, att_feats, bu_feats)``; entries the model does not
        produce are ``None``.
    """
    if models.is_only_fc_feat(caption_model):
        return model_cnn(images), None, None

    if models.is_only_att_feat(caption_model):
        return None, model_cnn(images), None

    # Only prob-weight models with sub-region bu get three outputs from the
    # CNN; "SCST" is checked first and always takes the two-output path.
    three_outputs = (caption_model != "SCST"
                     and models.is_prob_weight(caption_model)
                     and models.has_sub_region_bu(caption_model))
    if three_outputs:
        fc_feats, att_feats, bu_feats = model_cnn(images)
        return fc_feats, att_feats, bu_feats

    fc_feats, att_feats = model_cnn(images)
    return fc_feats, att_feats, None
def train_actor_critic(params, opt, type, retain_graph=False):
    """Sample from the model and run one critic or actor-critic update.

    Args:
        params: Dict with ``'model'``, ``'fc_feats'``, ``'att_feats'`` and
            ``'gts'``; plus ``'crit_c'`` (``type == 0``) or ``'crit_ac'``
            (``type == 1``), and ``'bu_feats'`` for bu / sub-region-bu models.
        opt: Options object providing ``caption_model`` and ``verbose``.
        type: 0 trains the critic only, 1 trains critic and actor.
        retain_graph: Forwarded to ``loss.backward``.

    Returns:
        tuple: ``(train_loss, reward_mean, sample_mean)``.

    Raises:
        ValueError: If ``type`` is neither 0 nor 1.
    """
    # Fail fast: an unsupported ``type`` previously surfaced only as an
    # UnboundLocalError after the (expensive) sampling pass had already run.
    if type not in (0, 1):
        raise ValueError('unsupported type: %r (expected 0 or 1)' % (type,))

    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    gts = params['gts']
    criterion = params['crit_c'] if type == 0 else params['crit_ac']

    # forward
    start = time.time()
    if models.is_only_fc_feat(opt.caption_model):
        sample_seq, sample_seqLogprobs, sample_value = model.sample(
            fc_feats, {'sample_max': 0})
    elif models.has_bu(opt.caption_model) or models.has_sub_region_bu(
            opt.caption_model):
        bu_feats = params['bu_feats']
        sample_seq, sample_seqLogprobs, sample_value = model.sample(
            fc_feats, att_feats, bu_feats, {'sample_max': 0})
    else:
        # Indexing (rather than unpacking) tolerates extra trailing outputs.
        sample_output = model.sample(fc_feats, att_feats, {'sample_max': 0})
        sample_seq = sample_output[0]
        sample_seqLogprobs = sample_output[1]
        sample_value = sample_output[2]

    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))

    # compute the loss
    start = time.time()
    if type == 0:
        # critic only
        loss, reward_mean, sample_mean = criterion(sample_seq, sample_value,
                                                   gts)
    else:
        # critic and actor
        loss, reward_mean, sample_mean = criterion(sample_seq,
                                                   sample_seqLogprobs,
                                                   sample_value, gts)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))

    # backward
    start = time.time()
    loss.backward(retain_graph=retain_graph)
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))

    # PyTorch >= 0.4 produces 0-dim loss tensors for which ``loss.data[0]`` is
    # invalid; fall back to the legacy indexing only when ``item()`` is absent.
    train_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]

    return train_loss, reward_mean, sample_mean
def train_reinforce(params, opt):
    """Run one self-critical training step (``opt.reinforce_type == 1``).

    The model is sampled twice with the same features: first with
    ``sample_max=0``, then with ``sample_max=1`` (the "greedy" pass). Both
    sequences go to ``crit_rl`` and the resulting loss is back-propagated.

    Args:
        params: Dict with ``'model'``, ``'fc_feats'``, ``'att_feats'``,
            ``'masks'``, ``'crit_rl'`` and ``'gts'``; plus ``'bu_feats'`` for
            bu / sub-region-bu models.
        opt: Options object providing ``reinforce_type``, ``caption_model``
            and ``verbose``.

    Returns:
        tuple: ``(train_loss, reward_mean, sample_mean, greedy_mean)``.

    Raises:
        Exception: If ``opt.reinforce_type`` is 0 (deprecated).
        ValueError: If ``opt.reinforce_type`` is neither 0 nor 1.
    """
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    masks = params['masks']
    crit_rl = params['crit_rl']
    gts = params['gts']

    if opt.reinforce_type == 0:
        # The old policy-gradient path was unreachable code behind this raise
        # and has been removed.
        raise Exception('reinforce_type error, 0 is deprecated')
    if opt.reinforce_type != 1:
        # Previously this fell through to an UnboundLocalError at ``return``.
        raise ValueError(
            'unsupported reinforce_type: %r' % (opt.reinforce_type,))

    # self-critical
    # forward
    start = time.time()
    if models.is_only_fc_feat(opt.caption_model):
        feats = (fc_feats,)
    elif models.is_only_att_feat(opt.caption_model):
        feats = (att_feats,)
    elif models.has_bu(opt.caption_model) or models.has_sub_region_bu(
            opt.caption_model):
        feats = (fc_feats, att_feats, params['bu_feats'])
    else:
        feats = (fc_feats, att_feats)

    # Order matters: the sampled pass runs before the greedy pass, exactly as
    # before. Each call gets its own options dict.
    sample_output = model.sample(*(feats + ({'sample_max': 0},)))
    greedy_output = model.sample(*(feats + ({'sample_max': 1},)))

    # Only the sequence and log-probs are needed; indexing tolerates models
    # that return additional trailing outputs.
    sample_seq = sample_output[0]
    sample_seqLogprobs = sample_output[1]
    greedy_seq = greedy_output[0]

    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))

    # compute the loss
    start = time.time()
    loss, reward_mean, sample_mean, greedy_mean = crit_rl(
        sample_seq, sample_seqLogprobs, greedy_seq, gts, masks)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))

    # backward
    start = time.time()
    loss.backward()
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))

    # PyTorch >= 0.4 produces 0-dim loss tensors for which ``loss.data[0]`` is
    # invalid; fall back to the legacy indexing only when ``item()`` is absent.
    train_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]

    return train_loss, reward_mean, sample_mean, greedy_mean
Exemple #10
0
def eval_split_only(model_cnn, model, crit, loader, eval_kwargs={}):
    """Caption a data split and optionally score the captions.

    Both networks are switched to eval mode for the duration of the call and
    back to train mode before returning.

    Args:
        model_cnn: Callable image encoder with ``eval()`` / ``train()``.
        model: Caption model with ``sample``, ``eval()`` and ``train()``.
        crit: Unused; kept for signature compatibility.
        loader: Data source providing ``reset_iterator``, ``get_vocab`` and
            ``get_batch``. Each batch must carry ``'images'``, ``'infos'`` and
            ``'bounds'`` entries.
        eval_kwargs: Read-only options: ``verbose_eval`` (True),
            ``val_images_use`` (-1, meaning no limit), ``split`` ('val'),
            ``language_eval`` (1), ``dataset`` ('coco'), ``beam_size`` (1),
            ``coco_caption_path`` ('coco-caption'), ``caption_model`` ('')
            and ``batch_size`` (2).

    Returns:
        tuple: ``(0, predictions, lang_stats, str_stats)``. The first element
        is a constant placeholder. ``lang_stats`` and ``str_stats`` are
        ``None`` when ``language_eval`` is not 1.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    coco_caption_path = eval_kwargs.get('coco_caption_path', 'coco-caption')
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 2)

    # Make sure in the evaluation mode
    model_cnn.eval()
    model.eval()

    loader.reset_iterator(split)

    n = 0
    predictions = []
    vocab = loader.get_vocab()
    while True:
        data = loader.get_batch(split, batch_size)
        n = n + batch_size

        images = data['images']

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        elif models.is_only_att_feat(caption_model):
            att_feats = model_cnn(images)
            seq, _ = model.sample(att_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        sents = utils.decode_sequence(vocab, seq)

        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']

        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        # The last batch may overshoot the limit; drop the surplus entries.
        for _ in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d' %
                  (ix0 - 1, ix1))

        if data['bounds']['wrapped']:
            break
        # Fix: with the default val_images_use == -1 ("use everything") the
        # unguarded ``n >= val_images_use`` was always true and stopped the
        # evaluation after the very first batch.
        if val_images_use != -1 and n >= val_images_use:
            break

    # Fix: these were unbound (UnboundLocalError at return) whenever
    # language evaluation was switched off.
    lang_stats = None
    str_stats = None
    if lang_eval == 1:
        lang_stats, str_stats = language_eval(dataset, predictions,
                                              coco_caption_path)

    # Switch back to training mode
    model_cnn.train()
    model.train()

    return 0, predictions, lang_stats, str_stats
Exemple #11
0
def compute_loss(crit, model, caption_model, seq_per_img, fc_expander,
                 att_expander, bu_expander, fc_feats, att_feats, bu_feats,
                 labels, masks, tokens):
    """Run the caption model, compute the loss, back-propagate, return the loss.

    Features are passed through their expander when ``seq_per_img > 1`` and
    used as-is otherwise. The model call depends on ``caption_model``:

    * fc-only: ``model(fc, labels)``
    * att-only: ``model(att, labels)``
    * ``"SCST"``: ``model(fc, att, labels, "train")`` (second output dropped)
    * otherwise: ``model(fc, att[, bu], labels)``, where bu features are
      included for ``has_bu`` models and for prob-weight models with
      sub-region bu. Prob-weight and prob-weight-mul-out models return
      ``(outputs, prob_w)``.

    Args:
        crit: Loss callable; called as ``crit(outputs, labels, masks)`` or,
            for prob-weight(-mul-out) models, with ``prob_w, tokens`` appended.
        model: Caption model.
        caption_model: Model identifier inspected by the ``models`` predicates.
        seq_per_img: Expansion is applied only when this is greater than 1.
        fc_expander: Callable expanding fc features.
        att_expander: Callable expanding att features.
        bu_expander: Callable expanding bu features.
        fc_feats: fc features.
        att_feats: att features.
        bu_feats: bu features.
        labels: Passed to the model and to ``crit``.
        masks: Passed to ``crit``.
        tokens: Passed to ``crit`` for prob-weight(-mul-out) models only.

    Returns:
        The scalar loss value.
    """

    def _maybe_expand(expander, feats):
        # Expand the features only when seq_per_img > 1.
        return expander(feats) if seq_per_img > 1 else feats

    is_prob_weight = models.is_prob_weight(caption_model)
    returns_prob_w = (is_prob_weight
                      or models.is_prob_weight_mul_out(caption_model))
    prob_w = None

    if models.is_only_fc_feat(caption_model):
        batch_outputs = model(_maybe_expand(fc_expander, fc_feats), labels)
    elif models.is_only_att_feat(caption_model):
        batch_outputs = model(_maybe_expand(att_expander, att_feats), labels)
    else:
        fc_feats_ext = _maybe_expand(fc_expander, fc_feats)
        att_feats_ext = _maybe_expand(att_expander, att_feats)

        if caption_model == "SCST":
            batch_outputs, _ = model(fc_feats_ext, att_feats_ext, labels,
                                     "train")
        else:
            # Sub-region bu only counts for prob-weight models; plain bu
            # counts for every remaining model family.
            uses_bu = ((is_prob_weight
                        and models.has_sub_region_bu(caption_model))
                       or models.has_bu(caption_model))
            if uses_bu:
                bu_feats_ext = _maybe_expand(bu_expander, bu_feats)
                outputs = model(fc_feats_ext, att_feats_ext, bu_feats_ext,
                                labels)
            else:
                outputs = model(fc_feats_ext, att_feats_ext, labels)

            if returns_prob_w:
                batch_outputs, prob_w = outputs
            else:
                batch_outputs = outputs

    if returns_prob_w:
        loss = crit(batch_outputs, labels, masks, prob_w, tokens)
    else:
        loss = crit(batch_outputs, labels, masks)
    loss.backward()

    # PyTorch >= 0.4 produces 0-dim loss tensors for which ``loss.data[0]`` is
    # invalid; fall back to the legacy indexing only when ``item()`` is absent.
    return loss.item() if hasattr(loss, 'item') else loss.data[0]
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    """Caption every image served by ``loader``, reporting progress and ETA.

    Args:
        model_cnn: Callable image encoder. Returns fc features alone when
            ``models.is_only_fc_feat(caption_model)`` is true, otherwise an
            ``(fc_feats, att_feats)`` pair.
        model: Caption model whose ``sample(...)`` returns a ``(seq, _)`` pair.
        loader: Data source providing ``reset_iterator``, ``get_vocab`` and
            ``get_batch``. Each batch must carry ``'images'``, ``'infos'`` and
            ``'bounds'`` entries.
        eval_kwargs: Read-only options: ``verbose_eval`` (default True),
            ``beam_size`` (default 1), ``caption_model`` (default ``''``) and
            ``batch_size`` (default 1).

    Returns:
        list: One ``{'image_id': ..., 'caption': ...}`` dict per image, with
        the id taken unchanged from the batch info.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    split = ''
    loader.reset_iterator(split)
    seen = 0
    results = []
    vocab = loader.get_vocab()

    while True:
        batch_started = time.time()

        batch = loader.get_batch(split, batch_size)
        seen += batch_size

        pixels = torch.from_numpy(batch['images']).cuda()
        pixels = Variable(utils.prepro_norm(pixels, False),
                          requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(pixels)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(pixels)
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        captions = utils.decode_sequence_aic(vocab, seq)
        for idx, caption in enumerate(captions):
            record = {'image_id': batch['infos'][idx]['id'],
                      'caption': caption}
            results.append(record)
            if verbose_eval:
                print('image %s: %s' % (record['image_id'], record['caption']))

        bounds = batch['bounds']
        position = bounds['it_pos_now']
        total = bounds['it_max']

        # The final batch may run past the end of the split; discard the
        # surplus entries.
        for _ in range(seen - total):
            results.pop()

        if verbose_eval:
            elapsed = time.time() - batch_started
            # Remaining time extrapolated from this batch's per-image cost.
            remaining = (total - position) * elapsed / batch_size
            if remaining > 3600:
                hours = remaining // 3600
                minutes = (remaining - hours * 3600) // 60
                seconds = remaining - hours * 3600 - minutes * 60
                eta = '%dh:%dm:%.3fs' % (hours, minutes, seconds)
            elif remaining > 60:
                minutes = remaining // 60
                seconds = remaining - minutes * 60
                eta = '%dm:%.3fs' % (minutes, seconds)
            else:
                eta = '%.3fs' % (remaining)

            print('evaluating validation preformance... %d/%d %.3fs left:%s' % (position, total, elapsed, eta))

        if bounds['wrapped']:
            break

    return results
Exemple #13
0
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    """Caption every image served by ``loader`` and collect the predictions.

    Also creates ``<output_dir>/fc`` and ``<output_dir>/att``. Nothing is
    written into them by this function at present.

    Args:
        model_cnn: Callable image encoder. Returns fc features alone when
            ``models.is_only_fc_feat(caption_model)`` is true, otherwise an
            ``(fc_feats, att_feats)`` pair.
        model: Caption model whose ``sample(...)`` returns a ``(seq, _)`` pair.
        loader: Data source providing ``reset_iterator``, ``get_vocab`` and
            ``get_batch``. Each batch must carry ``'images'``, ``'infos'`` and
            ``'bounds'`` entries.
        eval_kwargs: Read-only options: ``verbose_eval`` (default True),
            ``beam_size`` (default 1), ``caption_model`` (default ``''``),
            ``batch_size`` (default 1) and ``output_dir`` (default ``''``).

    Returns:
        list: One ``{'image_id': ..., 'caption': ...}`` dict per image.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    output_dir = eval_kwargs.get('output_dir', '')

    split = ''
    loader.reset_iterator(split)
    n = 0
    predictions = []
    vocab = loader.get_vocab()

    dir_fc = os.path.join(output_dir, 'fc')
    dir_att = os.path.join(output_dir, 'att')

    print(dir_fc)
    print(dir_att)

    # makedirs (not mkdir) so that a not-yet-existing ``output_dir`` is
    # created as well instead of raising.
    for feature_dir in (dir_fc, dir_att):
        if not os.path.isdir(feature_dir):
            os.makedirs(feature_dir)

    while True:
        data = loader.get_batch(split, batch_size)
        n = n + batch_size

        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        # sents
        sents = utils.decode_sequence(vocab, seq)

        # NOTE: per-image dumping of fc/att features into dir_fc / dir_att
        # (np.savez_compressed keyed by the image id) is currently disabled.
        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']

        # The final batch may run past the end of the split; discard the
        # surplus entries.
        for _ in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d' %
                  (ix0 - 1, ix1))

        if data['bounds']['wrapped']:
            break

    return predictions