Example #1
def build_i2t(self, infos, loader):
    self.i2t_model = I2T_Model_init(self.opt, models.setup(self.opt))
    self.dp_i2t_model = torch.nn.DataParallel(
        self.i2t_model) if len(self.opt.gpus) > 1 else self.i2t_model
    self.dp_i2t_model.cuda()
    self.dp_i2t_model.training = bool(self.i2t_train_flag)
    self.i2t_crit = criterion.LanguageModelCriterion(self.opt)
    self.i2t_rl_crit = criterion.RewardCriterion()
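For reference, a minimal, self-contained sketch of the single-GPU vs. DataParallel dispatch used above. TinyCaptioner and wrap_model are hypothetical stand-ins for the model returned by models.setup(opt) and for the wrapping step; this is an illustration, not the project's API.

import torch
import torch.nn as nn

# Hypothetical stand-in for the captioning model built by models.setup(opt).
class TinyCaptioner(nn.Module):
    def __init__(self):
        super(TinyCaptioner, self).__init__()
        self.fc = nn.Linear(2048, 512)

    def forward(self, fc_feats):
        return self.fc(fc_feats)

def wrap_model(model, gpus):
    # Wrap in DataParallel only when more than one GPU is requested,
    # mirroring the len(self.opt.gpus) > 1 check above.
    model = nn.DataParallel(model) if len(gpus) > 1 else model
    if torch.cuda.is_available():
        model = model.cuda()
    return model

dp_model = wrap_model(TinyCaptioner(), gpus=[0])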
Example #2
def i2t_eval():
    # Setup the model
    i2t_model = models.setup(opt)
    i2t_model.load_state_dict(torch.load(opt.model))
    i2t_model.cuda()
    i2t_model.eval()
    crit = criterion.LanguageModelCriterion(infos['opt'])

    # Create the Data Loader instance
    if len(opt.image_folder) == 0:
        loader = DataLoader(opt)
    else:
        loader = DataLoaderRaw({
            'folder_path': opt.image_folder,
            'coco_json': opt.coco_json,
            'batch_size': opt.batch_size,
            'cnn_model': opt.cnn_model
        })
    # When evaluating with a provided pretrained model, the vocab may differ
    # from the one in your cocotalk.json, so make sure to use the vocab from
    # the infos file.
    loader.ix_to_word = infos['vocab']

    # Set sample options
    loss, split_predictions, lang_stats, nmt_valid_ppl, nmt_valid_acc = eval_utils.eval_split(
        opt, loader, i2t_model, None, vars(opt))

    print('loss: ', loss)
    if lang_stats:
        print(lang_stats)

    dump_json_path = None
    if opt.dump_json == 1:
        # dump the predictions as json, reusing the same path that is returned
        dump_json_path = 'tmp/' + opt.model.split('/')[1] + '_zh_' + opt.dataset + '.json'
        with open(dump_json_path, 'w') as f:
            json.dump(split_predictions, f)

    return dump_json_path
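A minimal sketch of how the path returned by i2t_eval might be consumed, assuming opt and infos are already set up as above and opt.dump_json == 1; the five-line preview loop is illustrative only.

import json

# Hypothetical caller: i2t_eval() returns the path of the dumped predictions,
# a list of {'image_id': ..., 'caption': ...} entries.
pred_path = i2t_eval()
with open(pred_path) as f:
    preds = json.load(f)
for p in preds[:5]:
    print(p['image_id'], p['caption'])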
def eval_split_coco_unpaired(opt,
                             loader,
                             coco_loader,
                             i2t_model,
                             nmt_model,
                             eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    verbose_beam = eval_kwargs.get('verbose_beam', 1)
    verbose_loss = eval_kwargs.get('verbose_loss', 1)
    num_images = eval_kwargs.get('num_images',
                                 eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)

    # Make sure in the evaluation mode
    print('Start evaluating the model ...')
    if opt.i2t_eval_flag:
        i2t_crit = criterion.LanguageModelCriterion(opt)
        i2t_model.eval()

    if opt.nmt_eval_flag:
        stats = onmt.Loss.Statistics()
        nmt_crit = criterion.NMT_loss(opt,
                                      nmt_model.generator,
                                      criterion.NMTCriterion(
                                          loader.nmt_dicts['tgt'].size(), opt),
                                      eval=True)
        nmt_model.eval()

    loader.reset_iterator(split)

    im_idx = 0
    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    coco_predictions = []

    dump_text = False
    if dump_text: prediction_txt = open('tmp/coco_test_5k_image_path.txt', 'w')

    coco_loader.reset_iterator(split)
    if opt.i2t_eval_flag:
        while True:
            data = loader.get_batch(split)
            coco_data = coco_loader.get_batch(split)
            n = n + coco_loader.batch_size

            if data.get('labels', None) is not None and verbose_loss:
                # forward the model to get loss
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['att_masks']
                ]
                tmp = [
                    Variable(torch.from_numpy(_), volatile=True).cuda()
                    for _ in tmp
                ]
                fc_feats, att_feats, labels, masks, att_masks = tmp
                outputs = i2t_model(fc_feats, att_feats, labels, att_masks)
                loss = i2t_crit(outputs, labels[:, 1:], masks[:, 1:]).data[0]
                loss_sum = loss_sum + loss
                loss_evals = loss_evals + 1

            # Keep only one feature per image, in case of duplicated samples
            coco_fc_feats = Variable(torch.from_numpy(
                coco_data['fc_feats'][np.arange(coco_loader.batch_size) *
                                      coco_loader.seq_per_img]),
                                     volatile=True).cuda()
            coco_att_feats = None
            coco_att_mask = None

            tmp = [
                data['fc_feats'][np.arange(loader.batch_size) *
                                 loader.seq_per_img],
                data['att_feats'][np.arange(loader.batch_size) *
                                  loader.seq_per_img],
                data['att_masks'][np.arange(loader.batch_size) *
                                  loader.seq_per_img]
            ]
            tmp = [
                Variable(torch.from_numpy(_), volatile=True).cuda()
                for _ in tmp
            ]
            fc_feats, att_feats, att_masks = tmp

            # forward the model to also get generated samples for each image
            coco_seq = i2t_model(coco_fc_feats,
                                 coco_att_feats,
                                 coco_att_mask,
                                 opt=eval_kwargs,
                                 mode='sample')[0]
            seq = i2t_model(fc_feats,
                            att_feats,
                            att_masks,
                            opt=eval_kwargs,
                            mode='sample')[0]

            # Print beam search
            if beam_size > 1 and verbose_beam:
                for i in range(coco_loader.batch_size):
                    print('\n'.join([
                        utils.decode_sequence(loader.get_vocab(),
                                              _['seq'].unsqueeze(0))[0]
                        for _ in i2t_model.done_beams[i]
                    ]))
                    print('--' * 10)
            coco_sents = utils.decode_sequence(coco_loader.get_vocab(),
                                               coco_seq)
            sents = utils.decode_sequence(loader.get_vocab(), seq)

            srcBatch, tgtBatch = [], []
            for coco_sent in coco_sents:
                srcTokens = coco_sent.split()
                srcBatch += [srcTokens]

            # Translate zh-caption (coco) to en
            predBatch = nmt_model.translate(srcBatch)
            # process
            for b in range(len(predBatch)):
                srcSent = ' '.join(srcBatch[b])
                if nmt_model.tgt_dict.lower:
                    srcSent = srcSent.lower()
                #print('%s; PRED: %s' % (srcSent, " ".join(predBatch[b][0])))
                pred_sent = " ".join(predBatch[b][0])
                pred_sent = pred_sent.replace("'s", "is")
                pred_sent = pred_sent.replace("there is", "")
                pred_sent = pred_sent.replace("there 's", "")
                # keep the cleaned-up tokens so the later joins use them
                predBatch[b][0] = pred_sent.split()

            for k, coco_sent in enumerate(coco_sents):
                if verbose:
                    im_idx = im_idx + 1
                    print('{}/image: {} | ZH: {} | EN: {}'.format(
                        im_idx, coco_data['infos'][k]['id'],
                        coco_sent.encode('utf8', 'replace').replace(" ", ""),
                        " ".join(predBatch[k][0])))
                    if dump_text:
                        prediction_txt.write(
                            '%s\n' % (coco_data['infos'][k]['file_path']))
                entry = {
                    'image_id': data['infos'][k]['id'],
                    'caption': sents[k]
                }
                coco_entry = {
                    'image_id': coco_data['infos'][k]['id'],
                    'caption': " ".join(predBatch[k][0])
                }
                predictions.append(entry)
                coco_predictions.append(coco_entry)
            # if we wrapped around the split or used up val imgs budget then bail
            ix0 = data['bounds']['it_pos_now']
            ix1 = data['bounds']['it_max']
            if num_images != -1:
                ix1 = min(ix1, num_images)
            for i in range(n - ix1):
                predictions.pop()

            coco_ix0 = coco_data['bounds']['it_pos_now']
            coco_ix1 = coco_data['bounds']['it_max']
            if num_images != -1:
                coco_ix1 = min(coco_ix1, num_images)
            for i in range(n - coco_ix1):
                coco_predictions.pop()

            if coco_data['bounds']['wrapped']:
                break
            if num_images >= 0 and n >= num_images:
                break

        if dump_text: prediction_txt.close()
        tag = 'captions_image_info_karpathy_5k_test_results'
        lang_stats = language_eval('zh', predictions, tag, 'val')
        coco_lang_stats = language_eval('en', coco_predictions, tag, 'val')

    # Evaluate the NMT model on the validation split
    if opt.nmt_eval_flag:
        loader.reset_iterator(split)
        for i in tqdm(range(int(loader.nmt_validData.numBatches))):
            batch = loader.get_batch(split)
            outputs, attn, dec_hidden, _ = nmt_model(batch['nmt'].src,
                                                     batch['nmt'].tgt,
                                                     batch['nmt'].lengths)
            batch_loss, batch_stats = nmt_crit(loader, batch['nmt'], outputs,
                                               attn)
            #stats.update(batch_stats)

    # Switch back to training mode
    if opt.i2t_train_flag: i2t_model.train()
    if opt.nmt_train_flag: nmt_model.train()

    if opt.i2t_eval_flag and opt.nmt_eval_flag:
        return loss_sum / loss_evals, predictions, coco_predictions, lang_stats, coco_lang_stats, nmt_crit.total_stats.ppl(
        ), nmt_crit.total_stats.accuracy()
    elif opt.i2t_eval_flag:
        return loss_sum / loss_evals, predictions, coco_predictions, lang_stats, coco_lang_stats, 0.0, 0.0
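A hypothetical caller of eval_split_coco_unpaired, assuming both opt.i2t_eval_flag and opt.nmt_eval_flag are set so that all seven values are returned (see the final branches above); the loaders and models are assumed to exist as in the surrounding code.

# Hypothetical caller of the function above.
(val_loss, predictions, coco_predictions, lang_stats, coco_lang_stats,
 nmt_ppl, nmt_acc) = eval_split_coco_unpaired(opt, loader, coco_loader,
                                              i2t_model, nmt_model, vars(opt))
print('val loss: %.3f | NMT ppl: %.2f | NMT acc: %.2f' % (val_loss, nmt_ppl, nmt_acc))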
Example #4
    if k not in ignore:
        if k in vars(opt):
            assert vars(opt)[k] == vars(
                infos['opt'])[k], k + ' option not consistent'
        else:
            vars(opt).update({k: vars(infos['opt'])[k]
                              })  # copy over options from model

vocab = infos['vocab']  # ix -> word mapping

# Setup the model
i2t_model = models.setup(opt)
i2t_model.load_state_dict(torch.load(opt.model))
i2t_model.cuda()
i2t_model.eval()
crit = criterion.LanguageModelCriterion(infos['opt'])

# Create the Data Loader instance
if len(opt.image_folder) == 0:
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'coco_json': opt.coco_json,
        'batch_size': opt.batch_size,
        'cnn_model': opt.cnn_model
    })
# When evaluating with a provided pretrained model, the vocab may differ
# from the one in your cocotalk.json, so make sure to use the vocab from
# the infos file.
loader.ix_to_word = infos['vocab']
def eval_split(opt, loader, i2t_model, nmt_model, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    verbose_beam = eval_kwargs.get('verbose_beam', 1)
    verbose_loss = eval_kwargs.get('verbose_loss', 1)
    num_images = eval_kwargs.get('num_images',
                                 eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)

    if opt.coco_eval_flag:
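        # NOTE: eval_split_coco_unpaired (defined above) also expects a
        # coco_loader argument between loader and i2t_model.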
        return eval_split_coco_unpaired(opt, loader, i2t_model, nmt_model,
                                        eval_kwargs)

    # Make sure in the evaluation mode
    print('Start evaluating the model ...')
    if opt.i2t_eval_flag:
        i2t_crit = criterion.LanguageModelCriterion(opt)
        i2t_model.eval()

    if opt.nmt_eval_flag:
        nmt_crit = criterion.NMT_loss(opt,
                                      nmt_model.generator,
                                      criterion.NMTCriterion(
                                          loader.nmt_dicts['tgt'].size(), opt),
                                      eval=True)
        nmt_model.eval()

    loader.reset_iterator(split)
    beam_accum = {
        "predicted_ids": [],
        "beam_parent_ids": [],
        "scores": [],
        "log_probs": []
    }

    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    if opt.i2t_eval_flag:
        while True:
            data = loader.get_batch(split)
            n = n + loader.batch_size

            if data.get('labels', None) is not None and verbose_loss:
                # forward the model to get loss
                tmp = [
                    data['fc_feats'], data['attri_feats'], data['att_feats'],
                    data['labels'], data['masks'], data['att_masks']
                ]
                tmp = [
                    _ if _ is None else
                    (Variable(torch.from_numpy(_), volatile=True).cuda()
                     if utils.under_0_4() else torch.from_numpy(_).cuda())
                    for _ in tmp
                ]
                fc_feats, attri_feats, att_feats, labels, masks, att_masks = tmp
                outputs = i2t_model(fc_feats, attri_feats, att_feats, labels,
                                    att_masks)
                loss = i2t_crit(outputs, labels[:, 1:], masks[:, 1:]).data[0]
                loss_sum = loss_sum + loss
                loss_evals = loss_evals + 1

            # Keep only one feature per image, in case of duplicated samples
            tmp = [
                data['fc_feats'][np.arange(loader.batch_size) *
                                 loader.seq_per_img],
                data['attri_feats'][np.arange(loader.batch_size) *
                                    loader.seq_per_img],
                data['att_feats'][np.arange(loader.batch_size) *
                                  loader.seq_per_img],
                data['att_masks'][np.arange(loader.batch_size) *
                                  loader.seq_per_img]
                if data['att_masks'] is not None else None
            ]
            tmp = [
                _ if _ is None else
                (Variable(torch.from_numpy(_), volatile=True).cuda()
                 if utils.under_0_4() else torch.from_numpy(_).cuda())
                for _ in tmp
            ]
            fc_feats, attri_feats, att_feats, att_masks = tmp
            # forward the model to also get generated samples for each image
            seq = i2t_model(fc_feats,
                            attri_feats,
                            att_feats,
                            att_masks,
                            opt=eval_kwargs,
                            mode='sample')[0].data
            #print(seq)
            # Print beam search
            if beam_size > 1 and verbose_beam:
                for i in range(loader.batch_size):
                    print('\n'.join([
                        utils.decode_sequence(loader.get_vocab(),
                                              _['seq'].unsqueeze(0))[0]
                        for _ in i2t_model.done_beams[i]
                    ]))
                    print('--' * 10)
            sents = utils.decode_sequence(loader.get_vocab(), seq)
            tgtBatch = []

            for k, sent in enumerate(sents):
                if verbose:
                    print('image %s: ' % (data['infos'][k]['id']),
                          sent.encode('utf8', 'replace'))
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                if eval_kwargs.get('dump_path', 0) == 1:
                    entry['file_name'] = data['infos'][k]['file_path']
                predictions.append(entry)
                if eval_kwargs.get('dump_images', 0) == 1:
                    # dump the raw image to vis/ folder
                    cmd = 'cp "' + os.path.join(
                        eval_kwargs['image_root'], data['infos'][k]
                        ['file_path']) + '" vis/imgs/img' + str(
                            len(predictions)) + '.jpg'  # bit gross
                    print(cmd)
                    os.system(cmd)

            # if we wrapped around the split or used up val imgs budget then bail
            ix0 = data['bounds']['it_pos_now']
            ix1 = data['bounds']['it_max']
            if num_images != -1:
                ix1 = min(ix1, num_images)
            for i in range(n - ix1):
                predictions.pop()

            if verbose:
                print('evaluating validation performance... %d/%d (%f)' %
                      (ix0 - 1, ix1, loss))

            if data['bounds']['wrapped']:
                break
            if num_images >= 0 and n >= num_images:
                break

        lang_stats = None
        if lang_eval == 1:
            if 'coco' in opt.input_json:
                lang_stats = language_eval('coco', predictions, opt.id, split)
            elif 'chinese' in opt.input_json:
                lang_stats = language_eval('zh', predictions, opt.id, split)
            elif '30k' in opt.input_json:
                lang_stats = language_eval('30k', predictions, opt.id, split)
            else:
                raise Exception('Current eval type is not recognizable.')
    # Evaluate the NMT model on the validation split
    if opt.nmt_eval_flag:
        for i in tqdm(range(int(loader.nmt_validData.numBatches))):
            batch = loader.get_batch('val')
            outputs, attn, dec_hidden, _ = nmt_model(batch['nmt'].src,
                                                     batch['nmt'].tgt,
                                                     batch['nmt'].lengths)
            batch_loss = nmt_crit(loader, batch['nmt'], outputs, attn)

    # Switch back to training mode
    if opt.nmt_train_flag: nmt_model.train()
    if opt.i2t_train_flag: i2t_model.train()

    if opt.i2t_eval_flag and opt.nmt_eval_flag:
        return loss_sum / loss_evals, predictions, lang_stats, nmt_crit.total_stats.ppl(
        ), nmt_crit.total_stats.accuracy()
    elif opt.nmt_eval_flag:
        return 0.0, None, None, nmt_crit.total_stats.ppl(
        ), nmt_crit.total_stats.accuracy()
    elif opt.i2t_eval_flag:
        return loss_sum / loss_evals, predictions, lang_stats, 0.0, 0.0
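Finally, a hypothetical validation step using eval_split, assuming opt.i2t_eval_flag is set and that language_eval returns a dict of metric scores (the 'CIDEr' key is an assumption, not confirmed by the snippet above).

# Hypothetical validation call during training.
eval_kwargs = vars(opt).copy()
eval_kwargs.update({'split': 'val', 'language_eval': 1})
val_loss, predictions, lang_stats, nmt_ppl, nmt_acc = eval_split(
    opt, loader, i2t_model, nmt_model, eval_kwargs)
print('val loss: %.3f' % val_loss)
if lang_stats is not None:
    print('CIDEr: %.3f' % lang_stats.get('CIDEr', 0.0))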