Code Example #1
File: main.py  Project: xshhhm/NeuralBabyTalk
def eval(opt):
    model.eval()
    #########################################################################################
    # eval begins here
    #########################################################################################
    data_iter_val = iter(dataloader_val)
    loss_temp = 0
    start = time.time()

    num_show = 0
    predictions = []
    count = 0
    for step in range(len(dataloader_val)):
        data = next(data_iter_val)  # iterator.next() is Python 2 only; use the built-in next()
        img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id = data

        proposals = proposals[:, :max(int(max(num[:, 1])), 1), :]

        input_imgs.data.resize_(img.size()).copy_(img)
        input_seqs.data.resize_(iseq.size()).copy_(iseq)
        gt_seqs.data.resize_(gts_seq.size()).copy_(gts_seq)
        input_num.data.resize_(num.size()).copy_(num)
        input_ppls.data.resize_(proposals.size()).copy_(proposals)
        gt_bboxs.data.resize_(bboxs.size()).copy_(bboxs)
        mask_bboxs.data.resize_(box_mask.size()).copy_(box_mask)

        eval_opt = {
            'sample_max': 1,
            'beam_size': opt.beam_size,
            'inference_mode': True,
            'tag_size': opt.cbs_tag_size
        }
        seq, bn_seq, fg_seq =  model(input_imgs, input_seqs, gt_seqs, \
                                input_num, input_ppls, gt_bboxs, mask_bboxs, 'sample', eval_opt)

        sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow, dataset.itoc, dataset.wtod, \
                                    seq.data, bn_seq.data, fg_seq.data, opt.vocab_size, opt)
        for k, sent in enumerate(sents):
            entry = {'image_id': img_id[k], 'caption': sent}
            predictions.append(entry)
            if num_show < 20:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
                num_show += 1

        if count % 100 == 0:
            print(count)
        count += 1

    print('Total image to be evaluated %d' % (len(predictions)))
    lang_stats = None
    if opt.language_eval == 1:
        if opt.decode_noc:
            lang_stats = utils.noc_eval(predictions, str(1), opt.val_split,
                                        opt)
        else:
            lang_stats = utils.language_eval(opt.dataset, predictions, str(1),
                                             opt.val_split, opt)

    print('Saving the predictions')
    if opt.inference_only:
        import json
        pdb.set_trace()  # drop into the debugger so predictions can be inspected or dumped manually (e.g. with json)

    # Write validation result into summary
    if tf is not None and lang_stats is not None:
        for k, v in lang_stats.items():
            add_summary_value(tf_summary_writer, k, v, iteration)
        tf_summary_writer.flush()
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
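
Example #1 depends on module-level state prepared in main.py before the function runs: the captioning model, the validation dataloader, and a set of preallocated input buffers that each iteration resizes and copies into. The sketch below is a minimal, assumed reconstruction of that setup, not project code; the buffer names match the example, while dataset_val, the _buf helper, and the model construction are hypothetical placeholders, mirroring the explicit setup shown in Code Example #3.

# Indicative setup sketch only -- dataset/model construction is simplified and assumed.
import torch
from torch.autograd import Variable

dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=opt.batch_size,
                                             shuffle=False, num_workers=opt.num_workers)

def _buf(t):
    # one-element buffer that eval() resizes and fills for every batch
    return Variable(t.cuda() if opt.cuda else t)

input_imgs = _buf(torch.FloatTensor(1))
input_seqs = _buf(torch.LongTensor(1))
gt_seqs    = _buf(torch.LongTensor(1))
input_num  = _buf(torch.LongTensor(1))
input_ppls = _buf(torch.FloatTensor(1))
gt_bboxs   = _buf(torch.FloatTensor(1))
mask_bboxs = _buf(torch.ByteTensor(1))

lang_stats = eval(opt)
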
Code Example #2
def eval(opt):
    model.eval()
    #########################################################################################
    # eval begins here
    #########################################################################################
    data_iter_val = iter(dataloader_val)
    #loss_temp = 0
    #start = time.time()

    num_show = 0
    predictions = []
    count = 0
    for step in range(len(dataloader_val)):
        data = next(data_iter_val)  # iterator.next() is Python 2 only; use the built-in next()
        img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id = data

        proposals = proposals[:,:max(int(max(num[:,1])),1),:]

        input_imgs.data.resize_(img.size()).copy_(img)
        input_seqs.data.resize_(iseq.size()).copy_(iseq)
        gt_seqs.data.resize_(gts_seq.size()).copy_(gts_seq)
        input_num.data.resize_(num.size()).copy_(num)
        input_ppls.data.resize_(proposals.size()).copy_(proposals)
        gt_bboxs.data.resize_(bboxs.size()).copy_(bboxs)
        mask_bboxs.data.resize_(box_mask.size()).copy_(box_mask)

        eval_opt = {'sample_max':1, 'beam_size': opt.beam_size, 'inference_mode' : True, 'tag_size' : opt.cbs_tag_size}
        
        seq, bn_seq, fg_seq =  model(input_imgs, input_seqs, gt_seqs, input_num, input_ppls, gt_bboxs, mask_bboxs, 'sample', eval_opt)

        sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow, dataset.itoc, dataset.wtod, seq.data, bn_seq.data, fg_seq.data, opt.vocab_size, opt)

        for k, sent in enumerate(sents):
            entry = {'image_id': img_id[k].item(), 'caption': sent}
            predictions.append(entry)
            
            if num_show < opt.batch_size:
                print('image %s: %s' % (entry['image_id'], entry['caption']) )
                num_show += 1

        if count % 100 == 0:
            print("Magi_ZZ_ML_Kernel:>> Evaluation function just ran for %d times..." % count)
        count += 1

    print('Magi_ZZ_ML_Kernel:>> Total images and captions to be evaluated is: %d' %(len(predictions)))
    lang_stats = None
    if opt.language_eval == 1:
        #if opt.decode_noc:
            #lang_stats = utils.noc_eval(predictions, str(1), opt.val_split, opt)
        #else:
        lang_stats = utils.language_eval(opt.dataset, predictions, str(1), opt.val_split, opt)


    print('Magi_ZZ_ML_Kernel:>> Saving the predictions...')
    if opt.inference_only:
        lang_stats = utils.language_eval(opt.dataset, predictions, str(1), opt.val_split, opt)
        print("Magi_ZZ_ML_Kernel:>> Welcome to inference mode, saving scores into {}".format(opt.checkpoint_path))
        with open(os.path.join(opt.checkpoint_path, 'lang_stats.json'), 'w') as f:
            json.dump(lang_stats, f)
        print("Magi_ZZ_ML_Kernel:>> Done!")
        print("Magi_ZZ_ML_Kernel:>> Now saving image ids and captions into {}".format(opt.checkpoint_path))
        with open(os.path.join(opt.checkpoint_path, 'preds.json'), 'w') as f:
            json.dump(predictions, f)
        print("Magi_ZZ_ML_Kernel:>> Done!")
        print("Magi_ZZ_ML_Kernel:>> Now saving the decoded sentences into {}".format(opt.checkpoint_path))
        # Note: `sents` only holds the sentences decoded from the last batch of the loop above.
        with open(os.path.join(opt.checkpoint_path, 'sents.json'), 'w') as f:
            json.dump(sents, f)
        print("Magi_ZZ_ML_Kernel:>> Done!")

    # Write validation result into summary
        #for k,v in lang_stats.items():
        #    add_summary_value(tf_summary_writer, k, v, iteration)
        #tf_summary_writer.flush()
    
    val_result_history[iteration] = {'lang_stats': lang_stats, 'predictions': predictions}

    return lang_stats
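
In this variant, the opt.inference_only branch writes lang_stats.json, preds.json, and sents.json into opt.checkpoint_path. Below is a small assumed snippet for reading the first two back; only the file names and entry keys come from the code above, everything else is illustrative.

# Assumed post-processing snippet; not part of the project code.
import json
import os

with open(os.path.join(opt.checkpoint_path, 'lang_stats.json')) as f:
    lang_stats = json.load(f)   # metric name -> score
with open(os.path.join(opt.checkpoint_path, 'preds.json')) as f:
    predictions = json.load(f)  # list of {'image_id': ..., 'caption': ...}

for entry in predictions[:5]:
    print('%s: %s' % (entry['image_id'], entry['caption']))
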
Code Example #3
def eval_fusion_models(opt,
                       dataset_val,
                       imp_pro,
                       spa_pro,
                       sem_pro,
                       imp_model=None,
                       spa_model=None,
                       sem_model=None):
    dataloader_val = torch.utils.data.DataLoader(dataset_val,
                                                 batch_size=opt.batch_size,
                                                 shuffle=False,
                                                 num_workers=opt.num_workers)
    input_imgs = torch.FloatTensor(1)
    input_seqs = torch.LongTensor(1)
    input_ppls = torch.FloatTensor(1)
    gt_bboxs = torch.FloatTensor(1)
    mask_bboxs = torch.ByteTensor(1)
    gt_seqs = torch.LongTensor(1)
    input_num = torch.LongTensor(1)

    if opt.cuda:
        input_imgs = input_imgs.cuda()
        input_seqs = input_seqs.cuda()
        gt_seqs = gt_seqs.cuda()
        input_num = input_num.cuda()
        input_ppls = input_ppls.cuda()
        gt_bboxs = gt_bboxs.cuda()
        mask_bboxs = mask_bboxs.cuda()

    input_imgs = Variable(input_imgs)
    input_seqs = Variable(input_seqs)
    gt_seqs = Variable(gt_seqs)
    input_num = Variable(input_num)
    input_ppls = Variable(input_ppls)
    gt_bboxs = Variable(gt_bboxs)
    mask_bboxs = Variable(mask_bboxs)

    data_iter_val = iter(dataloader_val)
    loss_temp = 0
    start = time.time()

    num_show = 0
    predictions = []
    progress_bar = tqdm(dataloader_val,
                        desc='|Validation process',
                        leave=False)
    # for step in range(len(dataloader_val)):
    for step, data in enumerate(progress_bar):
        # data = data_iter_val.next()
        img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id, spa_adj_matrix, sem_adj_matrix = data
        # print(img_id)
        proposals = proposals[:, :max(int(max(num[:, 1])), 1), :]
        # print(proposals)
        # FF: Fix the bug with .data not run in the Pytorch
        input_imgs.resize_(img.size()).copy_(img)
        input_seqs.resize_(iseq.size()).copy_(iseq)
        gt_seqs.resize_(gts_seq.size()).copy_(gts_seq)
        input_num.resize_(num.size()).copy_(num)
        input_ppls.resize_(proposals.size()).copy_(proposals)
        gt_bboxs.resize_(bboxs.size()).copy_(bboxs)
        # FF: modify 0/1 to true/false
        mask_bboxs.resize_(box_mask.size()).copy_(box_mask.bool())
        # mask_bboxs.data.resize_(box_mask.size()).copy_(box_mask)

        if len(spa_adj_matrix[0]) != 0:
            spa_adj_matrix = spa_adj_matrix[:, :max(int(max(num[:, 1])), 1),
                                            :max(int(max(num[:, 1])), 1)]
        if len(sem_adj_matrix[0]) != 0:
            sem_adj_matrix = sem_adj_matrix[:, :max(int(max(num[:, 1])), 1),
                                            :max(int(max(num[:, 1])), 1)]

        # relationship modify
        eval_opt_rel = {
            'imp_model': opt.imp_model,
            'spa_model': opt.spa_model,
            'sem_model': opt.sem_model,
            "graph_att": opt.graph_attention
        }
        pos_emb_var, spa_adj_matrix, sem_adj_matrix = prepare_graph_variables(
            opt.relation_type, proposals[:, :, :4], sem_adj_matrix,
            spa_adj_matrix, opt.nongt_dim, opt.imp_pos_emb_dim,
            opt.spa_label_num, opt.sem_label_num, eval_opt_rel)

        eval_opt = {
            'sample_max': 1,
            'beam_size': opt.beam_size,
            'inference_mode': True,
            'tag_size': opt.cbs_tag_size
        }
        seq, bn_seq, fg_seq, seqLogprobs, bnLogprobs, fgLogprobs, attention_weights = fusion_beam_sample(
            opt, imp_pro, spa_pro, sem_pro, input_ppls, input_imgs, input_num,
            pos_emb_var, spa_adj_matrix, sem_adj_matrix, eval_opt, imp_model,
            spa_model, sem_model)
        sents = utils.decode_sequence(dataset_val.itow, dataset_val.itod,
                                      dataset_val.ltow, dataset_val.itoc,
                                      dataset_val.wtod, seq.data, bn_seq.data,
                                      fg_seq.data, opt.vocab_size, opt)
        for k, sent in enumerate(sents):
            entry = {'image_id': img_id[k].item(), 'caption': sent}
            predictions.append(entry)
            if num_show < 20:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
                num_show += 1

        if opt.graph_attention:
            for k in range(len(img_id)):
                save_attention(img_id[k], attention_weights[k],
                               opt.att_weight_save)

    print('Total image to be evaluated %d' % (len(predictions)))
    lang_stats = None
    if opt.language_eval == 1:
        if opt.decode_noc:
            lang_stats = utils.noc_eval(predictions, str(1), opt.val_split,
                                        opt)
        else:
            lang_stats = utils.language_eval(opt.dataset, predictions, str(1),
                                             opt.val_split, opt)

    print('Saving the predictions')

    # Write validation result into summary
    # if tf is not None:
    #     for k, v in lang_stats.items():
    #         add_summary_value(tf_summary_writer, k, v, iteration)
    #     tf_summary_writer.flush()

    # TODO: change the train process
    # val_result_history[iteration] = {'lang_stats': lang_stats, 'predictions': predictions}
    # if wandb is not None:
    #     wandb.log({k: v for k, v in lang_stats.items()})
    return lang_stats, predictions
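
A hypothetical driver for eval_fusion_models follows: dataset_val and the three relation-specific models are assumed to be constructed elsewhere, and the imp_pro / spa_pro / sem_pro values are placeholder fusion weights whose exact meaning is defined by fusion_beam_sample (not shown here).

# Hypothetical invocation -- all concrete values below are placeholders.
imp_pro, spa_pro, sem_pro = 0.4, 0.3, 0.3   # assumed fusion weights for the three graph branches
lang_stats, predictions = eval_fusion_models(opt, dataset_val,
                                             imp_pro, spa_pro, sem_pro,
                                             imp_model=imp_model,
                                             spa_model=spa_model,
                                             sem_model=sem_model)
if lang_stats is not None:
    for metric, score in lang_stats.items():
        print('%s: %.3f' % (metric, score))
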
Code Example #4
def eval(epoch, opt, vis=None, vis_window=None):
    model.eval()

    data_iter_val = iter(dataloader_val)
    start = time.time()

    num_show = 0
    predictions = []
    count = 0

    if opt.eval_obj_grounding:
        grd_output = defaultdict(list)

        lemma_det_dict = {
            opt.wtol[key]: idx
            for key, idx in opt.wtod.items() if key in opt.wtol
        }

        print('{} classes have the associated lemma word!'.format(
            len(lemma_det_dict)))

    if opt.eval_obj_grounding or opt.language_eval:
        print('eval')
        for step in range(len(dataloader_val)):
            data = next(data_iter_val)  # iterator.next() is Python 2 only; use the built-in next()
            if opt.vis_attn:
                print('vis')
                img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id, img_show, region_feat = data
            else:
                img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id, region_feat = data

            proposals = proposals[:, :max(int(max(num[:, 1])), 1), :]
            region_feat = region_feat[:, :max(int(max(num[:, 1])), 1), :]

            input_imgs.resize_(img.size()).data.copy_(img)
            input_num.resize_(num.size()).data.copy_(num)
            input_ppls.resize_(proposals.size()).data.copy_(proposals)
            ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)

            eval_opt = {
                'sample_max': 1,
                'beam_size': opt.beam_size,
                'inference_mode': True,
                'tag_size': opt.cbs_tag_size
            }
            dummy = input_ppls.new(input_imgs.size(0)).fill_(0)
            seq, att2_weights, sim_mat = model(input_imgs, dummy, dummy, input_num, \
                                               input_ppls, dummy, dummy, ppls_feat, 'sample', eval_opt)

            att2_weights_clone = att2_weights.clone()

            # save localization results on generated sentences
            if opt.eval_obj_grounding:
                assert opt.beam_size == 1, 'only support beam_size is 1'

                att2_ind = torch.max(att2_weights, dim=2)[1]

                # resize proposals back
                input_ppls[:, :, torch.LongTensor([0, 2])] *= \
                    input_num[:, 3].float().view(-1, 1, 1) / opt.image_crop_size
                input_ppls[:, :, torch.LongTensor([1, 3])] *= \
                    input_num[:, 4].float().view(-1, 1, 1) / opt.image_crop_size

                for i in range(seq.size(0)):
                    tmp_result = {'clss': [], 'idx_in_sent': [], 'bbox': []}
                    num_sent = 0  # does not really matter which reference to use
                    for j in range(seq.size(1)):
                        if seq[i, j].item() != 0:
                            lemma = opt.wtol[opt.itow[str(seq[i, j].item())]]
                            if lemma in lemma_det_dict:
                                tmp_result['bbox'].append(
                                    input_ppls[i, att2_ind[i, j], :4].tolist())
                                tmp_result['clss'].append(
                                    opt.itod[lemma_det_dict[lemma]])
                                tmp_result['idx_in_sent'].append(
                                    j
                                )  # redundant, for the sake of output format
                        else:
                            break

                    grd_output[img_id[i].item()].append(tmp_result)

            sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow, dataset.itoc, \
                                          dataset.wtod, seq.data, opt.vocab_size, opt)

            for k, sent in enumerate(sents):
                entry = {'image_id': img_id[k].item(), 'caption': sent}
                predictions.append(entry)
                if num_show < 20:
                    print('image %s: %s' %
                          (entry['image_id'], entry['caption']))
                    num_show += 1

                # visualize the caption and region
                if opt.vis_attn:
                    if torch.sum(proposals[k]) != 0:
                        vis_infer(img_show[k], entry['image_id'],
                                  entry['caption'], att2_weights[k].cpu().data,
                                  proposals[k].data, sim_mat[k].cpu().data)
                        # print('GT sent: {} \nattn prec (obj): {:.3f} ({}), recall (obj): {:.3f} ({})' \
                        # .format('UNK', np.mean(ba_per_sent_prec[img_id[k].item()]), len(ba_per_sent_prec[img_id[k].item()]),
                        # np.mean(ba_per_sent_recall[img_id[k].item()]), len(ba_per_sent_recall[img_id[k].item()])))
                        print('*' * 80)

            if count % 2 == 0:
                print(count)
            count += 1

    lang_stats = None
    if opt.language_eval:
        print('Total image to be evaluated %d' % (len(predictions)))
        lang_stats = utils.language_eval(opt.dataset, predictions, opt.id,
                                         opt.val_split, opt)

        print('\nResults Summary (lang eval):')
        print('Printing language evaluation metrics...')
        for m, s in lang_stats.items():
            print('{}: {:.3f}'.format(m, s * 100))
        print('\n')

    if opt.eval_obj_grounding:
        # write attention results to file
        attn_file = 'results/attn-gen-sent-results-' + opt.val_split + '-' + opt.id + '.json'
        with open(attn_file, 'w') as f:
            json.dump(
                {
                    'results': grd_output,
                    'eval_mode': 'gen',
                    'external_data': {
                        'used':
                        True,
                        'details':
                        'Object detector pre-trained on Visual Genome on object detection task.'
                    }
                }, f)

        # offline eval
        evaluator = FlickrGrdEval(reference_file=opt.grd_reference,
                                  submission_file=attn_file,
                                  split_file=opt.split_file,
                                  val_split=[opt.val_split],
                                  iou_thresh=0.5)

        print('\nResults Summary (generated sent):')
        print('Printing attention accuracy on generated sentences...')
        prec_all, recall_all, f1_all = evaluator.grd_eval(mode='all')
        prec_loc, recall_loc, f1_loc = evaluator.grd_eval(mode='loc')
        print('\n')

    if opt.eval_obj_grounding_gt:
        box_accu_att, box_accu_grd, cls_accu = eval_grounding(opt)
        print('\nResults Summary (GT sent):')
        print(
            'The averaged attention / grounding box accuracy across all classes is: {:.4f} / {:.4f}'
            .format(box_accu_att, box_accu_grd))
        print(
            'The averaged classification accuracy across all classes is: {:.4f}\n'
            .format(cls_accu))
    else:
        box_accu_att, box_accu_grd, cls_accu = 0, 0, 0

    if opt.enable_visdom:
        assert (opt.language_eval)
        if vis_window['score'] is None:
            vis_window['score'] = vis.line(
                X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                Y=np.column_stack(
                    (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                     np.asarray(cls_accu), np.asarray(lang_stats['Bleu_4']),
                     np.asarray(lang_stats['METEOR']),
                     np.asarray(lang_stats['CIDEr']),
                     np.asarray(lang_stats['SPICE']))),
                opts=dict(title='Validation Score',
                          xlabel='Validation Epoch',
                          ylabel='Score',
                          legend=[
                              'BA (alpha)', 'BA (beta)', 'CLS Accu', 'Bleu_4',
                              'METEOR', 'CIDEr', 'SPICE'
                          ]))
        else:
            vis.line(X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                     Y=np.column_stack(
                         (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                          np.asarray(cls_accu),
                          np.asarray(lang_stats['Bleu_4']),
                          np.asarray(lang_stats['METEOR']),
                          np.asarray(lang_stats['CIDEr']),
                          np.asarray(lang_stats['SPICE']))),
                     opts=dict(title='Validation Score',
                               xlabel='Validation Epoch',
                               ylabel='Score',
                               legend=[
                                   'BA (alpha)', 'BA (beta)', 'CLS Accu',
                                   'Bleu_4', 'METEOR', 'CIDEr', 'SPICE'
                               ]),
                     win=vis_window['score'],
                     update='append')

    print('Saving the predictions')

    # Write validation result into summary
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
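
When opt.eval_obj_grounding is set, this example dumps its localization results to a JSON file under results/. The layout of that file is visible in the json.dump call above; the reader below is an assumed illustration, not project code.

# Assumed reader for the grounding results file written above.
import json

attn_file = 'results/attn-gen-sent-results-' + opt.val_split + '-' + opt.id + '.json'
with open(attn_file) as f:
    grd = json.load(f)

# grd['results'] maps an image id to a list of
# {'clss': [...], 'idx_in_sent': [...], 'bbox': [...]} entries, one per generated sentence.
for image_id, sent_results in list(grd['results'].items())[:3]:
    for r in sent_results:
        print(image_id, list(zip(r['clss'], r['bbox'])))
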