Example no. 1
def eval(epoch, opt, vis=None, vis_window=None):
    model.eval()

    data_iter_val = iter(dataloader_val)
    start = time.time()

    num_show = 0
    predictions = defaultdict(list)
    count = 0
    timestamp_file = json.load(open(opt.grd_reference))
    min_value = -1e8

    if opt.eval_obj_grounding:
        grd_output = defaultdict(dict)

        lemma_det_dict = {
            opt.wtol[key]: idx
            for key, idx in opt.wtod.items() if key in opt.wtol
        }
        print('{} classes have an associated lemma word!'.format(
            len(lemma_det_dict)))

    if opt.eval_obj_grounding or opt.language_eval:
        for step in range(len(dataloader_val)):
            data = next(data_iter_val)
            if opt.vis_attn:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, seg_show, seg_dim_info, region_feat, frm_mask, sample_idx, ppl_mask = data
            else:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, region_feat, frm_mask, sample_idx, ppl_mask = data

            proposals = proposals[:, :max(int(max(num[:, 1])), 1), :]
            ppl_mask = ppl_mask[:, :max(int(max(num[:, 1])), 1)]
            region_feat = region_feat[:, :max(int(max(num[:, 1])), 1), :]

            segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
            input_num.resize_(num.size()).data.copy_(num)
            input_ppls.resize_(proposals.size()).data.copy_(proposals)
            mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
            pnt_mask = torch.cat(
                (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls),
                dim=1)  # pad one extra column (kept for legacy reasons)
            ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
            sample_idx = Variable(sample_idx.type(input_num.type()))

            eval_opt = {
                'sample_max': 1,
                'beam_size': opt.beam_size,
                'inference_mode': True
            }
            dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)

            batch_size = input_ppls.size(0)

            seq, att2_weights, sim_mat = model(
                segs_feat, dummy, dummy, input_num, input_ppls, dummy, dummy,
                ppls_feat, dummy, sample_idx, pnt_mask, 'sample', eval_opt)

            # save localization results on generated sentences
            if opt.eval_obj_grounding:
                assert opt.beam_size == 1, 'only beam_size = 1 is supported'

                att2_ind = torch.max(
                    att2_weights.view(batch_size, att2_weights.size(1),
                                      opt.num_sampled_frm, opt.num_prop_per_frm),
                    dim=-1)[1]
                obj_bbox_att2 = torch.gather(
                    input_ppls.view(-1, opt.num_sampled_frm, opt.num_prop_per_frm, 7)
                    .permute(0, 2, 1, 3).contiguous(), 1,
                    att2_ind.unsqueeze(-1).expand(
                        (batch_size, att2_ind.size(1), opt.num_sampled_frm,
                         input_ppls.size(-1))))  # B x 20 x 10 x 7

                for i in range(seq.size(0)):
                    vid_id, seg_idx = seg_id[i].split('_segment_')
                    seg_idx = str(int(seg_idx))
                    tmp_result = {
                        'clss': [],
                        'idx_in_sent': [],
                        'bbox_for_all_frames': []
                    }

                    for j in range(seq.size(1)):
                        if seq[i, j].item() != 0:
                            lemma = opt.wtol[opt.itow[str(seq[i, j].item())]]
                            if lemma in lemma_det_dict:
                                tmp_result['bbox_for_all_frames'].append(
                                    obj_bbox_att2[i, j, :, :4].tolist())
                                tmp_result['clss'].append(
                                    opt.itod[lemma_det_dict[lemma]])
                                tmp_result['idx_in_sent'].append(
                                    j)  # redundant, but kept to match the output format
                        else:
                            break
                    grd_output[vid_id][seg_idx] = tmp_result

            sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow,
                                          dataset.itoc, dataset.wtod, seq.data,
                                          opt.vocab_size, opt)

            for k, sent in enumerate(sents):
                vid_idx, seg_idx = seg_id[k].split('_segment_')
                seg_idx = str(int(seg_idx))

                predictions[vid_idx].append({
                    'sentence': sent,
                    'timestamp': [
                        round(timestamp, 2) for timestamp in
                        timestamp_file['annotations'][vid_idx]['segments'][seg_idx]['timestamps']
                    ]
                })

                if num_show < 20:
                    print('segment %s: %s' % (seg_id[k], sent))
                    num_show += 1

                # visualization
                if opt.vis_attn:
                    assert opt.beam_size == 1  # only beam_size = 1 is supported
                    att2_weights = F.softmax(att2_weights, dim=2)
                    # visualize some selected examples
                    if torch.sum(proposals[k]) != 0:
                        vis_infer(seg_show[k], seg_id[k], sent,
                                  att2_weights[k].cpu().data, proposals[k],
                                  num[k].long(), bboxs[k],
                                  sim_mat[k].cpu().data, seg_dim_info[k])

            if count % 2 == 0:
                print(count)
            count += 1

    lang_stats = defaultdict(float)
    if opt.language_eval:
        print('Total videos to be evaluated %d' % (len(predictions)))

        submission = './experiments/results/' + 'densecap-' + opt.val_split + '-' + opt.id + '.json'
        dense_cap_all = {
            'version': 'VERSION 1.0',
            'results': predictions,
            'external_data': {
                'used': 'true',
                'details': 'Visual Genome for Faster R-CNN pre-training'
            }
        }
        with open(submission, 'w') as f:
            json.dump(dense_cap_all, f)

        references = opt.densecap_references
        verbose = opt.densecap_verbose
        tious_lst = [0.3, 0.5, 0.7, 0.9]
        evaluator = ANETcaptions(ground_truth_filenames=references,
                                 prediction_filename=submission,
                                 tious=tious_lst,
                                 max_proposals=1000,
                                 verbose=verbose)
        evaluator.evaluate()

        for m, v in evaluator.scores.items():
            lang_stats[m] = np.mean(v)

        print('\nResults Summary (lang eval):')
        print('Printing language evaluation metrics...')
        for m, s in lang_stats.items():
            print('{}: {:.3f}'.format(m, s * 100))
        print('\n')

    if opt.eval_obj_grounding:
        # write attention results to file
        attn_file = './experiments/results/attn-gen-sent-results-' + opt.val_split + '-' + opt.id + '.json'
        with open(attn_file, 'w') as f:
            json.dump(
                {
                    'results': grd_output,
                    'eval_mode': 'gen',
                    'external_data': {
                        'used':
                        True,
                        'details':
                        'Object detector pre-trained on Visual Genome on object detection task.'
                    }
                }, f)

        if not opt.test_mode:
            # offline eval
            evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                                    submission_file=attn_file,
                                    split_file=opt.split_file,
                                    val_split=[opt.val_split],
                                    iou_thresh=0.5)

            print('\nResults Summary (generated sent):')
            print(
                'Printing attention accuracy on generated sentences, per class and per sentence, respectively...'
            )
            prec_all, recall_all, f1_all, prec_all_per_sent, rec_all_per_sent, f1_all_per_sent = evaluator.grd_eval(
                mode='all')
            prec_loc, recall_loc, f1_loc, prec_loc_per_sent, rec_loc_per_sent, f1_loc_per_sent = evaluator.grd_eval(
                mode='loc')
        else:
            print('*' * 62)
            print('*  [WARNING] Grounding eval unavailable for the test set!    *\n'
                  '*            Please submit your result files under the       *\n'
                  '*            results/ directory to the eval server!          *')
            print('*' * 62)

    if opt.att_model == 'topdown' and opt.eval_obj_grounding_gt:
        with torch.no_grad():
            box_accu_att, box_accu_grd, cls_accu = eval_grounding(
                opt)  # eval grounding
    else:
        box_accu_att, box_accu_grd, cls_accu = 0, 0, 0

    if opt.enable_visdom:
        assert (opt.language_eval)
        if vis_window['score'] is None:
            vis_window['score'] = vis.line(
                X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                Y=np.column_stack(
                    (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                     np.asarray(cls_accu), np.asarray(lang_stats['Bleu_4']),
                     np.asarray(lang_stats['METEOR']),
                     np.asarray(lang_stats['CIDEr']),
                     np.asarray(lang_stats['SPICE']))),
                opts=dict(title='Validation Score',
                          xlabel='Validation Epoch',
                          ylabel='Score',
                          legend=[
                              'BA (alpha)', 'BA (beta)', 'CLS Accu', 'Bleu_4',
                              'METEOR', 'CIDEr', 'SPICE'
                          ]))
        else:
            vis.line(X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                     Y=np.column_stack(
                         (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                          np.asarray(cls_accu),
                          np.asarray(lang_stats['Bleu_4']),
                          np.asarray(lang_stats['METEOR']),
                          np.asarray(lang_stats['CIDEr']),
                          np.asarray(lang_stats['SPICE']))),
                     opts=dict(title='Validation Score',
                               xlabel='Validation Epoch',
                               ylabel='Score',
                               legend=[
                                   'BA (alpha)', 'BA (beta)', 'CLS Accu',
                                   'Bleu_4', 'METEOR', 'CIDEr', 'SPICE'
                               ]),
                     win=vis_window['score'],
                     update='append')

    print('Saving the predictions')

    # Write validation result into summary
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
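
For reference, the grounding file written by eval() above (attn-gen-sent-results-<split>-<id>.json) ends up with the nested structure sketched below. This is only an illustration reconstructed from the code above; the video id, segment index, class, and box coordinates are placeholders.

# Sketch of the grounding output written by eval() (all ids and values are placeholders).
example_grd_output = {
    'results': {
        'v_example_video': {                 # hypothetical video id
            '0': {                           # segment index within the video
                'clss': ['person'],          # object class for each grounded word
                'idx_in_sent': [3],          # position of that word in the generated sentence
                'bbox_for_all_frames': [     # per grounded word: one [x1, y1, x2, y2] per sampled frame
                    [[10.0, 20.0, 100.0, 200.0],
                     [12.0, 21.0, 101.0, 199.0]]
                ],
            }
        }
    },
    'eval_mode': 'gen',
    'external_data': {
        'used': True,
        'details': 'Object detector pre-trained on Visual Genome on object detection task.'
    },
}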
Example no. 2
name_id = 'small_sample.json'
evaluation_type = 'fast'

# Generate scores for the captioning
if evaluation_type == 'slow':
    evaluator = old_ANETcaptions(ground_truth_filenames=['./data/val_1.json', './data/val_2.json'],
                                 prediction_filename='./' + name_id,
                                 tious=[0.3, 0.5, 0.7, 0.9],
                                 max_proposals=1000,
                                 verbose=True)
elif evaluation_type == 'fast':
    evaluator = ANETcaptions(ground_truth_filenames=['./data/val_1.json', './data/val_2.json'],
                             prediction_filename='./' + name_id,
                             tious=[0.3, 0.5, 0.7, 0.9],
                             max_proposals=1000,
                             verbose=True)
# verbose=args.verbose was replaced with verbose=True
evaluator.evaluate()


# Output the results
for i, tiou in enumerate([0.3, 0.5, 0.7, 0.9]):
    print('-' * 80)
    print('tIoU:', tiou)
    print('-' * 80)
    for metric in evaluator.scores:
        score = evaluator.scores[metric][i]
        print('| %s: %2.4f' % (metric, 100 * score))
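
The loop above reports one score per tIoU threshold. If a single number per metric is preferred, the per-threshold scores can be averaged, as the later examples do; a minimal sketch, assuming evaluator.scores maps each metric name to a list of scores aligned with the tIoU list:

# Average each metric over the tIoU thresholds (0.3, 0.5, 0.7, 0.9).
avg_scores = {}
for metric, per_tiou in evaluator.scores.items():
    avg_scores[metric] = 100 * sum(per_tiou) / float(len(per_tiou))
    print('| %s (avg over tIoUs): %2.4f' % (metric, avg_scores[metric]))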
Example no. 3
def evaluate_gt_proposal(val_loader,
                         model_att,
                         model_tep,
                         model_sg,
                         idx_to_word,
                         epoch=0):
    # Evaluate mode
    model_att.eval()
    model_tep.eval()
    model_sg.eval()

    out = {}
    out['version'] = 'VERSION 1.0'
    out['results'] = {}
    out['external_data'] = {}
    out['external_data']['used'] = 'false'
    out['external_data']['details'] = 'for evaluation'
    end = time.time()

    with torch.no_grad():
        for batch_idx, (data, boxes, duration, v_name,
                        timestamp) in enumerate(val_loader):
            if data.shape[2] < 5:
                print("Pass this data (idx:%d) because very short video." %
                      batch_idx)
                continue
            if args.use_gpu:
                data = Variable(data.cuda())
                boxes = Variable(boxes.cuda())
            else:
                data = Variable(data)
                boxes = Variable(boxes)

            # Predict proposals
            proposals = model_tep(data)

            # Obtain proposal features with ground-truth proposal
            # using weighted attention(descriptiveness) score
            pos_feats = get_gt_proposal(data,
                                        proposals[1],
                                        proposals[3],
                                        boxes,
                                        scale_ratios=args.scale_ratios,
                                        use_gpu=args.use_gpu)

            if args.use_gpu:
                pos_feats = Variable(pos_feats.cuda())
            else:
                pos_feats = Variable(pos_feats)
            att = model_att(pos_feats)
            if args.use_gpu:
                att = Variable(att.cuda())
            else:
                att = Variable(att)

            # Generate sentences
            gen_result, _ = model_sg.sample(pos_feats, att, greedy=True)
            gen_sents = idx_to_sent(gen_result, idx_to_word)

            start_times = timestamp[0, :, 0].data.cpu().numpy()
            end_times = timestamp[0, :, 1].data.cpu().numpy()

            out['results'][v_name[0]] = []
            for i in range(len(gen_sents)):
                temp = {}
                temp['sentence'] = gen_sents[i][0]
                temp['timestamp'] = [
                    float(start_times[i]),
                    float(end_times[i])
                ]
                out['results'][v_name[0]].append(temp)

            # Print
            if (batch_idx + 1) % args.print_freq == 0:
                print("\tValidation: [{}/{}]\t"
                      "Time: {:.3f}".format((batch_idx + 1), len(val_loader),
                                            time.time() - end))

    # Write to JSON
    if not os.path.isdir('./output'):
        os.makedirs('./output')
    json_name = 'output/result_{}_{}.json'.format(args.file_name, str(epoch))
    json.dump(out, open(json_name, 'w'))

    # Evaluate scores
    scores = {}
    evaluator = ANETcaptions(ground_truth_filenames=args.references,
                             prediction_filename=json_name,
                             tious=args.tious,
                             max_proposals=args.max_proposals_per_video,
                             verbose=True)
    evaluator.evaluate()
    print("Validation Scores")
    for metric in evaluator.scores:
        score = evaluator.scores[metric]
        scores[metric] = 100 * sum(score) / float(len(score))
        print('| %s: %2.4f' % (metric, scores[metric]))

    return scores
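
The out dictionary written above follows the dense-captioning submission layout that ANETcaptions reads: a 'results' map from video name to a list of sentence/timestamp entries. A minimal sketch with placeholder values:

# Sketch of the prediction JSON produced by evaluate_gt_proposal (placeholder values).
example_submission = {
    'version': 'VERSION 1.0',
    'results': {
        'v_example_video': [                           # hypothetical video name
            {'sentence': 'a man is playing a guitar',  # generated caption
             'timestamp': [0.0, 12.5]},                # [start, end] of the proposal in seconds
        ]
    },
    'external_data': {'used': 'false', 'details': 'for evaluation'},
}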
Example no. 4
def evaluate_gt(val_loader,
                model_att,
                model_sg,
                criterion,
                idx_to_word,
                epoch=0):
    # Evaluate mode
    model_att.eval()
    model_sg.eval()

    count = 0
    losses = 0.0
    out = {}
    out['version'] = 'VERSION 1.0'
    out['results'] = {}
    out['external_data'] = {}
    out['external_data']['used'] = 'false'
    out['external_data']['details'] = 'for evaluation'

    with torch.no_grad():
        for batch_idx, (data, target, v_name,
                        timestamp) in enumerate(val_loader):
            if args.use_gpu:
                data = data.cuda()
                target = target.cuda()
            data = Variable(data)
            target = Variable(target)

            # Generate sentences
            att = model_att(data)
            att = Variable(att)
            gen_result, output = model_sg(data, att)
            loss = criterion(output.view(-1, output.shape[2]),
                             target[:, 1:].contiguous().view(-1))

            losses += loss.item()

            # Into dict. structure
            gen_sents = idx_to_sent(gen_result, idx_to_word)
            start_times = timestamp[:, 0].data.cpu().numpy()
            end_times = timestamp[:, 1].data.cpu().numpy()

            for i in range(len(gen_sents)):
                if not v_name[i] in out['results']:
                    out['results'][v_name[i]] = []
                temp = {}
                temp['sentence'] = gen_sents[i][0]
                temp['timestamp'] = [
                    float(start_times[i]),
                    float(end_times[i])
                ]
                out['results'][v_name[i]].append(temp)
                count += 1

    print("Check Validation data : {} / {}".format(count,
                                                   len(val_loader.dataset)))

    avg_loss = losses / len(val_loader.dataset)
    print("Validation average loss : {:.4f}".format(avg_loss))

    # Write to JSON
    if not os.path.isdir('./output'):
        os.makedirs('./output')
    json_name = 'output/result_{}_{}.json'.format(args.file_name, str(epoch))
    json.dump(out, open(json_name, 'w'))

    # Evaluate scores
    scores = {}
    evaluator = ANETcaptions(ground_truth_filenames=args.references,
                             prediction_filename=json_name,
                             tious=args.tious,
                             max_proposals=args.max_proposals_per_video,
                             verbose=True)
    evaluator.evaluate()
    print("Validation Scores")
    for metric in evaluator.scores:
        score = evaluator.scores[metric]
        scores[metric] = 100 * sum(score) / float(len(score))
        print('| %s: %2.4f' % (metric, scores[metric]))

    return avg_loss, scores
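
Finally, a minimal sketch of how evaluate_gt might be called from a training loop to track the best validation score. Everything here is hypothetical glue code: num_epochs, train_one_epoch, and the checkpoint path are assumptions, and METEOR is just one possible selection metric.

# Hypothetical validation hook around evaluate_gt (assumed names: num_epochs, train_one_epoch).
best_meteor = 0.0
for epoch in range(num_epochs):
    train_one_epoch(epoch)  # assumed training routine defined elsewhere
    val_loss, val_scores = evaluate_gt(val_loader, model_att, model_sg,
                                       criterion, idx_to_word, epoch=epoch)
    if val_scores.get('METEOR', 0.0) > best_meteor:
        best_meteor = val_scores['METEOR']
        torch.save(model_sg.state_dict(), 'output/best_model_sg.pth')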