def evaluate(loader, model, logger, print_freq=10, sampling_num=5):
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters('test')
    results = []
    end = time.time()
    for i, sample in enumerate(loader):
        batch_size = sample['visual'].size(0)
        # move inputs to the GPU and run the forward pass; no gradients are needed at test time
        with torch.no_grad():
            input_visual = sample['visual'].cuda(non_blocking=True)
            input_answer = sample['answer'].cuda(non_blocking=True)
            input_question = sample['question'].cuda(non_blocking=True)
            output_answer, g_answers, g_answers_score, generated_q = model(input_visual, input_question, input_answer)
        target_answer = sample['answer']
        bleu_score = calculate_bleu_score(generated_q.cpu().data, sample['question'], loader.dataset.wid_to_word)
        acc1, acc5, acc10 = utils.accuracy(output_answer.cpu().data, target_answer, topk=(1, 5, 10))
        meters['acc1'].update(acc1[0], n=batch_size)
        meters['acc5'].update(acc5[0], n=batch_size)
        meters['acc10'].update(acc10[0], n=batch_size)
        meters['bleu_score'].update(bleu_score, n=batch_size)
        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        for j in range(batch_size):
            new_question = generated_q.cpu().data[j].tolist()
            new_answer = g_answers[j]
            new_answer_score = g_answers_score[j]
            sampled_aqa = [[new_question, new_answer, new_answer_score],]
            num_result = {  'gt_question': sample['question'][j][1:].tolist(), #sample['question'][j].numpy(),
                            'gt_answer': sample['answer'][j],
                            'augmented_qa': sampled_aqa,}
            readable_result = {  
                            'gt_question': translate_tokens(sample['question'][j][1:], loader.dataset.wid_to_word), 
                            'gt_answer': loader.dataset.aid_to_ans[sample['answer'][j]], 
                            'augmented_qa': [ [
                                        translate_tokens(item[0], loader.dataset.wid_to_word), # translate question
                                        loader.dataset.aid_to_ans[item[1]], # translate answer
                                        ] for item in sampled_aqa],}
            results.append({'image': sample['image'][j], 
                            'numeric_result': num_result, 
                            'readable_result': readable_result}, )
        # measure elapsed time
        meters['batch_time'].update(time.time() - end, n=batch_size)
        end = time.time()

    print('* [Evaluation] Result: Acc@1:{acc1.avg:.3f}\t'
          'Acc@5:{acc5.avg:.3f}\tAcc@10:{acc10.avg:.3f}\t'
          'Time: {batch_time.avg:.3f}\t'
          'BLEU: {bleu_score.avg:.5f}'.format(
          acc1=meters['acc1'], acc5=meters['acc5'], acc10=meters['acc10'], 
          batch_time=meters['batch_time'], 
          bleu_score=meters['bleu_score']))

    model.module.set_testing(False)
    return results
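The snippets on this page call several helpers (utils.accuracy, utils.correct_k, calculate_bleu_score, translate_tokens) that are defined elsewhere in the repository. For orientation only, a minimal top-k accuracy helper compatible with the utils.accuracy(output, target, topk=(1, 5, 10)) calls above might look like the sketch below; the repository's actual implementation may differ.

import torch

def accuracy(output, target, topk=(1,)):
    # Hypothetical sketch, not the repository's code: percentage of samples whose
    # ground-truth answer id appears among the top-k predicted scores.
    maxk = max(topk)
    batch_size = target.size(0)

    # indices of the maxk highest-scoring answers per sample, transposed to [maxk, batch]
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # a sample is correct@k if the ground truth is among its top k answers
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res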
Example #2
def evaluate(
    loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    logger: logger.Experiment,
    sampling_num: int = 5,
    neptune_exp: Optional[neptune.experiments.Experiment] = None,
) -> List[RESULT]:
    aid_to_ans = loader.dataset.aid_to_ans + ["UNK"]
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters("test")
    res_counter = {
        "correct@1": 0,
        "correct@5": 0,
        "correct@10": 0,
        "n_sample": 0,
    }
    results = []
    end = time.time()

    for sample in loader:
        batch_size = sample["visual"].size(0)
        # move inputs to the GPU and run the forward pass; gradients are not needed at test time
        with torch.no_grad():
            input_visual = sample["visual"].cuda(non_blocking=True)
            input_answer = sample["answer"].cuda(non_blocking=True)
            input_question = sample["question"].long().cuda(non_blocking=True)
            output_answer, g_answers, g_answers_score, generated_q = model(
                input_visual, input_question, input_answer
            )
        target_answer = sample["answer"]
        output_answer_ = output_answer.detach().cpu().numpy()
        bleu_score = calculate_bleu_score(
            generated_q.cpu().data,
            sample["question"],
            loader.dataset.wid_to_word,
        )
        acc1, acc5, acc10 = utils.accuracy(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10)
        )

        correct1, correct5, correct10 = utils.correct_k(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10)
        )

        # track running top-k accuracy and BLEU over the epoch
        meters["acc1"].update(acc1.item(), n=batch_size)
        meters["acc5"].update(acc5.item(), n=batch_size)
        meters["acc10"].update(acc10.item(), n=batch_size)
        meters["bleu_score"].update(bleu_score, n=batch_size)
        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        res_counter["correct@1"] += correct1.item()
        res_counter["correct@5"] += correct5.item()
        res_counter["correct@10"] += correct10.item()
        res_counter["n_sample"] += batch_size

        for j in range(batch_size):
            new_question = generated_q.cpu().data[j].tolist()
            new_answer = g_answers[j]
            given_question = input_question.cpu().data[j].tolist()
            given_question = translate_tokens(
                given_question, loader.dataset.wid_to_word
            )
            predict_answers = np.flip(np.argsort(output_answer_[j]))[:10]
            predict_answers = [
                loader.dataset.aid_to_ans[w] for w in predict_answers
            ]
            new_answer_score = g_answers_score[j]
            sampled_aqa = [[new_question, new_answer, new_answer_score]]

            readable_result = {
                "gt_answer": aid_to_ans[sample["answer"][j]],
                "augmented_qa": [
                    [
                        translate_tokens(
                            item[0], loader.dataset.wid_to_word
                        ),  # translate question
                        aid_to_ans[item[1]],  # translate answer
                    ]
                    for item in sampled_aqa
                ],
                "given_question": given_question,
                "predict_answers": predict_answers,
            }
            results.append(
                {
                    "image": sample["image"][j],
                    "readable_result": readable_result,
                }
            )
        # measure elapsed time
        meters["batch_time"].update(time.time() - end, n=batch_size)
        end = time.time()

    print(
        "* [Evaluation] Result: Acc@1:{acc1.avg:.3f}\t"
        "Acc@5:{acc5.avg:.3f}\tAcc@10:{acc10.avg:.3f}\t"
        "Time: {batch_time.avg:.3f}\t"
        "BLEU: {bleu_score.avg:.5f}".format(
            acc1=meters["acc1"],
            acc5=meters["acc5"],
            acc10=meters["acc10"],
            batch_time=meters["batch_time"],
            bleu_score=meters["bleu_score"],
        )
    )
    print(f"{res_counter['correct@1']} / {res_counter['n_sample']}")

    if neptune_exp is not None:
        neptune_exp.log_metric("Acc@1", meters["acc1"].avg)
        neptune_exp.log_metric("Acc@5", meters["acc5"].avg)
        neptune_exp.log_metric("Acc@10", meters["acc10"].avg)
        neptune_exp.log_metric("N_Correct@1", res_counter["correct@1"])
        neptune_exp.log_metric("N_Samples", res_counter["n_sample"])

    model.module.set_testing(False)
    return results
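Both evaluate variants above turn word-id sequences back into text through translate_tokens and loader.dataset.wid_to_word. A minimal version consistent with those calls is sketched below as a rough guide; the stop id and the handling of special tokens are assumptions, not the repository's actual helper.

def translate_tokens(token_ids, wid_to_word, stop_id=0):
    # Hypothetical sketch: map each word id back to its word via the
    # wid_to_word dictionary and stop at the padding/end id (assumed to be 0).
    words = []
    for wid in token_ids:
        wid = int(wid)
        if wid == stop_id:
            break
        words.append(wid_to_word[wid])
    return ' '.join(words)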
Example #3
def evaluate(loader, model, logger, print_freq=10, sampling_num=5):
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters('test')
    results = []
    end = time.time()
    for i, sample in enumerate(loader):
        batch_size = sample['visual'].size(0)
        # move inputs to the GPU; no gradients are needed at test time
        with torch.no_grad():
            input_visual = sample['visual'].cuda(non_blocking=True)
            input_question = sample['question'].cuda(non_blocking=True)
            # compute output
            output_answer, g_answers, g_answers_score, generated_q = model(
                input_visual, input_question)
        target_answer = sample['answer']
        acc1, acc5, acc10 = utils.accuracy(output_answer.cpu().data,
                                           target_answer,
                                           topk=(1, 5, 10))
        meters['acc1'].update(acc1, n=batch_size)
        meters['acc5'].update(acc5, n=batch_size)
        meters['acc10'].update(acc10, n=batch_size)
        _, g_answers = torch.max(output_answer, dim=1)
        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        for j in range(batch_size):
            sampled_aqa = []
            for k in range(sampling_num):
                new_question = generated_q[k].cpu().data[j].tolist()
                new_answer = g_answers[j, k]
                new_answer_score = g_answers_score[j, k]
                sampled_aqa.append(
                    [new_question, new_answer, new_answer_score])
            num_result = {
                'gt_question': sample['question'][j][1:].tolist(),
                'gt_answer': sample['answer'][j],
                'augmented_qa': sampled_aqa,
            }
            readable_result = {
                'gt_question':
                translate_tokens(sample['question'][j][1:],
                                 loader.dataset.wid_to_word),
                'gt_answer':
                loader.dataset.aid_to_ans[sample['answer'][j]],
                'augmented_qa': [
                    [
                        translate_tokens(
                            item[0],
                            loader.dataset.wid_to_word),  # translate question
                        loader.dataset.aid_to_ans[item[1]],  # translate answer
                    ] for item in sampled_aqa
                ],
            }
            results.append(
                {
                    'image': sample['image'][j],
                    'numeric_result': num_result,
                    'readable_result': readable_result
                }, )
        # measure elapsed time
        meters['batch_time'].update(time.time() - end, n=batch_size)
        end = time.time()

        if (i + 1) % print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                      i + 1,
                      len(loader),
                      batch_time=meters['batch_time']))

    print(
        '** Result: Acc@1:{}\tAcc@5:{}\tAcc@10:{}\tTime: {batch_time.avg:.3f}'.
        format(meters['acc1'].avg,
               meters['acc5'].avg,
               meters['acc10'].avg,
               batch_time=meters['batch_time']))

    model.module.set_testing(False)
    return results
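The first two examples also report a BLEU score for the generated questions via calculate_bleu_score. A plausible stand-in built on NLTK is sketched below, assuming both arguments are [batch, seq_len] tensors of word ids with 0 used for padding; it is an illustration, not the repository's implementation.

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_score(generated, reference, wid_to_word):
    # Hypothetical sketch: average sentence-level BLEU between each generated
    # question and its ground-truth question, after stripping padding ids.
    smooth = SmoothingFunction().method1
    scores = []
    for gen_ids, ref_ids in zip(generated.tolist(), reference.tolist()):
        hypothesis = [wid_to_word.get(w, '<unk>') for w in gen_ids if w != 0]
        ref = [wid_to_word.get(w, '<unk>') for w in ref_ids if w != 0]
        if not hypothesis or not ref:
            scores.append(0.0)
            continue
        scores.append(sentence_bleu([ref], hypothesis, smoothing_function=smooth))
    return sum(scores) / max(len(scores), 1)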