def evaluate(loader, model, logger, print_freq=10, sampling_num=5):
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters('test')
    results = []
    end = time.time()
    for i, sample in enumerate(loader):
        batch_size = sample['visual'].size(0)
        target_answer = sample['answer']

        # move inputs to the GPU and run the forward pass without gradients
        with torch.no_grad():
            input_visual = sample['visual'].cuda(non_blocking=True)
            input_answer = sample['answer'].cuda(non_blocking=True)
            input_question = sample['question'].cuda(non_blocking=True)
            output_answer, g_answers, g_answers_score, generated_q = model(
                input_visual, input_question, input_answer)

        bleu_score = calculate_bleu_score(
            generated_q.cpu().data, sample['question'],
            loader.dataset.wid_to_word)
        acc1, acc5, acc10 = utils.accuracy(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10))
        meters['acc1'].update(acc1[0], n=batch_size)
        meters['acc5'].update(acc5[0], n=batch_size)
        meters['acc10'].update(acc10[0], n=batch_size)
        meters['bleu_score'].update(bleu_score, n=batch_size)

        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        for j in range(batch_size):
            new_question = generated_q.cpu().data[j].tolist()
            new_answer = g_answers[j]
            new_answer_score = g_answers_score[j]
            sampled_aqa = [[new_question, new_answer, new_answer_score]]

            num_result = {
                'gt_question': sample['question'][j][1:].tolist(),
                'gt_answer': sample['answer'][j],
                'augmented_qa': sampled_aqa,
            }
            readable_result = {
                'gt_question': translate_tokens(
                    sample['question'][j][1:], loader.dataset.wid_to_word),
                'gt_answer': loader.dataset.aid_to_ans[sample['answer'][j]],
                'augmented_qa': [
                    [
                        translate_tokens(item[0], loader.dataset.wid_to_word),  # translate question
                        loader.dataset.aid_to_ans[item[1]],  # translate answer
                    ]
                    for item in sampled_aqa
                ],
            }
            results.append({
                'image': sample['image'][j],
                'numeric_result': num_result,
                'readable_result': readable_result,
            })

        # measure elapsed time per batch
        meters['batch_time'].update(time.time() - end, n=batch_size)
        end = time.time()

    print('* [Evaluation] Result: Acc@1:{acc1.avg:.3f}\t'
          'Acc@5:{acc5.avg:.3f}\tAcc@10:{acc10.avg:.3f}\t'
          'Time: {batch_time.avg:.3f}\t'
          'BLEU: {bleu_score.avg:.5f}'.format(
              acc1=meters['acc1'], acc5=meters['acc5'], acc10=meters['acc10'],
              batch_time=meters['batch_time'],
              bleu_score=meters['bleu_score']))
    model.module.set_testing(False)
    return results
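# The evaluation loop above relies on a translate_tokens helper to turn word-id
# sequences back into readable tokens. A minimal sketch of such a helper is
# given below, assuming wid_to_word maps integer ids to words and that sequences
# may be terminated by end/padding tokens; the repository's actual
# implementation may handle those special tokens differently.
def translate_tokens(token_ids, wid_to_word, stop_words=('<eos>', '<pad>')):
    """Map a sequence of word ids to words, stopping at an end/padding token."""
    words = []
    for wid in token_ids:
        word = wid_to_word.get(int(wid), 'UNK')  # fall back to UNK for unknown ids
        if word in stop_words:
            break
        words.append(word)
    return words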
def evaluate(
    loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    logger: logger.Experiment,
    sampling_num: int = 5,
    neptune_exp: Optional[neptune.experiments.Experiment] = None,
) -> List[RESULT]:
    aid_to_ans = loader.dataset.aid_to_ans + ["UNK"]
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters("test")
    res_counter = {
        "correct@1": 0,
        "correct@5": 0,
        "correct@10": 0,
        "n_sample": 0,
    }
    results = []
    end = time.time()
    for sample in loader:
        batch_size = sample["visual"].size(0)
        input_visual = Variable(sample["visual"].cuda())
        input_answer = Variable(sample["answer"].cuda())
        target_answer = sample["answer"]
        input_question = sample["question"].long().cuda()

        output_answer, g_answers, g_answers_score, generated_q = model(
            input_visual, input_question, input_answer
        )
        output_answer_ = output_answer.detach().cpu().numpy()

        bleu_score = calculate_bleu_score(
            generated_q.cpu().data,
            sample["question"],
            loader.dataset.wid_to_word,
        )
        acc1, acc5, acc10 = utils.accuracy(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10)
        )
        correct1, correct5, correct10 = utils.correct_k(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10)
        )

        meters["acc1"].update(acc1.item(), n=batch_size)
        meters["acc5"].update(acc5.item(), n=batch_size)
        meters["acc10"].update(acc10.item(), n=batch_size)
        meters["bleu_score"].update(bleu_score, n=batch_size)

        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        # accumulate number of correct predictions
        res_counter["correct@1"] += correct1.item()
        res_counter["correct@5"] += correct5.item()
        res_counter["correct@10"] += correct10.item()
        res_counter["n_sample"] += batch_size

        for j in range(batch_size):
            new_question = generated_q.cpu().data[j].tolist()
            new_answer = g_answers[j]
            given_question = input_question.cpu().data[j].tolist()
            given_question = translate_tokens(
                given_question, loader.dataset.wid_to_word
            )
            predict_answers = np.flip(np.argsort(output_answer_[j]))[:10]
            predict_answers = [
                loader.dataset.aid_to_ans[w] for w in predict_answers
            ]
            new_answer_score = g_answers_score[j]
            sampled_aqa = [[new_question, new_answer, new_answer_score]]

            readable_result = {
                "gt_answer": aid_to_ans[sample["answer"][j]],
                "augmented_qa": [
                    [
                        translate_tokens(
                            item[0], loader.dataset.wid_to_word
                        ),  # translate question
                        aid_to_ans[item[1]],  # translate answer
                    ]
                    for item in sampled_aqa
                ],
                "given_question": given_question,
                "predict_answers": predict_answers,
            }
            results.append(
                {
                    "image": sample["image"][j],
                    "readable_result": readable_result,
                }
            )

        # measure elapsed time
        meters["batch_time"].update(time.time() - end, n=batch_size)
        end = time.time()

    print(
        "* [Evaluation] Result: Acc@1:{acc1.avg:.3f}\t"
        "Acc@5:{acc5.avg:.3f}\tAcc@10:{acc10.avg:.3f}\t"
        "Time: {batch_time.avg:.3f}\t"
        "BLEU: {bleu_score.avg:.5f}".format(
            acc1=meters["acc1"],
            acc5=meters["acc5"],
            acc10=meters["acc10"],
            batch_time=meters["batch_time"],
            bleu_score=meters["bleu_score"],
        )
    )
    print(f"{res_counter['correct@1']} / {res_counter['n_sample']}")

    if neptune_exp is not None:
        neptune_exp.log_metric("Acc@1", meters["acc1"].avg)
        neptune_exp.log_metric("Acc@5", meters["acc5"].avg)
        neptune_exp.log_metric("Acc@10", meters["acc10"].avg)
        neptune_exp.log_metric("N_Correct@1", res_counter["correct@1"])
        neptune_exp.log_metric("N_Samples", res_counter["n_sample"])

    model.module.set_testing(False)
    return results
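# A hedged usage sketch for the typed evaluate() variant above: the model is
# expected to be wrapped in DataParallel (the function accesses model.module),
# and the experiment logger plus the optional Neptune experiment are assumed to
# be constructed elsewhere. The run_evaluation name is illustrative only.
import torch


def run_evaluation(loader, model, exp_logger, neptune_exp=None):
    # wrap the raw model so that model.module is available inside evaluate()
    model = torch.nn.DataParallel(model).cuda()
    results = evaluate(loader, model, exp_logger,
                       sampling_num=5, neptune_exp=neptune_exp)
    print(f"collected {len(results)} per-image results")
    return results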
def evaluate(loader, model, logger, print_freq=10, sampling_num=5):
    model.eval()
    model.module.set_testing(True, sample_num=sampling_num)
    meters = logger.reset_meters('test')
    results = []
    end = time.time()
    for i, sample in enumerate(loader):
        batch_size = sample['visual'].size(0)
        target_answer = sample['answer']

        # move inputs to the GPU and run the forward pass without gradients
        with torch.no_grad():
            input_visual = sample['visual'].cuda(non_blocking=True)
            input_question = sample['question'].cuda(non_blocking=True)
            output_answer, g_answers, g_answers_score, generated_q = model(
                input_visual, input_question)

        acc1, acc5, acc10 = utils.accuracy(
            output_answer.cpu().data, target_answer, topk=(1, 5, 10))
        meters['acc1'].update(acc1[0], n=batch_size)
        meters['acc5'].update(acc5[0], n=batch_size)
        meters['acc10'].update(acc10[0], n=batch_size)

        # g_answers / g_answers_score hold one generated answer (and its score)
        # per sample and per drawn question
        g_answers = g_answers.cpu().data
        g_answers_score = g_answers_score.cpu().data

        for j in range(batch_size):
            sampled_aqa = []
            for k in range(sampling_num):
                new_question = generated_q[k].cpu().data[j].tolist()
                new_answer = g_answers[j, k]
                new_answer_score = g_answers_score[j, k]
                sampled_aqa.append([new_question, new_answer, new_answer_score])

            num_result = {
                'gt_question': sample['question'][j][1:].tolist(),
                'gt_answer': sample['answer'][j],
                'augmented_qa': sampled_aqa,
            }
            readable_result = {
                'gt_question': translate_tokens(
                    sample['question'][j][1:], loader.dataset.wid_to_word),
                'gt_answer': loader.dataset.aid_to_ans[sample['answer'][j]],
                'augmented_qa': [
                    [
                        translate_tokens(item[0], loader.dataset.wid_to_word),  # translate question
                        loader.dataset.aid_to_ans[item[1]],  # translate answer
                    ]
                    for item in sampled_aqa
                ],
            }
            results.append({
                'image': sample['image'][j],
                'numeric_result': num_result,
                'readable_result': readable_result,
            })

        # measure elapsed time per batch
        meters['batch_time'].update(time.time() - end, n=batch_size)
        end = time.time()

        if (i + 1) % print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                      i + 1, len(loader), batch_time=meters['batch_time']))

    print('** Result: Acc@1:{}\tAcc@5:{}\tAcc@10:{}\tTime: {batch_time.avg:.3f}'.format(
        meters['acc1'].avg, meters['acc5'].avg, meters['acc10'].avg,
        batch_time=meters['batch_time']))
    model.module.set_testing(False)
    return results
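# All three versions call utils.accuracy(output, target, topk=(1, 5, 10)). A
# common top-k routine in the torchvision style is sketched below as an
# assumption about what that helper computes; the repository's utils module may
# differ (e.g. returning raw counts instead of percentages, as utils.correct_k
# appears to do in the typed variant).
import torch


def accuracy(output, target, topk=(1,)):
    """Return top-k accuracies (in %) of `output` logits against `target` class ids."""
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the maxk highest-scoring classes for every sample
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res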