def evaluation():
    test_data = aggregated_loader(args.img_dir, args.sal_dir, args.que_file, args.word2idx, 'test',
                                  args.split_info, args.que_len, args.temporal_step,
                                  args.historical_step, transform, test_mode=True)
    testloader = torch.utils.data.DataLoader(test_data, batch_size=args.batch,
                                             shuffle=False, num_workers=1)  # fix the batch size for evaluation as 1
    model = SWM_agg(embed_size=args.embedding_size, vocab=train_data.word2idx)
    model.load_state_dict(torch.load(os.path.join(args.checkpoint, 'model_best.pth')), strict=True)
    model = model.cuda()
    model.eval()

    eval_score = dict()
    for metric in eval_metrics:
        eval_score[metric] = []

    for i, (img, que, sal, fix) in enumerate(testloader):
        # iterate through different frames of the same video sequence
        img, que = img.cuda(), que.cuda()
        pred = model(img, que)
        if len(pred) > 1:
            pred = pred.data.cpu().numpy().squeeze()
            sal = sal.numpy().squeeze()
            fix = fix.numpy().squeeze()
        else:
            pred = pred.data.cpu().numpy()
            sal = sal.numpy()
            fix = fix.numpy()
        for j in range(len(pred)):
            for k in range(len(pred[j])):
                cur_pred = pred[j, k]  # evaluate on the current frame
                cur_pred = cv2.resize(cur_pred, (256, 128))
                if cur_pred.max() > 0:
                    cur_pred /= cur_pred.max()
                cur_sal = sal[j, k]
                cur_fix = fix[j, k]
                cur_pred = distortion_corr(cur_pred)
                cur_sal = distortion_corr(cur_sal)
                cur_fix = distortion_corr(cur_fix)
                if args.center_bias:
                    cur_pred = add_center_bias(cur_pred)
                eval_score['cc'].append(cal_cc_score(cur_pred, cur_sal))
                eval_score['sim'].append(cal_sim_score(cur_pred, cur_sal))
                eval_score['kld'].append(cal_kld_score(cur_pred, cur_sal))
                eval_score['nss'].append(cal_nss_score(cur_pred, cur_fix))
                eval_score['sauc'].append(cal_sauc_score(cur_pred, cur_fix, test_data.shuf_map))

    print('Evaluation scores for aggregated attention')
    for metric in eval_score.keys():
        print('%s: %f' % (metric.upper(), np.mean(eval_score[metric])))
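# The metric helpers called above (cal_cc_score, cal_sim_score, cal_kld_score,
# cal_nss_score, cal_sauc_score) are imported from elsewhere in the repo. As a
# reference only, the sketches below show the standard saliency-metric
# definitions they are assumed to follow; the exact normalization may differ in
# the actual implementation, and the sAUC helper is omitted because it also
# needs the shuffled fixation map. The "_sketch" names mark them as hypothetical.
import numpy as np


def cal_cc_score_sketch(pred, sal):
    # Pearson linear correlation coefficient between the two maps
    return np.corrcoef(pred.reshape(-1), sal.reshape(-1))[0, 1]


def cal_sim_score_sketch(pred, sal):
    # histogram intersection (SIM) of the two maps normalized to sum to 1
    pred = pred / (pred.sum() + 1e-8)
    sal = sal / (sal.sum() + 1e-8)
    return np.minimum(pred, sal).sum()


def cal_kld_score_sketch(pred, sal):
    # KL divergence of the ground-truth distribution from the prediction
    pred = pred / (pred.sum() + 1e-8)
    sal = sal / (sal.sum() + 1e-8)
    return np.sum(sal * np.log(1e-8 + sal / (pred + 1e-8)))


def cal_nss_score_sketch(pred, fix):
    # Normalized Scanpath Saliency: mean z-scored prediction at fixated pixels
    pred = (pred - pred.mean()) / (pred.std() + 1e-8)
    return pred[fix > 0].mean()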
def validation(iteration):
    # initialize evaluation score
    model.eval()
    eval_score = dict()
    for metric in eval_metrics:
        eval_score[metric] = []

    for i, (img, que, sal, fix) in enumerate(testloader):
        # iterate through different frames of the same video sequence
        img, que = img.cuda(), que.cuda()
        pred = model(img, que)
        if len(pred) > 1:
            pred = pred.data.cpu().numpy().squeeze()
            sal = sal.numpy().squeeze()
            fix = fix.numpy().squeeze()
        else:
            pred = pred.data.cpu().numpy()
            sal = sal.numpy()
            fix = fix.numpy()
        for j in range(len(pred)):
            for k in range(len(pred[j])):
                cur_pred = pred[j, k]  # evaluate on the current frame
                cur_pred = cv2.resize(cur_pred, (256, 128))
                if cur_pred.max() > 0:
                    cur_pred /= cur_pred.max()
                cur_sal = sal[j, k]
                cur_fix = fix[j, k]
                cur_pred = distortion_corr(cur_pred)
                cur_sal = distortion_corr(cur_sal)
                cur_fix = distortion_corr(cur_fix)
                if args.center_bias:
                    cur_pred = add_center_bias(cur_pred)
                eval_score['cc'].append(cal_cc_score(cur_pred, cur_sal))
                eval_score['sim'].append(cal_sim_score(cur_pred, cur_sal))
                eval_score['kld'].append(cal_kld_score(cur_pred, cur_sal))
                eval_score['nss'].append(cal_nss_score(cur_pred, cur_fix))
                eval_score['sauc'].append(cal_sauc_score(cur_pred, cur_fix, test_data.shuf_map))

    with tf_summary_writer.as_default():
        for metric in eval_score.keys():
            tf.summary.scalar(metric.upper(), np.mean(eval_score[metric]), step=iteration)
    return np.mean(eval_score['cc'])
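# distortion_corr and add_center_bias are also defined elsewhere in the repo.
# Given the 256x128 (2:1) map size, the inputs are assumed to be equirectangular
# 360-degree frames; under that assumption, one plausible correction reweights
# each row by the cosine of its latitude (rows near the poles are over-sampled),
# and the center bias multiplies in an equator-centred prior. Both functions
# below are illustrative sketches, not the repo's implementation.
import numpy as np


def distortion_corr_sketch(att_map):
    h, w = att_map.shape
    # latitude of each row, from +pi/2 (top) to -pi/2 (bottom)
    lat = np.linspace(np.pi / 2, -np.pi / 2, h)
    weight = np.cos(lat).reshape(-1, 1)
    return att_map * weight


def add_center_bias_sketch(att_map, sigma_ratio=0.25):
    # hypothetical equator-biased Gaussian prior; sigma_ratio is an assumption
    h, w = att_map.shape
    rows = np.arange(h).reshape(-1, 1)
    prior = np.exp(-((rows - h / 2) ** 2) / (2 * (sigma_ratio * h) ** 2))
    return att_map * prior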
def evaluation():
    test_data = conditional_loader(args.img_dir, args.sal_dir, args.que_file, args.word2idx, 'test',
                                   args.split_info, args.que_len, args.temporal_step,
                                   args.historical_step, transform, test_mode=True)
    testloader = torch.utils.data.DataLoader(test_data, batch_size=args.batch,
                                             shuffle=False, num_workers=1)  # fix the batch size for evaluation as 1
    model = SWM(embed_size=args.embedding_size, vocab=train_data.word2idx)
    model = model.cuda()
    model.load_state_dict(torch.load(os.path.join(args.checkpoint, 'model_best.pth')))
    model.eval()

    eval_score = dict()
    label_pool = ['correct', 'incorrect']
    for cond_label in label_pool:
        eval_score[cond_label] = dict()
        for metric in eval_metrics:
            eval_score[cond_label][metric] = []

    for i, (img, que, cur_sal_pos, cur_fix_pos, cur_sal_neg, cur_fix_neg) in enumerate(testloader):
        # iterate through different frames of the same video sequence
        img, que = img.cuda(), que.cuda()
        pred = model(img, que)
        sal, fix = [], []
        if len(pred) > 1:
            pred = pred.data.cpu().numpy().squeeze()
            sal.append(cur_sal_pos.numpy().squeeze())
            sal.append(cur_sal_neg.numpy().squeeze())
            fix.append(cur_fix_pos.numpy().squeeze())
            fix.append(cur_fix_neg.numpy().squeeze())
        else:
            pred = pred.data.cpu().numpy()
            sal.append(cur_sal_pos.numpy())
            sal.append(cur_sal_neg.numpy())
            fix.append(cur_fix_pos.numpy())
            fix.append(cur_fix_neg.numpy())
        for j in range(len(pred)):
            for k in range(len(pred[j])):
                for cond_idx, cond_label in enumerate(label_pool):
                    cur_pred = pred[j, k, cond_idx]  # only evaluate on the current frame
                    cur_pred = cv2.resize(cur_pred, (256, 128))
                    cur_sal = sal[cond_idx][j, k]
                    cur_fix = fix[cond_idx][j, k]
                    cur_pred = distortion_corr(cur_pred)
                    cur_sal = distortion_corr(cur_sal)
                    cur_fix = distortion_corr(cur_fix)
                    if args.center_bias:
                        cur_pred = add_center_bias(cur_pred)
                    eval_score[cond_label]['cc'].append(cal_cc_score(cur_pred, cur_sal))
                    eval_score[cond_label]['sim'].append(cal_sim_score(cur_pred, cur_sal))
                    eval_score[cond_label]['kld'].append(cal_kld_score(cur_pred, cur_sal))
                    eval_score[cond_label]['nss'].append(cal_nss_score(cur_pred, cur_fix))
                    eval_score[cond_label]['sauc'].append(
                        cal_sauc_score(cur_pred, cur_fix, test_data.shuf_map[cond_idx]))

    for cond_label in eval_score.keys():
        print('Evaluation scores for %s attention' % cond_label)
        for metric in eval_score[cond_label].keys():
            print('%s: %f' % (metric.upper(), np.mean(eval_score[cond_label][metric])))
        print('\n')
def validation(iteration):
    # initialize evaluation score
    model.eval()
    eval_score = dict()
    label_pool = ['correct', 'incorrect']
    for cond_label in label_pool:
        eval_score[cond_label] = dict()
        for metric in eval_metrics:
            eval_score[cond_label][metric] = []

    for i, (img, que, cur_sal_pos, cur_fix_pos, cur_sal_neg, cur_fix_neg) in enumerate(testloader):
        # iterate through different frames of the same video sequence
        img, que = img.cuda(), que.cuda()
        pred = model(img, que)
        sal, fix = [], []
        if len(pred) > 1:
            pred = pred.data.cpu().numpy().squeeze()
            sal.append(cur_sal_pos.numpy().squeeze())
            sal.append(cur_sal_neg.numpy().squeeze())
            fix.append(cur_fix_pos.numpy().squeeze())
            fix.append(cur_fix_neg.numpy().squeeze())
        else:
            pred = pred.data.cpu().numpy()
            sal.append(cur_sal_pos.numpy())
            sal.append(cur_sal_neg.numpy())
            fix.append(cur_fix_pos.numpy())
            fix.append(cur_fix_neg.numpy())
        for j in range(len(pred)):
            for k in range(len(pred[j])):
                for cond_idx, cond_label in enumerate(label_pool):
                    cur_pred = pred[j, k, cond_idx]  # only evaluate on the current frame
                    cur_pred = cv2.resize(cur_pred, (256, 128))
                    cur_sal = sal[cond_idx][j, k]
                    cur_fix = fix[cond_idx][j, k]
                    cur_pred = distortion_corr(cur_pred)
                    cur_sal = distortion_corr(cur_sal)
                    cur_fix = distortion_corr(cur_fix)
                    if args.center_bias:
                        cur_pred = add_center_bias(cur_pred)
                    eval_score[cond_label]['cc'].append(cal_cc_score(cur_pred, cur_sal))
                    eval_score[cond_label]['sim'].append(cal_sim_score(cur_pred, cur_sal))
                    eval_score[cond_label]['kld'].append(cal_kld_score(cur_pred, cur_sal))
                    eval_score[cond_label]['nss'].append(cal_nss_score(cur_pred, cur_fix))
                    eval_score[cond_label]['sauc'].append(
                        cal_sauc_score(cur_pred, cur_fix, test_data.shuf_map[cond_idx]))

    with tf_summary_writer.as_default():
        for cond_label in eval_score.keys():
            for metric in eval_score[cond_label].keys():
                tf.summary.scalar(cond_label + '_' + metric.upper(),
                                  np.mean(eval_score[cond_label][metric]),
                                  step=iteration)
    return np.mean(eval_score['correct']['cc'])
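# validation() relies on the module-level model, testloader, and
# tf_summary_writer, and returns the mean CC on the 'correct' condition, which
# the training script presumably uses to select 'model_best.pth'. A minimal
# sketch of that call pattern is shown below; the loop structure, args.max_iter,
# args.eval_interval, and train_one_step() are assumptions for illustration.
#
# best_cc = -np.inf
# for iteration in range(args.max_iter):
#     train_one_step()                      # placeholder for the real training step
#     if iteration % args.eval_interval == 0:
#         cur_cc = validation(iteration)    # logs metrics to TensorBoard, returns mean CC
#         if cur_cc > best_cc:
#             best_cc = cur_cc
#             torch.save(model.state_dict(),
#                        os.path.join(args.checkpoint, 'model_best.pth'))
#         model.train()                     # validation() leaves the model in eval mode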
def main():
    img_rows, img_cols = 300, 400

    # initializing salicon data
    sal_anno = '/home/eric/Desktop/experiment/salicon/salicon-api/annotations/fixations_train2014.json'
    salicon = SALICON(sal_anno)

    # loading VQA data
    vqa_dict = np.load('valid_data_train.npy')
    question_bank = np.load('question_type.npy')
    answer_bank = np.load('answer_type.npy')
    vqa_dir = '/media/eric/New Volume/VQA/VQA_HAT/vqahat_train'

    # defining data structure
    metrics = ['cc', 'sim', 'kld', 'emd', 'spearmanr']
    que_score = dict()
    ans_score = dict()
    overall_score = dict()
    for question in question_bank:
        que_score[question] = init_metrics(metrics)
    for answer in answer_bank:
        ans_score[answer] = init_metrics(metrics)
    overall_score = init_metrics(metrics)

    # main loop for comparing different attention maps
    nan_count_q = dict()
    nan_count_a = dict()
    nan_corr_q = dict()
    nan_corr_a = dict()
    nan_count = 0
    nan_corr = 0
    for i in question_bank:
        nan_count_q[i] = 0
        nan_corr_q[i] = 0
    for i in answer_bank:
        nan_count_a[i] = 0
        nan_corr_a[i] = 0

    for cur_data in vqa_dict:
        question_id = cur_data['question_id']
        question_type = cur_data['question_type']
        answer_type = cur_data['answer_type']
        img_id = cur_data['img_id']

        # load vqa attention map
        vqa_img = os.path.join(vqa_dir, str(question_id) + '_1.png')
        que_att_map = cv2.imread(vqa_img)
        que_att_map = que_att_map[:, :, 0]
        que_att_map = cv2.resize(que_att_map, (img_cols, img_rows), interpolation=cv2.INTER_LINEAR)
        que_att_map = que_att_map.astype('float32')
        que_att_map /= 255

        # load free-viewing attention map
        annIds = salicon.getAnnIds(img_id)
        anns = salicon.loadAnns(annIds)
        fv_att_map = salicon.showAnns(anns)
        fv_att_map = cv2.resize(fv_att_map, (img_cols, img_rows), interpolation=cv2.INTER_LINEAR)

        # computing scores for different metrics
        cc = cal_cc_score(fv_att_map, que_att_map)
        sim = cal_sim_score(fv_att_map, que_att_map)
        kld = cal_kld_score(fv_att_map, que_att_map)
        emd = cal_emd_score(fv_att_map, que_att_map)
        rank_corr, _ = spearmanr(fv_att_map.reshape(-1), que_att_map.reshape(-1))

        # storing data in a naive way
        if np.isnan(cc):
            cc = 0
            nan_count_q[question_type] += 1
            nan_count_a[answer_type] += 1
            nan_count += 1
        if np.isnan(rank_corr):
            rank_corr = 0
            nan_corr_q[question_type] += 1
            nan_corr_a[answer_type] += 1
            nan_corr += 1

        que_score[question_type]['cc'] += cc
        que_score[question_type]['sim'] += sim
        que_score[question_type]['spearmanr'] += rank_corr
        que_score[question_type]['kld'] += kld
        que_score[question_type]['emd'] += emd
        que_score[question_type]['count'] += 1

        ans_score[answer_type]['cc'] += cc
        ans_score[answer_type]['sim'] += sim
        ans_score[answer_type]['spearmanr'] += rank_corr
        ans_score[answer_type]['kld'] += kld
        ans_score[answer_type]['emd'] += emd
        ans_score[answer_type]['count'] += 1

        overall_score['cc'] += cc
        overall_score['sim'] += sim
        overall_score['spearmanr'] += rank_corr
        overall_score['kld'] += kld
        overall_score['emd'] += emd
        overall_score['count'] += 1

    # computing average score
    for q_type in question_bank:
        for cur_metric in metrics:
            if cur_metric == 'cc':
                que_score[q_type][cur_metric] /= que_score[q_type]['count'] - nan_count_q[q_type]
            elif cur_metric == 'spearmanr':
                que_score[q_type][cur_metric] /= que_score[q_type]['count'] - nan_corr_q[q_type]
            else:
                que_score[q_type][cur_metric] /= que_score[q_type]['count']
    for a_type in answer_bank:
        for cur_metric in metrics:
            if cur_metric == 'cc':
                ans_score[a_type][cur_metric] /= ans_score[a_type]['count'] - nan_count_a[a_type]
            elif cur_metric == 'spearmanr':
                ans_score[a_type][cur_metric] /= ans_score[a_type]['count'] - nan_corr_a[a_type]
            else:
                ans_score[a_type][cur_metric] /= ans_score[a_type]['count']
    for cur_metric in metrics:
        if cur_metric == 'cc':
            overall_score[cur_metric] /= overall_score['count'] - nan_count
        elif cur_metric == 'spearmanr':
            overall_score[cur_metric] /= overall_score['count'] - nan_corr
        else:
            overall_score[cur_metric] /= overall_score['count']

    np.save('question_score', que_score)
    np.save('answer_score', ans_score)
    np.save('overall_score', overall_score)
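# init_metrics is used by both main() and multi_question() but is not defined in
# this file. From its usage (per-metric running sums plus a 'count' field that is
# later used as the divisor), it presumably builds a zero-initialized accumulator
# along these lines; this is an inferred sketch, not the repo's definition.
def init_metrics(metrics):
    score = {metric: 0.0 for metric in metrics}
    score['count'] = 0
    return score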
def multi_question():
    img_rows, img_cols = 300, 400
    vqa_dir = '/media/eric/New Volume/VQA/VQA_HAT/vqahat_train'
    IQ_pair = np.load('multi_question.npy')
    metrics = ['cc', 'sim', 'kld', 'emd', 'spearmanr']
    inter_score = dict()
    score = init_metrics(metrics)

    # main loop for comparing different attention maps
    nan_cc = 0
    nan_corr = 0
    for cur in IQ_pair.item():
        sal_map = []
        for q_id in IQ_pair.item()[cur]:
            I_dir = os.path.join(vqa_dir, str(q_id) + '_1.png')
            I = cv2.imread(I_dir)
            I = cv2.resize(I, (img_cols, img_rows), interpolation=cv2.INTER_LINEAR)
            I = I[:, :, 0]
            I = I.astype('float32')
            sal_map.append(I)
        tmp_pair = [(0, 1), (0, 2), (1, 2)] if len(sal_map) == 3 else [(0, 1)]
        if len(sal_map) == 1:
            continue

        tmp_cc = 0
        tmp_kld = 0
        tmp_sim = 0
        tmp_corr = 0
        nan_corr_ = 0
        nan_cc_ = 0
        for pair in tmp_pair:
            cc = cal_cc_score(sal_map[pair[0]], sal_map[pair[1]])
            tmp_kld += cal_kld_score(sal_map[pair[0]], sal_map[pair[1]])
            tmp_sim += cal_sim_score(sal_map[pair[0]], sal_map[pair[1]])
            corr, _ = spearmanr(sal_map[pair[0]].reshape(-1), sal_map[pair[1]].reshape(-1))
            if np.isnan(cc):
                nan_cc_ += 1
            else:
                tmp_cc += cc
            if np.isnan(corr):
                nan_corr_ += 1
            else:
                tmp_corr += corr

        score['count'] += 1
        score['kld'] += tmp_kld / len(sal_map)
        score['sim'] += tmp_sim / len(sal_map)
        if len(sal_map) - nan_cc_ > 0:
            score['cc'] += tmp_cc / (len(sal_map) - nan_cc_)
        else:
            nan_cc += 1
        if len(sal_map) - nan_corr_ > 0:
            score['spearmanr'] += tmp_corr / (len(sal_map) - nan_corr_)
        else:
            nan_corr += 1

    for metric in metrics:
        if metric == 'cc':
            score[metric] /= score['count'] - nan_cc
        elif metric == 'spearmanr':
            score[metric] /= score['count'] - nan_corr
        else:
            score[metric] /= score['count']
    np.save('multi_question_score', score)
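# multi_question() expects multi_question.npy to hold a pickled dict (hence the
# .item() calls on the loaded 0-d object array) that maps an image to the ids of
# the 2-3 questions asked about it. The example below shows the assumed layout;
# the keys and question ids are made up for illustration, and newer NumPy
# versions would additionally need allow_pickle=True when loading the file.
import numpy as np

iq_pair_example = {
    458752: [458752000, 458752001, 458752002],  # three questions on one image
    262148: [262148000, 262148001],             # two questions on another
}
np.save('multi_question_example.npy', iq_pair_example)
# np.load('multi_question_example.npy', allow_pickle=True).item() recovers the dict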