def compute_mrr(D, I, qids, ref_dict):
    """Compute MRR@10 for a distributed nearest-neighbor search.

    Each worker contributes its local score/pid matrices; they are gathered
    and merged on the first worker, which keeps the 10 best-scoring unique
    pids per query and scores them against the reference judgments.

    Args:
        D: local score matrix, shape (num_queries, local_k).
        I: local pid matrix aligned with ``D`` (negative pids are padding).
        qids: query ids, one per row of ``D``/``I``.
        ref_dict: qid -> relevant pids mapping, as consumed by
            ``compute_metrics``.

    Returns:
        float: MRR@10 on the first worker; 0.0 on every other worker.
    """
    knn_pkl = {"D": D, "I": I}
    all_knn_list = all_gather(knn_pkl)
    mrr = 0.0
    if is_first_worker():
        # Concatenate the per-worker neighbor lists along the k axis.
        D_merged = concat_key(all_knn_list, "D", axis=1)
        I_merged = concat_key(all_knn_list, "I", axis=1)
        print(D_merged.shape, I_merged.shape)
        # We pad with negative pids and distance -128 - if they make it
        # to the top we have a problem.
        idx = np.argsort(D_merged, axis=1)[:, ::-1][:, :10]
        sorted_I = np.take_along_axis(I_merged, idx, axis=1)
        candidate_dict = {}
        for i, qid in enumerate(qids):
            seen_pids = set()
            if qid not in candidate_dict:
                candidate_dict[qid] = [0] * 1000
            j = 0
            for pid in sorted_I[i]:
                # Skip padding (negative pids) and duplicates (a document
                # may contribute multiple vectors).
                if pid >= 0 and pid not in seen_pids:
                    candidate_dict[qid][j] = pid
                    j += 1
                    seen_pids.add(pid)
        # The boolean "allowed" result is unused here; only report the message.
        _, message = quality_checks_qids(ref_dict, candidate_dict)
        if message != '':
            print(message)
        mrr_metrics = compute_metrics(ref_dict, candidate_dict)
        mrr = mrr_metrics["MRR @10"]
        print(mrr)
    return mrr
def compute_mrr_last(D, I, qids, ref_dict, dev_query_positive_id):
    """Compute MRR@10 and Recall@1000 for a distributed NN search.

    Like ``compute_mrr`` but keeps the top 1000 unique pids per query and
    additionally evaluates recall with pytrec_eval against the positive
    qrels.

    Args:
        D: local score matrix, shape (num_queries, local_k).
        I: local pid matrix aligned with ``D`` (negative pids are padding).
        qids: query ids, one per row of ``D``/``I``.
        ref_dict: qid -> relevant pids mapping for ``compute_metrics``.
        dev_query_positive_id: qrels mapping for pytrec_eval recall.

    Returns:
        tuple: (mrr@10, recall@1000) on the first worker; (0.0, 0.0)
        on every other worker.
    """
    knn_pkl = {"D": D, "I": I}
    all_knn_list = all_gather(knn_pkl)
    mrr = 0.0
    final_recall = 0.0
    if is_first_worker():
        prediction = {}
        D_merged = concat_key(all_knn_list, "D", axis=1)
        I_merged = concat_key(all_knn_list, "I", axis=1)
        print(D_merged.shape, I_merged.shape)
        # We pad with negative pids and distance -128 - if they make it
        # to the top we have a problem.
        idx = np.argsort(D_merged, axis=1)[:, ::-1][:, :1000]
        sorted_I = np.take_along_axis(I_merged, idx, axis=1)
        candidate_dict = {}
        for i, qid in enumerate(qids):
            seen_pids = set()
            if qid not in candidate_dict:
                prediction[qid] = {}
                candidate_dict[qid] = [0] * 1000
            j = 0
            for pid in sorted_I[i]:
                # Skip padding (negative pids) and duplicates (a document
                # may contribute multiple vectors).
                if pid >= 0 and pid not in seen_pids:
                    candidate_dict[qid][j] = pid
                    # pytrec_eval ranks by descending score, so use -rank.
                    prediction[qid][pid] = -(j + 1)
                    j += 1
                    seen_pids.add(pid)
        # The boolean "allowed" result is unused here; only report the message.
        _, message = quality_checks_qids(ref_dict, candidate_dict)
        if message != '':
            print(message)
        mrr_metrics = compute_metrics(ref_dict, candidate_dict)
        mrr = mrr_metrics["MRR @10"]
        print(mrr)
        evaluator = pytrec_eval.RelevanceEvaluator(
            convert_to_string_id(dev_query_positive_id), {'recall'})
        eval_query_cnt = 0
        recall = 0
        topN = 1000
        result = evaluator.evaluate(convert_to_string_id(prediction))
        for k in result.keys():
            eval_query_cnt += 1
            recall += result[k]["recall_" + str(topN)]
        # Guard against an empty evaluation result (no judged queries).
        final_recall = recall / eval_query_cnt if eval_query_cnt else 0.0
        print('final_recall: ', final_recall)
    return mrr, final_recall
def EvalDevQuery(query_embedding2id, passage_embedding2id, dev_query_positive_id, I_nearest_neighbor, topN,
                 rerankTopN=-1, dev_query_embedding=None, passage_embedding=None, lambda_test=None, wt_emb=1.0, f=1):
    """Evaluate dev-set retrieval with pytrec_eval and MS MARCO MRR.

    Args:
        query_embedding2id: maps a query row index to its query id.
        passage_embedding2id: maps a passage row index to its pid.
        dev_query_positive_id: qrels, qid -> {pid: relevance}.
        I_nearest_neighbor: per-query ranked passage indices.
        topN: evaluation depth (also used for the recall cut).
        rerankTopN: if > 0, rerank the top candidates with the residual
            learning score before evaluation.
        dev_query_embedding, passage_embedding, lambda_test, wt_emb, f:
            passed through to ``rerankWithResidualLearningScore`` when
            reranking is enabled.

    Returns:
        tuple of (final_ndcg, ndcgs, mrrs, recalls, eval_query_cnt,
        final_Map, final_mrr, final_recall, hole_rate, ms_mrr,
        Ahole_rate, result, prediction).
    """
    if rerankTopN > 0:
        # Residual scores themselves are not needed here, only the reranked ids.
        I_nearest_neighbor, _ = rerankWithResidualLearningScore(
            I_nearest_neighbor, rerankTopN, query_embedding2id, passage_embedding2id,
            dev_query_embedding, passage_embedding, lambda_test, wt_emb, fold=f)

    # [qid][docid] = docscore; we use -rank as score, so the higher the
    # rank (1 > 2), the higher the score (-1 > -2).
    prediction = {}

    # Hole-rate counters: "labeled"/"Alabeled" count retrieved pids that are
    # NOT judged positive (top-10 and top-N respectively).
    total = 0
    labeled = 0
    Atotal = 0
    Alabeled = 0
    qids_to_ranked_candidate_passages = {}
    for query_idx in range(len(I_nearest_neighbor)):
        seen_pid = set()
        query_id = query_embedding2id[query_idx]
        prediction[query_id] = {}
        top_ann_pid = I_nearest_neighbor[query_idx].copy()
        selected_ann_idx = top_ann_pid[:topN]
        rank = 0
        if query_id not in qids_to_ranked_candidate_passages:
            # By default, all PIDs in the list of 1000 are 0.
            # Only override those that are given.
            qids_to_ranked_candidate_passages[query_id] = [0] * 1000
        for idx in selected_ann_idx:
            pred_pid = passage_embedding2id[idx]
            if pred_pid not in seen_pid:  # handles multiple vectors per document
                qids_to_ranked_candidate_passages[query_id][rank] = pred_pid
                Atotal += 1
                if pred_pid not in dev_query_positive_id[query_id]:
                    Alabeled += 1
                if rank < 10:
                    total += 1
                    if pred_pid not in dev_query_positive_id[query_id]:
                        labeled += 1
                rank += 1
                prediction[query_id][pred_pid] = -rank
                seen_pid.add(pred_pid)

    # Use the out-of-the-box evaluation script.
    evaluator = pytrec_eval.RelevanceEvaluator(
        convert_to_string_id(dev_query_positive_id),
        {'map_cut', 'ndcg_cut', 'recip_rank', 'recall'})

    eval_query_cnt = 0
    result = evaluator.evaluate(convert_to_string_id(prediction))

    qids_to_relevant_passageids = {}
    for qid in dev_query_positive_id:
        qid = int(qid)
        if qid not in qids_to_relevant_passageids:
            qids_to_relevant_passageids[qid] = [
                pid for pid in dev_query_positive_id[qid] if pid > 0]

    ms_mrr = compute_metrics(qids_to_relevant_passageids, qids_to_ranked_candidate_passages)

    ndcg = 0
    Map = 0
    mrr = 0
    recall = 0
    ndcgs = []
    mrrs = []
    recalls = []
    for k in result.keys():
        eval_query_cnt += 1
        ndcg += result[k]["ndcg_cut_10"]
        ndcgs.append(result[k]["ndcg_cut_10"])
        Map += result[k]["map_cut_10"]
        mrr += result[k]["recip_rank"]
        mrrs.append(result[k]["recip_rank"])
        recall += result[k]["recall_" + str(topN)]
        recalls.append(result[k]["recall_" + str(topN)])

    # Guard all averages against empty inputs (no evaluated queries /
    # no retrieved candidates) instead of raising ZeroDivisionError.
    final_ndcg = ndcg / eval_query_cnt if eval_query_cnt else 0.0
    final_Map = Map / eval_query_cnt if eval_query_cnt else 0.0
    final_mrr = mrr / eval_query_cnt if eval_query_cnt else 0.0
    final_recall = recall / eval_query_cnt if eval_query_cnt else 0.0
    hole_rate = labeled / total if total else 0.0
    Ahole_rate = Alabeled / Atotal if Atotal else 0.0

    return (final_ndcg, ndcgs, mrrs, recalls, eval_query_cnt, final_Map, final_mrr,
            final_recall, hole_rate, ms_mrr, Ahole_rate, result, prediction)