def test_ndcg():
    x = []
    k = 4
    assert_eq(ndcg.dcg(x, k), 0)

    x = [1, 1, 1]
    k = 5
    assert_eq(ndcg.dcg(x, k), 1 * (1 + 1 / np.log2(3) + 1 / np.log2(4)))
    k = 1
    assert_eq(ndcg.dcg(x, k), 1)

    x = [[1, 1, 1], [1, 2, 3]]
    k = 3
    res = ndcg.dcg(x, k)
    assert_eq(res[0], 1 * (1 + 1 / np.log2(3) + 1 / np.log2(4)))
    assert_eq(res[1], 1 + 3 / np.log2(3) + 7 / np.log2(4))

    x = [1, 2, 3]
    opt_x = [3, 2, 1]
    k = 3
    assert_eq(ndcg.ndcg(x, k), ndcg.dcg(x, k) / ndcg.dcg(opt_x, k))

    x = [1, 1, 1]
    assert_eq(ndcg.ndcg(x, k), 1)

    x = []
    assert_eq(ndcg.ndcg(x, k), 0)
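# A minimal sketch of the `ndcg` module that the assertions above appear to
# exercise: exponential gains (2**rel - 1), a log2(rank + 1) discount,
# truncation at k, batched 2-D input, and nDCG normalized by the DCG of the
# descending-sorted relevances. The function names `dcg`/`ndcg` come from the
# test; the implementation details below are an assumption, not the original.
import numpy as np

def _dcg_rows(rel, k):
    """Per-row DCG@k for a 2-D relevance array."""
    rel = rel[:, :k]
    if rel.shape[1] == 0:
        return np.zeros(rel.shape[0])
    gains = 2.0 ** rel - 1.0                              # exponential gain
    discounts = np.log2(np.arange(2, rel.shape[1] + 2))   # log2(rank + 1)
    return np.sum(gains / discounts, axis=1)

def dcg(relevances, k):
    rel = np.atleast_2d(np.asarray(relevances, dtype=float))
    scores = _dcg_rows(rel, k)
    # Scalar for 1-D input, one score per row for batched input.
    return scores if np.ndim(relevances) == 2 else scores[0]

def ndcg(relevances, k):
    rel = np.atleast_2d(np.asarray(relevances, dtype=float))
    actual = _dcg_rows(rel, k)
    ideal = _dcg_rows(np.sort(rel, axis=1)[:, ::-1], k)
    scores = np.where(ideal > 0, actual / np.where(ideal > 0, ideal, 1.0), 0.0)
    return scores if np.ndim(relevances) == 2 else scores[0]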
def predict(estimator, test_path, best_dir, target_dir):
    input_fn_for_test = lambda: input_fn(test_path, 0)
    output_results = estimator.predict(
        input_fn_for_test,
        checkpoint_path=tf.train.latest_checkpoint(best_dir))
    predict_result_dict = {}
    true_result_dict = {}
    for output_result in output_results:
        for i in range(output_result['ans_num'][0]):
            question = output_result['que_ans'][i].decode('utf8')
            answer = output_result['que_ans'][i + FLAGS.neg_num + 1].decode('utf8')
            if question not in true_result_dict:
                true_result_dict[question] = {}
            if i == 0:
                # The first answer is the positive one; the rest are negatives.
                true_result_dict[question][answer] = 1
            else:
                true_result_dict[question][answer] = 0
            if question not in predict_result_dict:
                predict_result_dict[question] = {}
            predict_result_dict[question][answer] = float(
                output_result['output_rank'][i])
    logger.info(best_dir)

    total_item_result = []
    for master_key in predict_result_dict:
        complete_string = master_key + '\x01'
        temp_string = ''
        # Sort candidates by predicted score in descending order.
        sorted_final_sim_dict = sorted(predict_result_dict[master_key].items(),
                                       reverse=True,
                                       key=lambda x: x[1])
        for item in sorted_final_sim_dict:
            if item[0] in complete_string:
                # Skip the item itself so it is not listed among its own similar items.
                continue
            temp_string += item[0] + '_' + str(item[1]) + ','
        if temp_string != '':
            complete_string += temp_string.rstrip(',')
        total_item_result.append(complete_string)
    # Write the DSSM similarity results.
    write_content(total_item_result, target_dir)

    total_ndcg = 0
    for key in predict_result_dict:
        label_list = []
        # Keep the 5 answers with the highest predicted scores.
        sorted_answer_score = sorted(predict_result_dict[key].items(),
                                     reverse=True,
                                     key=lambda x: x[1])[:5]
        for item in sorted_answer_score:
            assert item[0] in true_result_dict[key]
            label_list.append(true_result_dict[key][item[0]])
        total_ndcg += ndcg(label_list, top_n=5)
    logger.info('Average ndcg@5 is {}'.format(total_ndcg / len(predict_result_dict)))
    return predict_result_dict
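# The ndcg(label_list, top_n=5) helper used by predict() is not shown in this
# snippet. A minimal sketch, assuming binary relevance labels already in ranked
# order, a log2(rank + 1) discount, and an ideal DCG computed from the sorted
# labels; the real project may use a different gain or discount scheme.
import math

def ndcg(labels, top_n=5):
    labels = labels[:top_n]
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(labels))
    ideal = sum(rel / math.log2(i + 2)
                for i, rel in enumerate(sorted(labels, reverse=True)))
    return dcg / ideal if ideal > 0 else 0.0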
def measure_correlations(responsiveness, pyramid_scores, predictions):
    """
    Computes the correlations:
        - between predictions and responsiveness for the 3 metrics
        - between predictions and pyramid scores for the 3 metrics
    and returns the results in a dictionary.
    """
    r_to_pyramid = pearsonr(pyramid_scores, predictions)[0]
    rho_to_pyramid = spearmanr(pyramid_scores, predictions)[0]
    ndcg_to_pyramid = ndcg(predictions, pyramid_scores, 10)

    r_to_responsiveness = pearsonr(responsiveness, predictions)[0]
    rho_to_responsiveness = spearmanr(responsiveness, predictions)[0]
    ndcg_to_responsiveness = ndcg(predictions, responsiveness, 10)

    return {
        'r_to_pyramid': r_to_pyramid,
        'rho_to_pyramid': rho_to_pyramid,
        'ndcg_to_pyramid': ndcg_to_pyramid,
        'r_to_responsiveness': r_to_responsiveness,
        'rho_to_responsiveness': rho_to_responsiveness,
        'ndcg_to_responsiveness': ndcg_to_responsiveness
    }
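# measure_correlations assumes scipy.stats' pearsonr/spearmanr plus an
# ndcg(predictions, targets, k) helper that ranks items by the predicted
# scores and takes its gains from the gold scores. A minimal sketch of such a
# helper (named ndcg_at_k here to mark it as hypothetical; the original
# implementation is not shown and may differ):
import numpy as np
from scipy.stats import pearsonr, spearmanr

def ndcg_at_k(predictions, targets, k):
    targets = np.asarray(targets, dtype=float)
    order = np.argsort(predictions)[::-1][:k]            # rank by predictions
    gains = targets[order]
    dcg = np.sum(gains / np.log2(np.arange(2, len(gains) + 2)))
    ideal_gains = np.sort(targets)[::-1][:k]
    idcg = np.sum(ideal_gains / np.log2(np.arange(2, len(ideal_gains) + 2)))
    return dcg / idcg if idcg > 0 else 0.0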
def test_param_stomping(self):
    # None of the metric functions should mutate the caller's array in place.
    testval_init = np.asarray([3, 2, 3, 0])
    testval = np.asarray([3, 2, 3, 0])

    x = ndcg.ndcg(testval, 4, False)
    self.assertEqual(testval_init.tolist(), testval.tolist())

    x = ndcg.idcg(testval, False)
    self.assertEqual(testval_init.tolist(), testval.tolist())

    x = ndcg.dcg(testval, False)
    self.assertEqual(testval_init.tolist(), testval.tolist())

    x = ndcg.cum_gain(testval)
    self.assertEqual(testval_init.tolist(), testval.tolist())
def py_eval_func(target_scores, predict_scores, qids):
    target_scores = list(target_scores.numpy())
    predict_scores = list(predict_scores.numpy())
    qids = list(qids.numpy())

    # Group (target, prediction) pairs by query id.
    x = list(zip(target_scores, predict_scores, qids))
    groups = {}
    for e in x:
        if e[2] in groups:
            groups[e[2]].append((e[0], e[1]))
        else:
            groups[e[2]] = [(e[0], e[1])]

    cum_ndcg = []
    for _, g in groups.items():
        # Rank each query's results by predicted score, then score the
        # target relevances in that order.
        sorted_g = sorted(g, key=lambda x: x[1], reverse=True)
        pos = range(1, len(sorted_g) + 1)
        rel = [e[0] for e in sorted_g]
        partial_ndcg = ndcg(pos, rel)
        if partial_ndcg > 0.:
            # print(partial_ndcg)
            cum_ndcg.append(partial_ndcg)
    return np.asarray(cum_ndcg, dtype=np.float32)
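# py_eval_func calls .numpy() on its arguments, so it is meant to run eagerly.
# A hedged sketch of how such a function is typically hooked into a TensorFlow
# graph via tf.py_function; the wrapper name ndcg_metric and the reduction to
# a mean are illustrative assumptions, not part of the original snippet.
import tensorflow as tf

def ndcg_metric(target_scores, predict_scores, qids):
    per_query_ndcg = tf.py_function(
        func=py_eval_func,
        inp=[target_scores, predict_scores, qids],
        Tout=tf.float32)
    # Average the per-query nDCG values returned by py_eval_func.
    return tf.reduce_mean(per_query_ndcg)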
def test_ndcg_nranks(self):
    self.assertAlmostEqual(0.9491769, ndcg.ndcg([3, 2, 3, 0], 4, False))
    self.assertAlmostEqual(0.9491769, ndcg.ndcg([3, 2, 3], 4, False))
def test_ndcg(self):
    # from wikipedia
    self.assertAlmostEqual(0.9315085, ndcg.ndcg([3, 2, 3, 0, 1, 2], 6, False))
def test_ndcg_zeros(self):
    self.assertAlmostEqual(0.0, ndcg.ndcg([0, 0, 0, 0], 6, False))
def test_ndcg_none(self):
    self.assertAlmostEqual(0.0, ndcg.ndcg([], 0, False))
    self.assertAlmostEqual(0.0, ndcg.ndcg(np.asarray([]), 0, False))
    self.assertAlmostEqual(0.0, ndcg.ndcg(None, 0, False))
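# The expected values in test_ndcg_nranks / test_ndcg / test_ndcg_zeros /
# test_ndcg_none above are consistent with the "traditional" DCG formulation,
# DCG = rel_1 + sum_{i>=2} rel_i / log2(i), with linear gains when the third
# argument (presumably an alternate-scoring flag) is False. A minimal sketch
# that reproduces those numbers; the tested module also provides idcg() and
# cum_gain(), which are not sketched here.
import numpy as np

def _dcg_traditional(rels, nranks):
    rels = np.zeros(nranks) if rels is None or len(rels) == 0 else \
        np.pad(np.asarray(rels, dtype=float)[:nranks],
               (0, max(0, nranks - len(rels))))
    # Discounts 1, 1, log2(3), log2(4), ... so the first two ranks are undiscounted.
    discounts = np.maximum(np.log2(np.arange(1, nranks + 1)), 1.0)
    return float(np.sum(rels / discounts))

def _ndcg_traditional(rels, nranks):
    if rels is None or len(rels) == 0 or nranks <= 0:
        return 0.0
    ideal = _dcg_traditional(sorted(rels, reverse=True), nranks)
    return _dcg_traditional(rels, nranks) / ideal if ideal > 0 else 0.0

# _ndcg_traditional([3, 2, 3, 0], 4)       -> ~0.9491769
# _ndcg_traditional([3, 2, 3, 0, 1, 2], 6) -> ~0.9315085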
def main():
    scores = parse()
    print('DCG: %f' % dcg(scores))
    print('NDCG: %f' % ndcg(scores))
    print('pFound: %f' % pfound(scores))
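# main() reports a pfound() metric alongside dcg/ndcg; parse(), dcg() and
# ndcg() are defined elsewhere in the original script. A hedged sketch of the
# standard pFound definition (the cascade metric popularized by Yandex),
# assuming `scores` is a ranked list of relevance probabilities in [0, 1] and
# a break probability of 0.15; the original may use different parameters.
def pfound(scores, p_break=0.15):
    p_look = 1.0          # probability the user examines the current result
    total = 0.0
    for p_rel in scores:
        total += p_look * p_rel
        p_look *= (1.0 - p_rel) * (1.0 - p_break)
    return total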
assert (len(score) == len(ret_products))
assert (len(nz) == len(ret_products))
for p, v, s in zip(ret_products, nz, score):
    p['rel'] = v[1]
    p['score'] = s
# for p in ret_products:
#     if p.get('score') is None:
#         print(p)
sorted_products = sorted(ret_products, key=lambda x: x.get('score'), reverse=True)
ret_ndcg = ndcg(range(1, len(sorted_products) + 1),
                [x['rel'] for x in sorted_products])
if ret_ndcg > 0.:
    # Bucket queries by their nDCG so they can be inspected separately.
    if ret_ndcg > 0.7:
        f1.write(query + "\n")
    if ret_ndcg <= 0.7 and ret_ndcg > 0.6:
        f2.write(query + "\n")
    if ret_ndcg <= 0.6:
        f3.write(query + "\n")
    count += 1
    cum_ndcg += ret_ndcg
    if count % 200 == 0:
        print("Processed %d queries, ndcg: %0.4f" % (count, cum_ndcg / count))
        f1.flush()
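# Both py_eval_func and the fragment above call an ndcg(positions, relevances)
# helper that takes 1-based rank positions plus the relevance grades already
# in ranked order. A minimal sketch of such a helper, assuming linear gains
# and a log2(position + 1) discount; the original implementation is not shown
# here and may differ.
import numpy as np

def ndcg(positions, relevances):
    positions = np.asarray(list(positions), dtype=float)
    rels = np.asarray(list(relevances), dtype=float)
    if rels.size == 0:
        return 0.0
    discounts = np.log2(positions + 1.0)
    dcg = np.sum(rels / discounts)
    idcg = np.sum(np.sort(rels)[::-1] / discounts)
    return float(dcg / idcg) if idcg > 0 else 0.0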
def evaluate(rel_file, retr_file, write_flag):
    f = open(rel_file, 'r')
    data_rel = json.load(f)
    f.close()
    f = open(retr_file, 'r')
    data_retr = json.load(f)
    f.close()

    total_relevant = 0
    rel_found = 0
    total_retrieved = 0
    macro_precision = 0
    macro_recall = 0
    t_ndcg = np.zeros((1, 3))
    sumAIP = 0
    sumIP = np.zeros((1, 11))
    sumAP = 0
    sum_prfAIP = 0
    sum_prfIP = np.zeros((1, 11))
    n_questions = len(data_retr["questions"])
    np.set_printoptions(threshold=np.inf)

    for i in range(n_questions):
        # print(i)
        data_rel_id = data_rel["questions"][i]["id"]
        data_retr_id = data_retr["questions"][i]["id"]
        assert data_rel_id == data_retr_id

        relevant_docs = data_rel["questions"][i]["documents"]
        rel_docs = [doc.split('/')[4] for doc in relevant_docs]
        # rel_docs = relevant_docs
        retr_docs = map(str, data_retr["questions"][i]["retrieved"])
        retr_docs = [x.strip() for x in retr_docs]

        n_rel_docs = len(data_rel["questions"][i]["documents"])
        n_retr_docs = len(data_retr["questions"][i]["retrieved"])
        total_relevant += n_rel_docs
        total_retrieved += n_retr_docs

        common = set(rel_docs) & set(retr_docs)
        n_common = len(common)
        rel_found += n_common
        if n_retr_docs != 0:
            macro_precision += (n_common / n_retr_docs)
        if n_rel_docs != 0:
            macro_recall += (n_common / n_rel_docs)

        curve, avg_precision, int_curve = pr_rec(rel_docs, retr_docs)
        perfect_curve, perfect_int_curve = perfect_reranking(rel_docs, retr_docs, n_common)
        t_ndcg = np.add(t_ndcg, ndcg(rel_docs, retr_docs))
        sumAIP += np.sum(int_curve) / 11
        sumIP = np.add(sumIP, int_curve)
        sum_prfAIP += np.sum(perfect_int_curve, 1) / 11
        sum_prfIP = np.add(sum_prfIP, perfect_int_curve)
        sumAP += avg_precision

    mean_ndcg = np.divide(t_ndcg, n_questions)
    micro_precision = rel_found / total_retrieved
    macro_precision = macro_precision / n_questions
    micro_recall = rel_found / total_relevant
    macro_recall = macro_recall / n_questions
    MAP = sumAP / n_questions
    MAIP = sumAIP / n_questions
    MIP = np.divide(sumIP, n_questions)
    prf_MAIP = sum_prfAIP / n_questions
    prf_MIP = np.divide(sum_prfIP, n_questions)

    print('\n')
    print('Micro-Average Precision: ' + str("{0:.4f}".format(micro_precision)))
    print('Macro-Average Precision: ' + str("{0:.4f}".format(macro_precision)))
    print('Micro-Average Recall: ' + str("{0:.4f}".format(micro_recall)))
    print('Macro-Average Recall: ' + str("{0:.4f}".format(macro_recall)))
    print('MAP: ' + str("{0:.4f}".format(MAP)))
    print('MAIP: ' + str("{0:.4f}".format(MAIP)))
    print('MIP: ' + str(list(map("{0:.4f}".format, MIP[0]))))
    print('nDCG@20: ' + str("{0:.4f}".format(mean_ndcg[0][0])))
    print('nDCG@100: ' + str("{0:.4f}".format(mean_ndcg[0][1])) + '\n')
    print('Perfect Reranking: ')
    print('MAIP: ' + str("{0:.4f}".format(prf_MAIP[0])))
    print('MIP: ' + str(list(map("{0:.4f}".format, prf_MIP[0]))))

    if write_flag != '-n':
        with open('evaluation.txt', 'a+') as outfile:
            outfile.write(retr_file)
            outfile.write(' %.4f' % (micro_precision))
            outfile.write(' %.4f' % (macro_precision))
            outfile.write(' %.4f' % (micro_recall))
            outfile.write(' %.4f' % (macro_recall))
            outfile.write(' %.4f' % (MAP))
            outfile.write(' %.4f' % (MAIP))
            outfile.write(' ')
            np.savetxt(outfile, MIP, delimiter=' ', fmt='%.4f', newline=' ')
            outfile.write('%.4f ' % (prf_MAIP[0]))
            np.savetxt(outfile, prf_MIP, delimiter=' ', fmt='%.4f', newline=' ')
            np.savetxt(outfile, mean_ndcg, delimiter=' ', fmt='%.4f')
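# evaluate() depends on a pr_rec(rel_docs, retr_docs) helper that returns the
# precision-recall curve, the average precision, and an 11-point interpolated
# precision curve of shape (1, 11). A minimal sketch of such a helper under
# the standard TREC definitions; the original implementation (and the
# perfect_reranking / ndcg helpers) may differ in detail.
import numpy as np

def pr_rec(rel_docs, retr_docs):
    rel_set = set(rel_docs)
    hits = 0
    sum_prec_at_hits = 0.0
    curve = []  # (recall, precision) after each retrieved document
    for rank, doc in enumerate(retr_docs, start=1):
        if doc in rel_set:
            hits += 1
            sum_prec_at_hits += hits / rank
        curve.append((hits / len(rel_set) if rel_set else 0.0, hits / rank))
    avg_precision = sum_prec_at_hits / len(rel_set) if rel_set else 0.0
    # Interpolated precision at recall r is the max precision at recall >= r.
    int_curve = np.zeros((1, 11))
    for j, level in enumerate(np.linspace(0.0, 1.0, 11)):
        precisions = [p for r, p in curve if r >= level]
        int_curve[0, j] = max(precisions) if precisions else 0.0
    return curve, avg_precision, int_curve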