def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=-100.0, ignore_noanswer=False, ignore_allanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer, ignore_allanswer=ignore_allanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) ''' print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") ''' rec1_se =-10 rec1_svm = -10 for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): #print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) if (rec1_se<-5): rec1_se = p_se rec1_svm = p_svm ''' print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X" ''' print "Table view" print " MRR MAP P@1" print "REF_FILE %5.2f %5.2f %5.2f" % (mrr_se, map_se*100, rec1_se) print "SVM %5.2f %5.2f %5.2f" % (mrr_svm, map_svm*100, rec1_svm)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "acc\tf1\tMAP\tMRR\tAvgRec" print "%.4f %4.4f %4.4f %4.4f %4.4f" % (acc, f1, map_svm, mrr_svm, avg_acc1_svm)
def eval_search_engine(res_fname, format, th=10):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)

    mrr = metrics.mrr(ir, th)

    print("%13s" % "IR")
    print("MRRof1: %5.2f" % mrr)

    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print("REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" %
              (i, r, i, a, i, a1, i, a2))
    print()
    print("REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)")
    print("ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions")
    print("AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)")
    print("AC2   - the absolute number of correct answers at @X")
def eval_search_engine(res_fname, format, th=50):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)

    mrr = metrics.mrr(ir, th)
    # MAP
    map_ir = metrics.map(ir)

    print "%10s" % "IR"
    print "MRR: %5.2f" % mrr
    print "MAP: %5.2f" % map_ir

    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print "REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" % (i, r, i, a, i, a1, i, a2)
    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"
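# The metrics module itself is not shown in this file. Purely as an
# illustration of the quantities reported above, and under the assumption that
# each query maps to a ranked list of boolean relevance labels (the actual
# internal representation used by metrics may differ), MRR@th and MAP could be
# computed as:
def _mrr_sketch(ranked, th):
    # mean of 1/rank of the first relevant item within the top th, in percent
    total = 0.0
    for labels in ranked.values():
        for rank, rel in enumerate(labels[:th], 1):
            if rel:
                total += 1.0 / rank
                break
    return total / len(ranked) * 100

def _map_sketch(ranked):
    # mean over queries of average precision over the relevant items
    ap_sum = 0.0
    for labels in ranked.values():
        hits, precisions = 0, []
        for rank, rel in enumerate(labels, 1):
            if rel:
                hits += 1
                precisions.append(hits / float(rank))
        ap_sum += sum(precisions) / len(precisions) if precisions else 0.0
    return ap_sum / len(ranked)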
def get_cv_evaluation_results(qid_aid_label_list, y_pred):
    predictions_dict = get_cv_ranked_predictions_dict(qid_aid_label_list, y_pred)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) print print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X"
def get_evaluation_results(df, y_pred, skip_all_positives_and_all_negatives=True):
    predictions_dict = get_ranked_predictions_dict(
        df, y_pred,
        skip_all_positives_and_all_negatives=skip_all_positives_and_all_negatives)
    logging.debug("Num of questions: %d" % (len(predictions_dict)))

    mrr_score = m.mrr(predictions_dict, 1000)
    map_score = m.map(predictions_dict) * 100
    p1_score = m.recall_of_1(predictions_dict, 1000)[0]

    return mrr_score, map_score, p1_score
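# Hypothetical usage of the helper above. get_ranked_predictions_dict and the
# layout of df/y_pred are defined elsewhere in this codebase; the variable
# names below are assumptions for illustration only:
#
#   y_pred = classifier.decision_function(X_test)
#   mrr_score, map_score, p1_score = get_evaluation_results(df_test, y_pred)
#   logging.info("MRR: %.2f  MAP: %.2f  P@1: %.2f", mrr_score, map_score, p1_score)
#
# Note that map_score is already scaled by 100, while mrr_score and p1_score
# are returned as produced by the metrics module.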
def stats_cv(path=".", format="trec", prefix="svm", th=50, verbose=False): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] oracle_mrrs = [] oracle_maps = [] oracle_recs1 = [] num_folds = 0 print "%13s %5s %7s %7s" %("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn("Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.test.res" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % prefix) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) or 1 mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se)*100/mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) or 1 map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se)*100/map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (map_se, map_svm, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] or 1 rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se)*100/rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % (rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 ''' mrr_oracle = metrics.oracle_mrr(ir, th) map_oracle = metrics.oracle_map(ir) prec_oracle = metrics.oracle_precision(ir, th)[0] rec1_oracle = metrics.oracle_recall_of_1(ir, th)[0] oracle_mrrs.append(mrr_oracle) oracle_maps.append(map_oracle) oracle_recs1.append(rec1_oracle) print "Oracle MRR: %5.2f, Oracle MAP: %5.2f, Oracle prec: %5.2f, Oracle rec@1: %5.2f" % (mrr_oracle, map_oracle, prec_oracle, rec1_oracle) ''' # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) #oracle_avg_mrr, std_oracle_avg_mrr = mean_and_std(oracle_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) #oracle_avg_map, std_oracle_avg_map = mean_and_std(oracle_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std(abs_recalls) # 
absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std(rel_recalls) # relative #oracle_avg_rec1, std_oracle_avg_rec1 = mean_and_std(oracle_recs1) FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" #ORACLE_FMT = u"Oracle MRR: %5.2f \u00B1 %4.2f, Oracle MAP: %5.2f \u00B1 %4.2f, Oracle P@1: %5.2f \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" %("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se, std_map_se, avg_map_svm, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1)
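# mean_and_std is assumed to be a small helper defined elsewhere in this
# codebase. A minimal sketch consistent with how it is used above (mean and
# population standard deviation over the per-fold scores); the name suffix
# marks it as illustrative rather than the actual implementation:
import math

def _mean_and_std_sketch(values):
    mean = sum(values) / float(len(values))
    variance = sum((v - mean) ** 2 for v in values) / float(len(values))
    return mean, math.sqrt(variance)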
def stats_cv(path=".", format="trec", prefix="svm", th=50, suf="", verbose=False, truth_file=None, ignore_noanswer=False, cut_truth_map_at_N=None): mrrs_se = [] mrrs_svm = [] abs_mrrs = [] rel_mrrs = [] maps_se = [] maps_svm = [] abs_maps = [] rel_maps = [] recalls1_se = [] recalls1_svm = [] abs_recalls = [] rel_recalls = [] num_folds = 0 truth = read_truth_file(truth_file, format, cut_truth_map_at_N) print "%13s %5s %7s %7s" % ("IR", "SVM", "(abs)", "(rel)") for fold in sorted(os.listdir(path)): currentFold = os.path.join(path, fold) if not os.path.isdir(currentFold): continue if not fold.startswith("fold"): logging.warn( "Directories containing CV folds should start with 'fold'") continue print fold # Relevancy file res_fname = os.path.join(currentFold, "%s.relevancy" % prefix) if not os.path.exists(res_fname): logging.error("Relevancy file not found: %s", res_fname) sys.exit(1) # Predictions file pred_fname = os.path.join(currentFold, "%s.pred" % (prefix + suf)) if not os.path.exists(pred_fname): logging.error("SVM prediction file not found: %s", pred_fname) sys.exit(1) try: ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, ignore_noanswer=ignore_noanswer, truth_map=truth) except: logging.error("Failed to process input files: %s %s", res_fname, pred_fname) logging.error("Check that the input file format is correct") sys.exit(1) # MRR mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) mrrs_se.append(mrr_se) mrrs_svm.append(mrr_svm) # improvement abs_mrr_diff = mrr_svm - mrr_se rel_mrr_diff = (mrr_svm - mrr_se) * 100 / mrr_se abs_mrrs.append(abs_mrr_diff) rel_mrrs.append(rel_mrr_diff) print "MRR: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( mrr_se, mrr_svm, abs_mrr_diff, rel_mrr_diff) # MAP map_se = metrics.map(ir) map_svm = metrics.map(svm) maps_se.append(map_se) maps_svm.append(map_svm) # improvement abs_map_diff = map_svm - map_se rel_map_diff = (map_svm - map_se) * 100 / map_se abs_maps.append(abs_map_diff) rel_maps.append(rel_map_diff) print "MAP: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( map_se * 100, map_svm * 100, abs_map_diff, rel_map_diff) # Recall-of-1@1 rec_se = metrics.recall_of_1(ir, th)[0] rec_svm = metrics.recall_of_1(svm, th)[0] recalls1_se.append(rec_se) recalls1_svm.append(rec_svm) # improvement abs_rec_diff = rec_svm - rec_se rel_rec_diff = (rec_svm - rec_se) * 100 / rec_se abs_recalls.append(abs_rec_diff) rel_recalls.append(rel_rec_diff) print "P@1: %5.2f %5.2f %+6.2f%% %+6.2f%%" % ( rec_se, rec_svm, abs_rec_diff, rel_rec_diff) num_folds += 1 # mrrs avg_mrr_se, std_mrr_se = mean_and_std(mrrs_se) avg_mrr_svm, std_mrr_svm = mean_and_std(mrrs_svm) avg_abs_impr_mrr, std_abs_impr_mrr = mean_and_std(abs_mrrs) avg_rel_impr_mrr, std_rel_impr_mrr = mean_and_std(rel_mrrs) # maps avg_map_se, std_map_se = mean_and_std(maps_se) avg_map_svm, std_map_svm = mean_and_std(maps_svm) avg_abs_impr_map, std_abs_impr_map = mean_and_std(abs_maps) avg_rel_impr_map, std_rel_impr_map = mean_and_std(rel_maps) # recall avg_rec1_se, std_rec1_se = mean_and_std(recalls1_se) # se avg_rec1_svm, std_rec1_svm = mean_and_std(recalls1_svm) # svm avg_abs_impr_rec1, std_abs_impr_rec1 = mean_and_std( abs_recalls) # absolute avg_rel_impr_rec1, std_rel_impr_rec1 = mean_and_std( rel_recalls) # relative FMT = u"%3s: %5.2f \u00B1 %4.2f %5.2f \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f %+6.2f%% \u00B1 %4.2f" print print "Averaged over %s folds" % num_folds print "%17s %12s %14s %14s" % ("IR", "SVM", "(abs)", "(rel)") print FMT % ("MRR", avg_mrr_se, std_mrr_se, avg_mrr_svm, std_mrr_svm, avg_abs_impr_mrr, 
std_abs_impr_mrr, avg_rel_impr_mrr, std_rel_impr_mrr) print FMT % ("MAP", avg_map_se * 100, std_map_se, avg_map_svm * 100, std_map_svm, avg_abs_impr_map, std_abs_impr_map, avg_rel_impr_map, std_rel_impr_map) print FMT % ("P@1", avg_rec1_se, std_rec1_se, avg_rec1_svm, std_rec1_svm, avg_abs_impr_rec1, std_abs_impr_rec1, avg_rel_impr_rec1, std_rel_impr_rec1) print "Table view" print " MRR MAP P@1" print u"IR %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_se, std_mrr_se, avg_map_se * 100, std_map_se * 100, avg_rec1_se, std_rec1_se) print u"SVM %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f %5.2f\u00B1%4.2f" % ( avg_mrr_svm, std_mrr_svm, avg_map_svm * 100, std_map_svm * 100, avg_rec1_svm, std_rec1_svm)
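# Expected on-disk layout for the cross-validation variant above, derived from
# its path handling (fold directories must start with "fold"; the concrete
# fold names and the top-level directory below are illustrative):
#
#   cv_output/
#     fold-01/  svm.relevancy  svm.pred
#     fold-02/  svm.relevancy  svm.pred
#     ...
#
# A typical call would then be, e.g.:
#   stats_cv(path="cv_output", format="trec", prefix="svm", th=50)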
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print("") print("*** Official score (MAP for SYS): %5.4f" % (map_svm)) print("") print("") print("******************************") print("*** Classification results ***") print("******************************") print("") print("Acc = %5.4f" % (acc)) print("P = %5.4f" % (p)) print("R = %5.4f" % (r)) print("F1 = %5.4f" % (f1)) print("") print("") print("********************************") print("*** Detailed ranking results ***") print("********************************") print("") print("IR -- Score for the output of the IR system (baseline).") print("SYS -- Score for the output of the tested system.") print("") print("%13s %5s" % ("IR", "SYS")) print("MAP : %5.4f %5.4f" % (map_se, map_svm)) print("AvgRec: %5.4f %5.4f" % (avg_acc1_ir, avg_acc1_svm)) print("MRR : %6.2f %6.2f" % (mrr_se, mrr_svm)) print("%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate( zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print() print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)" ) print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" ) print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" ) print("AC2 - the absolute number of correct answers at @X")
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) #print "" #print "*** Official score (MAP for SYS): %5.4f" %(map_svm) #print "" #print "" #print "******************************" #print "*** Classification results ***" #print "******************************" #print "" #print "Acc = %5.4f" %(acc) #print "P = %5.4f" %(p) #print "R = %5.4f" %(r) #print "F1 = %5.4f" %(f1) #print "" #print "" #print "********************************" #print "*** Detailed ranking results ***" #print "********************************" #print "" #print "IR -- Score for the output of the IR system (baseline)." #print "SYS -- Score for the output of the tested system." #print "" #print "%13s %5s" %("IR", "SYS") #print "MAP : %5.4f %5.4f" %(map_se, map_svm) #print "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm) #print "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm) print "MAP : %5.4f\tMRR : %5.4f\tAvgRec: %5.4f" % (map_svm, mrr_svm, avg_acc1_svm) #print "Acc : %5.4f" %(acc) #print "P : %5.4f" %(p) #print "R : %5.4f" %(r) #print "F1 : %5.4f" %(f1) """
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print ("") print ("*** Official score (MAP for SYS): %5.4f" %(map_svm)) print ("") print ("") print( "******************************") print( "*** Classification results ***") print( "******************************") print( "") print( "Acc = %5.4f" %(acc)) print( "P = %5.4f" %(p)) print( "R = %5.4f" %(r)) print( "F1 = %5.4f" %(f1)) print( "") print( "") print( "********************************") print( "*** Detailed ranking results ***") print( "********************************") print( "") print( "IR -- Score for the output of the IR system (baseline).") print( "SYS -- Score for the output of the tested system.") print( "") print( "%13s %5s" %("IR", "SYS")) print( "MAP : %5.4f %5.4f" %(map_se, map_svm)) print( "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm)) print( "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm)) print( "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)") print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions") print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)") print( "AC2 - the absolute number of correct answers at @X") return map_svm