def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=-100.0, ignore_noanswer=False, ignore_allanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer, ignore_allanswer=ignore_allanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) ''' print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") ''' rec1_se =-10 rec1_svm = -10 for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): #print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) if (rec1_se<-5): rec1_se = p_se rec1_svm = p_svm ''' print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X" ''' print "Table view" print " MRR MAP P@1" print "REF_FILE %5.2f %5.2f %5.2f" % (mrr_se, map_se*100, rec1_se) print "SVM %5.2f %5.2f %5.2f" % (mrr_svm, map_svm*100, rec1_svm)
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "acc\tf1\tMAP\tMRR\tAvgRec" print "%.4f %4.4f %4.4f %4.4f %4.4f" % (acc, f1, map_svm, mrr_svm, avg_acc1_svm)
def eval_search_engine(res_fname, format, th=10):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)
    mrr = metrics.mrr(ir, th)

    print("%13s" % "IR")
    print("MRRof1: %5.2f" % mrr)

    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print("REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f"
              % (i, r, i, a, i, a1, i, a2))

    print()
    print("REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)")
    print("ACC   - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions")
    print("AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)")
    print("AC2   - the absolute number of correct answers at @X")

def eval_search_engine(res_fname, format, th=50):
    ir = read_res_file(res_fname, format)

    # evaluate IR
    rec = metrics.recall_of_1(ir, th)
    acc = metrics.accuracy(ir, th)
    acc1 = metrics.accuracy1(ir, th)
    acc2 = metrics.accuracy2(ir, th)
    mrr = metrics.mrr(ir, th)
    # MAP
    map_ir = metrics.map(ir)

    print "%10s" % "IR"
    print "MRR: %5.2f" % mrr
    print "MAP: %5.2f" % map_ir

    for i, (r, a, a1, a2) in enumerate(zip(rec, acc, acc1, acc2), 1):
        print "REC-1@%02d: %6.2f  ACC@%02d: %6.2f  AC1@%02d: %6.2f  AC2@%02d: %4.0f" % (i, r, i, a, i, a1, i, a2)

    print
    print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)"
    print "ACC   - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions"
    print "AC1   - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)"
    print "AC2   - the absolute number of correct answers at @X"

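# The ACC/AC1/AC2 legends printed by the functions above describe three
# accuracy variants computed at every cutoff up to th. The sketch below shows
# one plausible reading of AC2 and ACC only, again assuming a mapping from
# question id to a ranked list of relevance booleans; it illustrates the
# printed definitions, not the actual metrics implementation.

def _sketch_accuracy2(results, th):
    # AC2: absolute number of correct answers found in the top i positions,
    # summed over all questions, for each cutoff i = 1..th.
    return [float(sum(sum(1 for rel in ranking[:i] if rel) for ranking in results.values()))
            for i in range(1, th + 1)]

def _sketch_accuracy(results, th):
    # ACC: correct answers at cutoff i normalized by the rank i and by the
    # total number of questions, as per the printed legend (here as a %).
    n_questions = len(results)
    return [100.0 * correct / (i * n_questions)
            for i, correct in enumerate(_sketch_accuracy2(results, th), 1)]
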
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=50, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir) map_svm = metrics.map(svm) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print "%13s %5s" %("IR", "SVM") print "MRR: %5.2f %5.2f" %(mrr_se, mrr_svm) print "MAP: %5.4f %5.4f" %(map_se, map_svm) print "AvgRec: %5.2f %5.2f" %(avg_acc1_ir, avg_acc1_svm) print "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SVM", "IR", "SVM", "IR", "SVM", "IR", "SVM") for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2) print print "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks were questions have at most one correct answer)" print "ACC - accuracy, i.e. number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" print "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" print "AC2 - the absolute number of correct answers at @X"
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print("") print("*** Official score (MAP for SYS): %5.4f" % (map_svm)) print("") print("") print("******************************") print("*** Classification results ***") print("******************************") print("") print("Acc = %5.4f" % (acc)) print("P = %5.4f" % (p)) print("R = %5.4f" % (r)) print("F1 = %5.4f" % (f1)) print("") print("") print("********************************") print("*** Detailed ranking results ***") print("********************************") print("") print("IR -- Score for the output of the IR system (baseline).") print("SYS -- Score for the output of the tested system.") print("") print("%13s %5s" % ("IR", "SYS")) print("MAP : %5.4f %5.4f" % (map_se, map_svm)) print("AvgRec: %5.4f %5.4f" % (avg_acc1_ir, avg_acc1_svm)) print("MRR : %6.2f %6.2f" % (mrr_se, mrr_svm)) print("%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate( zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" % (i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print() print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)" ) print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions" ) print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)" ) print("AC2 - the absolute number of correct answers at @X")
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * ( conf_matrix['true']['true'] + conf_matrix['false']['false']) / ( conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / ( conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) #print "" #print "*** Official score (MAP for SYS): %5.4f" %(map_svm) #print "" #print "" #print "******************************" #print "*** Classification results ***" #print "******************************" #print "" #print "Acc = %5.4f" %(acc) #print "P = %5.4f" %(p) #print "R = %5.4f" %(r) #print "F1 = %5.4f" %(f1) #print "" #print "" #print "********************************" #print "*** Detailed ranking results ***" #print "********************************" #print "" #print "IR -- Score for the output of the IR system (baseline)." #print "SYS -- Score for the output of the tested system." #print "" #print "%13s %5s" %("IR", "SYS") #print "MAP : %5.4f %5.4f" %(map_se, map_svm) #print "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm) #print "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm) print "MAP : %5.4f\tMRR : %5.4f\tAvgRec: %5.4f" % (map_svm, mrr_svm, avg_acc1_svm) #print "Acc : %5.4f" %(acc) #print "P : %5.4f" %(p) #print "R : %5.4f" %(r) #print "F1 : %5.4f" %(f1) """
def eval_reranker(res_fname="svm.test.res", pred_fname="svm.train.pred", format="trec", th=10, verbose=False, reranking_th=0.0, ignore_noanswer=False): ir, svm, conf_matrix = read_res_pred_files(res_fname, pred_fname, format, verbose, reranking_th=reranking_th, ignore_noanswer=ignore_noanswer) # Calculate standard P, R, F1, Acc acc = 1.0 * (conf_matrix['true']['true'] + conf_matrix['false']['false']) / (conf_matrix['true']['true'] + conf_matrix['false']['false'] + conf_matrix['true']['false'] + conf_matrix['false']['true']) p = 0 if (conf_matrix['true']['true'] + conf_matrix['false']['true']) > 0: p = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['false']['true']) r = 0 if (conf_matrix['true']['true'] + conf_matrix['true']['false']) > 0: r = 1.0 * (conf_matrix['true']['true']) / (conf_matrix['true']['true'] + conf_matrix['true']['false']) f1 = 0 if (p + r) > 0: f1 = 2.0 * p * r / (p + r) # evaluate IR prec_se = metrics.recall_of_1(ir, th) acc_se = metrics.accuracy(ir, th) acc_se1 = metrics.accuracy1(ir, th) acc_se2 = metrics.accuracy2(ir, th) # evaluate SVM prec_svm = metrics.recall_of_1(svm, th) acc_svm = metrics.accuracy(svm, th) acc_svm1 = metrics.accuracy1(svm, th) acc_svm2 = metrics.accuracy2(svm, th) mrr_se = metrics.mrr(ir, th) mrr_svm = metrics.mrr(svm, th) map_se = metrics.map(ir, th) map_svm = metrics.map(svm, th) avg_acc1_svm = metrics.avg_acc1(svm, th) avg_acc1_ir = metrics.avg_acc1(ir, th) print ("") print ("*** Official score (MAP for SYS): %5.4f" %(map_svm)) print ("") print ("") print( "******************************") print( "*** Classification results ***") print( "******************************") print( "") print( "Acc = %5.4f" %(acc)) print( "P = %5.4f" %(p)) print( "R = %5.4f" %(r)) print( "F1 = %5.4f" %(f1)) print( "") print( "") print( "********************************") print( "*** Detailed ranking results ***") print( "********************************") print( "") print( "IR -- Score for the output of the IR system (baseline).") print( "SYS -- Score for the output of the tested system.") print( "") print( "%13s %5s" %("IR", "SYS")) print( "MAP : %5.4f %5.4f" %(map_se, map_svm)) print( "AvgRec: %5.4f %5.4f" %(avg_acc1_ir, avg_acc1_svm)) print( "MRR : %6.2f %6.2f" %(mrr_se, mrr_svm)) print( "%16s %6s %14s %6s %14s %6s %12s %4s" % ("IR", "SYS", "IR", "SYS", "IR", "SYS", "IR", "SYS")) for i, (p_se, p_svm, a_se, a_svm, a_se1, a_svm1, a_se2, a_svm2) in enumerate(zip(prec_se, prec_svm, acc_se, acc_svm, acc_se1, acc_svm1, acc_se2, acc_svm2), 1): print( "REC-1@%02d: %6.2f %6.2f ACC@%02d: %6.2f %6.2f AC1@%02d: %6.2f %6.2f AC2@%02d: %4.0f %4.0f" %(i, p_se, p_svm, i, a_se, a_svm, i, a_se1, a_svm1, i, a_se2, a_svm2)) print( "REC-1 - percentage of questions with at least 1 correct answer in the top @X positions (useful for tasks where questions have at most one correct answer)") print( "ACC - accuracy, i.e., number of correct answers retrieved at rank @X normalized by the rank and the total number of questions") print( "AC1 - the number of correct answers at @X normalized by the number of maximum possible answers (perfect re-ranker)") print( "AC2 - the absolute number of correct answers at @X") return map_svm