def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    pairs = [('rpd_wcr04_tf_1', 'rpd_wcr0405_tf_1'),
             ('rpd_wcr04_tf_2', 'rpd_wcr0405_tf_2'),
             ('rpd_wcr04_tf_3', 'rpd_wcr0405_tf_3'),
             ('rpd_wcr04_tf_4', 'rpd_wcr0405_tf_4'),
             ('rpd_wcr04_tf_5', 'rpd_wcr0405_tf_5')]

    # Compute the Effect Ratio (ER) once per baseline/advanced pair and
    # collect the values for P_10, nDCG, and MAP.
    effect_ratios = [rpd_eval.er(run_b_score=runs_rpd[base]['scores'],
                                 run_a_score=runs_rpd[adv]['scores'])
                     for base, adv in pairs]

    df_content = {measure: [er[measure] for er in effect_ratios]
                  for measure in ['P_10', 'ndcg', 'map']}

    df = pd.DataFrame(df_content, index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5'])

    orig_val = 1
    ax = df.plot.bar(rot=0)
    ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black')
    ax.annotate(' ', (3, orig_val), color='black')
    ax.set_xlabel("Reproduced Run")
    ax.set_ylabel("Effect Ratio (ER)")
    ax.get_figure().savefig('data/plots/rpd_er.pdf', format='pdf', bbox_inches='tight')
    plt.show()
def test_rpd_ttest_path_param():
    pval = rpd_eval.ttest()
    assert 'baseline' in pval.keys()
    assert 'advanced' in pval.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _pval.keys()
    # pick a few samples here since nan comparisons cause problems in combination with assert
    assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get('ndcg')
    assert pval.get('baseline').get('P_10') == _pval.get('baseline').get('P_10')
    assert pval.get('baseline').get('map') == _pval.get('baseline').get('map')

    _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt',
                            run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _pval.keys()
    # pick a few samples here since nan comparisons cause problems in combination with assert
    assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get('ndcg')
    assert pval.get('advanced').get('P_10') == _pval.get('advanced').get('P_10')
    assert pval.get('advanced').get('map') == _pval.get('advanced').get('map')
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])
            info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])

    baseline_runs = ['rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3',
                     'rpd_wcr04_tf_4', 'rpd_wcr04_tf_5']
    advanced_runs = ['rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3',
                     'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5']
    cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']

    df_content = {}
    for run_name in baseline_runs:
        df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
                                for co in cutoffs]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf', format='pdf', bbox_inches='tight')
    plt.show()

    df_content = {}
    for run_name in advanced_runs:
        df_content[run_name] = [runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
                                for co in cutoffs]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf', format='pdf', bbox_inches='tight')
    plt.show()
def test_rmse_path_param():
    rmse = rpd_eval.rmse()
    assert 'baseline' in rmse.keys()
    assert 'advanced' in rmse.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _rmse.keys()
    assert rmse.get('baseline') == _rmse.get('baseline')

    _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt',
                           run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _rmse.keys()
    assert rmse.get('advanced') == _rmse.get('advanced')
def test_ktu_path_param():
    ktu = rpd_eval.ktau_union()
    assert 'baseline' in ktu.keys()
    assert 'advanced' in ktu.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _ktu.keys()
    assert ktu.get('baseline') == _ktu.get('baseline')

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt',
                                run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _ktu.keys()
    assert ktu.get('advanced') == _ktu.get('advanced')
def test_rpd_dri_path_param():
    dri = rpd_eval.dri()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _dri = _rpd_eval.dri(run_b_path='./example/rpd_b.txt',
                         run_a_path='./example/rpd_a.txt')
    # pick a few samples here since nan comparisons cause problems in combination with assert
    assert dri.get('ndcg') == _dri.get('ndcg')
    assert dri.get('P_10') == _dri.get('P_10')
    assert dri.get('map') == _dri.get('map')
def test_ttest_with_identical_score_distributions():
    rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                            run_b_orig_path='./example/orig_b.txt',
                            run_a_orig_path='./example/orig_a.txt',
                            run_b_rep_path='./example/orig_b.txt',
                            run_a_rep_path='./example/orig_a.txt')
    rpd_eval.trim()
    rpd_eval.evaluate()

    ttest = rpd_eval.ttest()

    # comparing a run against itself must yield a p-value of 1.0 for every measure
    pvals = list(filter(lambda x: x == 1.0, ttest.get('baseline').values()))
    assert len(pvals) == len(ttest.get('baseline').keys())
    pvals = list(filter(lambda x: x == 1.0, ttest.get('advanced').values()))
    assert len(pvals) == len(ttest.get('advanced').keys())
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    average_retrieval_performance(rpd_eval.run_b_orig_score,
                                  {
                                      'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),
                                      'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),
                                      'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),
                                      'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),
                                      'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),
                                  },
                                  measures=['P_10', 'ndcg', 'bpref', 'map'],
                                  xlabel='Reproduced run (wcr04)',
                                  ylabel='Score',
                                  outfile='data/plots/rpd_b_arp.pdf')

    average_retrieval_performance(rpd_eval.run_a_orig_score,
                                  {
                                      'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),
                                      'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),
                                      'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),
                                      'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),
                                      'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),
                                  },
                                  measures=['P_10', 'ndcg', 'bpref', 'map'],
                                  xlabel='Reproduced run (wcr0405)',
                                  ylabel='Score',
                                  outfile='data/plots/rpd_a_arp.pdf')
def main():
    cutoffs = [1000, 100, 50, 20, 10, 5]

    # BASELINE
    for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)
        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)

        for cutoff in cutoffs:
            rpd_eval.trim(cutoff)
            rpd_eval.trim(cutoff, info['run'])
            info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(list(runs_rpd.keys())[::2], list(runs_rpd.values())[::2]):
        df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf', format='pdf', bbox_inches='tight')
    plt.show()

    # ADVANCED
    for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)
        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)

        for cutoff in cutoffs:
            rpd_eval.trim(cutoff)
            rpd_eval.trim(cutoff, info['run'])
            info['ktu_' + str(cutoff)] = arp(rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(list(runs_rpd.keys())[1::2], list(runs_rpd.values())[1::2]):
        df_content[run_name] = [info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff) for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf', format='pdf', bbox_inches='tight')
    plt.show()
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    # Effect Ratio (ER) and Delta Relative Improvement (DRI) for each
    # baseline/advanced pair of reproduced runs.
    dri_er = {
        'wcr_tf_{}'.format(idx): {
            'er': rpd_eval.er(runs_rpd['rpd_wcr04_tf_{}'.format(idx)]['scores'],
                              runs_rpd['rpd_wcr0405_tf_{}'.format(idx)]['scores']),
            'dri': rpd_eval.dri(runs_rpd['rpd_wcr04_tf_{}'.format(idx)]['scores'],
                                runs_rpd['rpd_wcr0405_tf_{}'.format(idx)]['scores'])
        }
        for idx in range(1, 6)
    }

    measures = ['P_10', 'map', 'ndcg']
    marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Effect Ratio (ER)')
    ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')

    for measure, mk in zip(measures, marker_color):
        ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],
                 [dri_er[r]['dri'][measure] for r in dri_er.keys()],
                 marker=mk[0], color=mk[1], linestyle='None', label=measure)

    ax1.tick_params(axis='y', labelcolor='k')
    fig.tight_layout()
    plt.axhline(0, color='grey')
    plt.axvline(1, color='grey')
    plt.legend()
    plt.title('Reproducibility')
    plt.savefig('data/plots/rpd_dri_vs_er.pdf', format='pdf', bbox_inches='tight')
    plt.show()
import pytest
from repro_eval.Evaluator import RpdEvaluator
from repro_eval.config import ERR_MSG

rpd_eval = RpdEvaluator(qrel_orig_path=None,
                        run_b_orig_path=None,
                        run_a_orig_path=None,
                        run_b_rep_path=None,
                        run_a_rep_path=None)


def test_ktu(capfd):
    assert None is rpd_eval.ktau_union()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_rbo(capfd):
    assert None is rpd_eval.rbo()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_rmse(capfd):
    assert None is rpd_eval.rmse()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_er(capfd):
    assert None is rpd_eval.er()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])
import pytest
from repro_eval.Evaluator import RpdEvaluator, RplEvaluator
import numpy as np

rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                        run_b_orig_path='./example/orig_b.txt',
                        run_a_orig_path='./example/orig_a.txt',
                        run_b_rep_path='./example/rpd_b.txt',
                        run_a_rep_path='./example/rpd_a.txt')
rpd_eval.trim()
rpd_eval.evaluate()


def test_ktu_path_param():
    ktu = rpd_eval.ktau_union()
    assert 'baseline' in ktu.keys()
    assert 'advanced' in ktu.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _ktu.keys()
    assert ktu.get('baseline') == _ktu.get('baseline')

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt',
                                run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _ktu.keys()
    assert ktu.get('advanced') == _ktu.get('advanced')
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=RPD_B,
                            run_a_rep_path=RPD_A)

    rpd_eval.trim()
    rpd_eval.evaluate()

    # KTU
    ktau = rpd_eval.ktau_union()
    print("Kendall's tau Union (KTU)")
    print('------------------------------------------------------------------')
    for topic, value in ktau.get('baseline').items():
        print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic))
    print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')), arp(ktau.get('advanced')))

    # RBO
    rbo = rpd_eval.rbo()
    print("Rank-biased Overlap (RBO)")
    print('------------------------------------------------------------------')
    for topic, value in rbo.get('baseline').items():
        print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic))
    print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')), arp(rbo.get('advanced')))

    # RMSE
    rmse = rpd_eval.rmse()
    print("Root mean square error (RMSE)")
    print('------------------------------------------------------------------')
    for measure, value in rmse.get('baseline').items():
        print_base_adv(measure, 'RMSE', value, rmse.get('advanced').get(measure))

    # ER
    print("Effect ratio (ER)")
    print('------------------------------------------------------------------')
    er = rpd_eval.er()
    for measure, value in er.items():
        print_simple_line(measure, 'ER', value)

    # DRI
    print("Delta Relative Improvement (DRI)")
    print('------------------------------------------------------------------')
    dri = rpd_eval.dri()
    for measure, value in dri.items():
        print_simple_line(measure, 'DRI', value)

    # ttest
    pvals = rpd_eval.ttest()
    print("Two-tailed paired t-test (p-value)")
    print('------------------------------------------------------------------')
    for measure, value in pvals.get('baseline').items():
        print_base_adv(measure, 'PVAL', value, pvals.get('advanced').get(measure))