Code Example #1
File: rpd_er.py Project: irgroup/repro_eval
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    pairs = [('rpd_wcr04_tf_1', 'rpd_wcr0405_tf_1'),
             ('rpd_wcr04_tf_2', 'rpd_wcr0405_tf_2'),
             ('rpd_wcr04_tf_3', 'rpd_wcr0405_tf_3'),
             ('rpd_wcr04_tf_4', 'rpd_wcr0405_tf_4'),
             ('rpd_wcr04_tf_5', 'rpd_wcr0405_tf_5')]

    # Compute the effect ratio once per run pair, then read each measure
    # from the cached results.
    er_scores = [rpd_eval.er(run_b_score=runs_rpd[base]['scores'],
                             run_a_score=runs_rpd[adv]['scores'])
                 for base, adv in pairs]

    df_content = {
        'P_10': [er['P_10'] for er in er_scores],
        'ndcg': [er['ndcg'] for er in er_scores],
        'map': [er['map'] for er in er_scores],
    }

    df = pd.DataFrame(df_content,
                      index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5'])
    orig_val = 1
    ax = df.plot.bar(rot=0)
    ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black')
    ax.annotate(' ', (3, orig_val), color='black')
    ax.set_xlabel("Reproduced Run")
    ax.set_ylabel("Effect Ratio (ER)")
    ax.get_figure().savefig('data/plots/rpd_er.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
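The main() functions in this listing (Code Examples #1, #3, #8, #9, #10, #13) rely on module-level setup that the snippets omit: the imports, the QREL/ORIG_B/ORIG_A/RPD_B/RPD_A path constants, and the runs_rpd mapping. A minimal sketch of such a prologue follows; the concrete paths and run entries are illustrative placeholders, and the trim/arp/printing helpers are assumed to live in repro_eval.util:

import pytrec_eval
import pandas as pd
import matplotlib.pyplot as plt
from repro_eval.Evaluator import RpdEvaluator
# Assumed helper module; adjust if the helpers live elsewhere.
from repro_eval.util import trim, arp, print_base_adv, print_simple_line

# Placeholder paths; substitute the actual qrel and run files.
QREL = 'data/qrels/core17.txt'
ORIG_B = 'data/runs/orig_b.txt'  # original baseline run
ORIG_A = 'data/runs/orig_a.txt'  # original advanced run
RPD_B = 'data/runs/rpd_b.txt'    # reproduced baseline run (Code Example #13)
RPD_A = 'data/runs/rpd_a.txt'    # reproduced advanced run (Code Example #13)

# One entry per reproduced run; 'run' and 'scores' are filled in by main().
runs_rpd = {
    'rpd_wcr04_tf_1': {'path': 'data/runs/rpd_wcr04_tf_1.txt'},
    'rpd_wcr0405_tf_1': {'path': 'data/runs/rpd_wcr0405_tf_1.txt'},
    # ... rpd_wcr04_tf_2 through tf_5 and rpd_wcr0405_tf_2 through tf_5
}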
Code Example #2
File: test_path_param.py Project: irgroup/repro_eval
def test_rpd_ttest_path_param():
    pval = rpd_eval.ttest()
    assert 'baseline' in pval.keys()
    assert 'advanced' in pval.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _pval.keys()
    # spot-check a few measures; NaN values do not compare equal, so a full-dict assert would fail
    assert pval.get('baseline').get('ndcg') == _pval.get('baseline').get(
        'ndcg')
    assert pval.get('baseline').get('P_10') == _pval.get('baseline').get(
        'P_10')
    assert pval.get('baseline').get('map') == _pval.get('baseline').get('map')

    _pval = _rpd_eval.ttest(run_b_path='./example/rpd_b.txt',
                            run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _pval.keys()
    # spot-check a few measures; NaN values do not compare equal, so a full-dict assert would fail
    assert pval.get('advanced').get('ndcg') == _pval.get('advanced').get(
        'ndcg')
    assert pval.get('advanced').get('P_10') == _pval.get('advanced').get(
        'P_10')
    assert pval.get('advanced').get('map') == _pval.get('advanced').get('map')
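The module-level rpd_eval fixture referenced at the top of this test is the one constructed in Code Example #12, which shows the prologue of the same test_path_param.py file.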
Code Example #3
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])
            info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])

    baseline_runs = [
        'rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4',
        'rpd_wcr04_tf_5'
    ]
    advanced_runs = [
        'rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3',
        'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5'
    ]
    cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']

    df_content = {}
    for run_name in baseline_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    df_content = {}
    for run_name in advanced_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
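Both loops above index ['rmse']['baseline'] even for the advanced runs. This follows from how the scores are computed in the first loop: every reproduced run is passed to rmse() through the run_b_score parameter, so the returned dictionary only carries a 'baseline' entry, regardless of whether the run itself is a baseline or an advanced run.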
Code Example #4
File: test_path_param.py Project: irgroup/repro_eval
def test_rmse_path_param():
    rmse = rpd_eval.rmse()
    assert 'baseline' in rmse.keys()
    assert 'advanced' in rmse.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _rmse.keys()
    assert rmse.get('baseline') == _rmse.get('baseline')

    _rmse = _rpd_eval.rmse(run_b_path='./example/rpd_b.txt',
                           run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _rmse.keys()
    assert rmse.get('advanced') == _rmse.get('advanced')
Code Example #5
File: test_path_param.py Project: irgroup/repro_eval
def test_ktu_path_param():
    ktu = rpd_eval.ktau_union()
    assert 'baseline' in ktu.keys()
    assert 'advanced' in ktu.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _ktu.keys()
    assert ktu.get('baseline') == _ktu.get('baseline')

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt',
                                run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _ktu.keys()
    assert ktu.get('advanced') == _ktu.get('advanced')
Code Example #6
File: test_path_param.py Project: irgroup/repro_eval
def test_rpd_dri_path_param():
    dri = rpd_eval.dri()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _dri = _rpd_eval.dri(run_b_path='./example/rpd_b.txt',
                         run_a_path='./example/rpd_a.txt')

    # spot-check a few measures; NaN values do not compare equal, so a full-dict assert would fail
    assert dri.get('ndcg') == _dri.get('ndcg')
    assert dri.get('P_10') == _dri.get('P_10')
    assert dri.get('map') == _dri.get('map')
Code Example #7
File: test_ttest.py Project: irgroup/repro_eval
def test_ttest_with_identical_score_distributions():
    rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                            run_b_orig_path='./example/orig_b.txt',
                            run_a_orig_path='./example/orig_a.txt',
                            run_b_rep_path='./example/orig_b.txt',
                            run_a_rep_path='./example/orig_a.txt')

    rpd_eval.trim()
    rpd_eval.evaluate()

    ttest = rpd_eval.ttest()

    pvals = list(filter(lambda x: x == 1.0, ttest.get('baseline').values()))
    assert len(pvals) == len(ttest.get('baseline').keys())

    pvals = list(filter(lambda x: x == 1.0, ttest.get('advanced').values()))
    assert len(pvals) == len(ttest.get('advanced').keys())
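Since the "reproduced" runs here are the original run files themselves, every per-topic score difference is zero; the test pins down that repro_eval reports a p-value of 1.0 for every measure in this degenerate case, for both the baseline and the advanced run.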
Code Example #8
File: rpd_arp.py Project: irgroup/repro_eval
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    average_retrieval_performance(
        rpd_eval.run_b_orig_score, {
            'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr04)',
        ylabel='Score',
        outfile='data/plots/rpd_b_arp.pdf')

    average_retrieval_performance(
        rpd_eval.run_a_orig_score, {
            'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr0405)',
        ylabel='Score',
        outfile='data/plots/rpd_a_arp.pdf')
Code Example #9
File: rpd_ktu.py Project: irgroup/repro_eval
def main():
    cutoffs = [1000, 100, 50, 20, 10, 5]

    # BASELINE
    for run_name, info in zip(
            list(runs_rpd.keys())[::2],
            list(runs_rpd.values())[::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)

        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            for cutoff in cutoffs:
                rpd_eval.trim(cutoff)
                rpd_eval.trim(cutoff, info['run'])
                info['ktu_' + str(cutoff)] = arp(
                    rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(
            list(runs_rpd.keys())[::2],
            list(runs_rpd.values())[::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    # ADVANCED
    for run_name, info in zip(
            list(runs_rpd.keys())[1::2],
            list(runs_rpd.values())[1::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)

        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            for cutoff in cutoffs:
                rpd_eval.trim(cutoff)
                rpd_eval.trim(cutoff, info['run'])
                info['ktu_' + str(cutoff)] = arp(
                    rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(
            list(runs_rpd.keys())[1::2],
            list(runs_rpd.values())[1::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
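Both halves of this script rely on the ordering of runs_rpd: entries are assumed to alternate baseline (wcr04) and advanced (wcr0405) runs, so the [::2] slice selects the baseline runs and [1::2] the advanced ones. Note also that ktau_union() receives each run as its first (baseline) argument, which is why the 'baseline' key is read in the ADVANCED section as well.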
Code Example #10
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    # ER and DRI for each of the five tf run pairs, keyed by variant name.
    dri_er = {}
    for i in range(1, 6):
        base_scores = runs_rpd['rpd_wcr04_tf_{}'.format(i)]['scores']
        adv_scores = runs_rpd['rpd_wcr0405_tf_{}'.format(i)]['scores']
        dri_er['wcr_tf_{}'.format(i)] = {
            'er': rpd_eval.er(base_scores, adv_scores),
            'dri': rpd_eval.dri(base_scores, adv_scores),
        }

    measures = ['P_10', 'map', 'ndcg']
    marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Effect Ratio (ER)')
    ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')

    for measure, mk in zip(measures, marker_color):
        ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],
                 [dri_er[r]['dri'][measure] for r in dri_er.keys()],
                 marker=mk[0],
                 color=mk[1],
                 linestyle='None',
                 label=measure)

    ax1.tick_params(axis='y', labelcolor='k')
    fig.tight_layout()
    plt.axhline(0, color='grey')
    plt.axvline(1, color='grey')
    plt.legend()
    plt.title('Reproducibility')
    plt.savefig('data/plots/rpd_dri_vs_er.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.show()
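The grey reference lines mark the point of a perfect reproduction: an effect ratio of 1 means the improvement of the advanced run over the baseline is reproduced at its original magnitude, and a ΔRI of 0 means the relative improvement is unchanged.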
Code Example #11
File: test_empty_rpd.py Project: irgroup/repro_eval
import pytest
from repro_eval.Evaluator import RpdEvaluator
from repro_eval.config import ERR_MSG

rpd_eval = RpdEvaluator(qrel_orig_path=None,
                        run_b_orig_path=None,
                        run_a_orig_path=None,
                        run_b_rep_path=None,
                        run_a_rep_path=None)


def test_ktu(capfd):
    assert None is rpd_eval.ktau_union()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_rbo(capfd):
    assert None is rpd_eval.rbo()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_rmse(capfd):
    assert None is rpd_eval.rmse()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])


def test_er(capfd):
    assert None is rpd_eval.er()
    out, err = capfd.readouterr()
    assert out == ''.join([ERR_MSG, '\n'])
Code Example #12
File: test_path_param.py Project: irgroup/repro_eval
import pytest
from repro_eval.Evaluator import RpdEvaluator, RplEvaluator
import numpy as np

rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                        run_b_orig_path='./example/orig_b.txt',
                        run_a_orig_path='./example/orig_a.txt',
                        run_b_rep_path='./example/rpd_b.txt',
                        run_a_rep_path='./example/rpd_a.txt')

rpd_eval.trim()
rpd_eval.evaluate()


def test_ktu_path_param():
    ktu = rpd_eval.ktau_union()
    assert 'baseline' in ktu.keys()
    assert 'advanced' in ktu.keys()

    _rpd_eval = RpdEvaluator(qrel_orig_path='./example/data/qrels/core17.txt',
                             run_b_orig_path='./example/orig_b.txt',
                             run_a_orig_path='./example/orig_a.txt')
    _rpd_eval.trim()
    _rpd_eval.evaluate()

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt')
    assert 'baseline' in _ktu.keys()
    assert ktu.get('baseline') == _ktu.get('baseline')

    _ktu = _rpd_eval.ktau_union(run_b_path='./example/rpd_b.txt',
                                run_a_path='./example/rpd_a.txt')
    assert 'advanced' in _ktu.keys()
    assert ktu.get('advanced') == _ktu.get('advanced')
Code Example #13
File: rpd_eval.py Project: irgroup/repro_eval
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=RPD_B,
                            run_a_rep_path=RPD_A)

    rpd_eval.trim()
    rpd_eval.evaluate()

    # KTU
    ktau = rpd_eval.ktau_union()
    print("Kendall's tau Union (KTU)")
    print('------------------------------------------------------------------')
    for topic, value in ktau.get('baseline').items():
        print_base_adv(topic, 'KTU', value, ktau.get('advanced').get(topic))
    print_base_adv('ARP', 'KTU', arp(ktau.get('baseline')),
                   arp(ktau.get('advanced')))

    # RBO
    rbo = rpd_eval.rbo()
    print("Rank-biased Overlap (RBO)")
    print('------------------------------------------------------------------')
    for topic, value in rbo.get('baseline').items():
        print_base_adv(topic, 'RBO', value, rbo.get('advanced').get(topic))
    print_base_adv('ARP', 'RBO', arp(rbo.get('baseline')),
                   arp(rbo.get('advanced')))

    # RMSE
    rmse = rpd_eval.rmse()
    print("Root mean square error (RMSE)")
    print('------------------------------------------------------------------')
    for measure, value in rmse.get('baseline').items():
        print_base_adv(measure, 'RMSE', value,
                       rmse.get('advanced').get(measure))

    # ER
    print("Effect ratio (ER)")
    print('------------------------------------------------------------------')
    er = rpd_eval.er()
    for measure, value in er.items():
        print_simple_line(measure, 'ER', value)

    # DRI
    print("Delta Relative Improvement (DRI)")
    print('------------------------------------------------------------------')
    dri = rpd_eval.dri()
    for measure, value in dri.items():
        print_simple_line(measure, 'DRI', value)

    # ttest
    pvals = rpd_eval.ttest()
    print("Two-tailed paired t-test (p-value)")
    print('------------------------------------------------------------------')
    for measure, value in pvals.get('baseline').items():
        print_base_adv(measure, 'PVAL', value,
                       pvals.get('advanced').get(measure))