Example #1
def test_run_rbo():
    with open('./example/rpd_b.txt') as _base_file, open(
            './example/rpd_a.txt') as _adv_file:
        _base_run = pytrec_eval.parse_run(_base_file)
        _adv_run = pytrec_eval.parse_run(_adv_file)
    _rbo = rpd_eval.rbo(run_b_rep=_base_run, run_a_rep=_adv_run)
    assert 'baseline' in _rbo.keys()
    assert rbo_base == _rbo.get('baseline')
    assert 'advanced' in _rbo.keys()
    assert rbo_adv == _rbo.get('advanced')
Example #2
def test_run_ktu():
    with open('./example/rpd_b.txt') as _base_file, open(
            './example/rpd_a.txt') as _adv_file:
        _base_run = pytrec_eval.parse_run(_base_file)
        _adv_run = pytrec_eval.parse_run(_adv_file)
    _ktu = rpd_eval.ktau_union(run_b_rep=_base_run, run_a_rep=_adv_run)
    assert 'baseline' in _ktu.keys()
    assert ktu_base == _ktu.get('baseline')
    assert 'advanced' in _ktu.keys()
    assert ktu_adv == _ktu.get('advanced')
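
Both tests above reference rpd_eval, rbo_base, rbo_adv, ktu_base, and ktu_adv without defining them, so they are presumably module-level fixtures. A minimal sketch of such a setup, assuming the repro_eval package (the import path, file locations, and expected constants are assumptions, not part of the original example):

import pytrec_eval
from repro_eval.Evaluator import RpdEvaluator  # assumed import path

# reproducibility evaluator built from the original qrels and runs (hypothetical paths)
rpd_eval = RpdEvaluator(qrel_orig_path='./example/qrel.txt',
                        run_b_orig_path='./example/orig_b.txt',
                        run_a_orig_path='./example/orig_a.txt',
                        run_b_rep_path=None,
                        run_a_rep_path=None)
rpd_eval.trim()
rpd_eval.evaluate()

# the expected values would be precomputed constants in the real test module, e.g.
# rbo_base, rbo_adv = ...
# ktu_base, ktu_adv = ...
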
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')
    parser.add_argument('measure')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, {args.measure})

    results = evaluator.evaluate(run)
    
    def print_line(measure, scope, value):
        # scope = query_id = topic_id
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    avg_DCG = []
    for query_id, query_measures in results.items():
        for measure, value in sorted(query_measures.items()):
            avg_DCG.append(value)
            print_line(measure, query_id, value)

    print(avg_DCG)
    # note: this is the mean of whichever measure was requested on the command line
    print('avg of {} {:f}'.format(args.measure, mean(avg_DCG)))
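
Like the other snippets on this page, the script above omits its imports; it presumably relies on something along these lines (mean could equally come from numpy):

import argparse
import os
from statistics import mean

import pytrec_eval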
Example #4
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10',
                   split: dict = None,
                   split_idx: int = -1) -> float:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        # partial evaluation
        if split is not None and split_idx >= 0:
            for qid in copy.deepcopy(run):
                if qid not in split[split_idx]:
                    _ = run.pop(qid)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        # iterate to the end so that query_measures ends up holding the last
        # query's scores; its keys are reused below to enumerate measure names
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
def read_ranking(bow_model_path):
    """return BoW ranking as dict of dicts {qid: {doc_id: score, ...}, ...}"""
    print('read BoW ranking')
    with open(bow_model_path, 'r') as f:
        # pytrec_eval loads ranking as dict of dicts
        run = pytrec_eval.parse_run(f)
    return run
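
Since pytrec_eval.parse_run returns a plain dict of dicts, the loaded ranking can be inspected directly; a small sketch with a hypothetical run file:

run = read_ranking('runs/bm25.trec')        # hypothetical path
qid = next(iter(run))                       # some query id
best_doc = max(run[qid], key=run[qid].get)  # highest-scoring doc for that query
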
def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on,
                   can use any valid metrics that the trec_eval tool accepts

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
        
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
                measure, 
                [query_measures[measure]  for query_measures in results.values()]
            )
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
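
A usage sketch with hypothetical file names; with the default metrics this returns mean nDCG and MAP as percentages rounded to two decimals (note the function itself assumes numpy is imported as np):

scores = qrel_metrics('data/test.qrels', 'runs/bm25.trec')
print(scores)  # {'map': ..., 'ndcg': ...}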
Example #7
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)
    # iterate to the end so that query_measures ends up holding the last
    # query's scores; its keys are reused below to enumerate measure names
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        print('Depth of NDCG not available.')
        exit()
    ndcg = mes[metric]

    return ndcg
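
Note that cal_ndcg evaluates every supported measure only to read a single nDCG cutoff. A lighter sketch of the same aggregation, as a drop-in for the body above, restricts the evaluator to ndcg_cut (which yields the default trec_eval cutoffs 5, 10, 15, 20, 30, 100, 200, 500 and 1000):

evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut'})
results = evaluator.evaluate(run)
metric = 'ndcg_cut_%d' % k
ndcg = pytrec_eval.compute_aggregated_measure(
    metric, [measures[metric] for measures in results.values()])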
Example #8
def evaluate(eval_path, qrel_path, res_path):

    measures = {"map", "ndcg_cut", "recall", "P"}

    with open(qrel_path, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    with open(res_path, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    all_metrics = evaluator.evaluate(run)

    metrics = {
        'P_5': 0,
        'P_10': 0,
        'P_20': 0,
        'ndcg_cut_5': 0,
        'ndcg_cut_10': 0,
        'ndcg_cut_20': 0,
        'ndcg_cut_100': 0,
        'map': 0,
        'recall_100': 0
    }

    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    with open(eval_path, 'w') as f:
        json.dump(metrics, f)
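
The function above writes the averaged metrics as JSON, so it depends on the json module being imported; a hypothetical invocation:

import json  # required by evaluate()

evaluate(eval_path='eval/bm25_metrics.json',  # hypothetical paths
         qrel_path='data/test.qrels',
         res_path='runs/bm25.trec')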
Example #9
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])
            info['rmse'] = rpd_eval.rmse(run_b_score=info['scores'])

    baseline_runs = [
        'rpd_wcr04_tf_1', 'rpd_wcr04_tf_2', 'rpd_wcr04_tf_3', 'rpd_wcr04_tf_4',
        'rpd_wcr04_tf_5'
    ]
    advanced_runs = [
        'rpd_wcr0405_tf_1', 'rpd_wcr0405_tf_2', 'rpd_wcr0405_tf_3',
        'rpd_wcr0405_tf_4', 'rpd_wcr0405_tf_5'
    ]
    cutoffs = ['5', '10', '15', '20', '30', '100', '200', '500', '1000']

    df_content = {}
    for run_name in baseline_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_b_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    df_content = {}
    for run_name in advanced_runs:
        df_content[run_name] = [
            runs_rpd[run_name]['rmse']['baseline']['ndcg_cut_' + co]
            for co in cutoffs
        ]

    df = pd.DataFrame(df_content, index=cutoffs)
    ax = df.plot.line(style='o-')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel('RMSE')
    ax.get_figure().savefig('data/plots/rpd_a_rmse.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
Example #10
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run', nargs=2)

    # A bit too strict, as it does not allow for parametrized measures,
    # but sufficient for the example.
    parser.add_argument(
        '--measure',
        #choices=pytrec_eval.supported_measures,
        required=True)

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert all(map(os.path.exists, args.run))

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run[0], 'r') as f_run:
        first_run = pytrec_eval.parse_run(f_run)

    with open(args.run[1], 'r') as f_run:
        second_run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {args.measure})

    first_results = evaluator.evaluate(first_run)
    print(first_results.keys())
    second_results = evaluator.evaluate(second_run)

    query_ids = list(set(first_results.keys()) & set(second_results.keys()))

    first_scores = [
        first_results[query_id][args.measure] for query_id in query_ids
    ]
    second_scores = [
        second_results[query_id][args.measure] for query_id in query_ids
    ]

    print(scipy.stats.ttest_rel(first_scores, second_scores))
Example #11
def pytrec_evaluation(runfile, qrelfile, measures = pytrec_eval.supported_measures):
    """ run trec_eval with "measures" from the Python interface """
    with open(runfile, "r") as ranking:
        run = pytrec_eval.parse_run(ranking)
    with open(qrelfile, "r") as qrel:
        qrel = pytrec_eval.parse_qrel(qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, measures)

    return evaluator.evaluate(run)
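
A usage sketch with hypothetical paths, restricting the evaluation to MAP and nDCG and printing the per-query scores:

per_query = pytrec_evaluation('runs/bm25.trec', 'data/test.qrels',
                              measures={'map', 'ndcg'})
for qid, scores in sorted(per_query.items()):
    print(qid, scores['map'], scores['ndcg'])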
Example #12
def main():
    rpl_eval = RplEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None,
                            qrel_rpd_path=QREL_RPL)

    rpl_eval.trim()
    rpl_eval.evaluate()

    for run_name, info in runs_rpl.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpl_eval.evaluate(info['run'])

    pairs = [('rpl_wcr04_tf_1', 'rpl_wcr0405_tf_1'),
             ('rpl_wcr04_tf_2', 'rpl_wcr0405_tf_2'),
             ('rpl_wcr04_tf_3', 'rpl_wcr0405_tf_3'),
             ('rpl_wcr04_tf_4', 'rpl_wcr0405_tf_4'),
             ('rpl_wcr04_tf_5', 'rpl_wcr0405_tf_5')]

    df_content = {
        'P_10': [
            rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'],
                        run_a_score=runs_rpl[pair[1]]['scores'])['P_10']
            for pair in pairs
        ],
        'ndcg': [
            rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'],
                        run_a_score=runs_rpl[pair[1]]['scores'])['ndcg']
            for pair in pairs
        ],
        'map': [
            rpl_eval.er(run_b_score=runs_rpl[pair[0]]['scores'],
                        run_a_score=runs_rpl[pair[1]]['scores'])['map']
            for pair in pairs
        ],
    }

    df = pd.DataFrame(df_content,
                      index=['tf_1', 'tf_2', 'tf_3', 'tf_4', 'tf_5'])
    orig_val = 1
    ax = df.plot.bar(rot=0)
    ax.hlines(orig_val, -.5, 5.5, linestyles='dashed', color='black')
    ax.annotate(' ', (3, orig_val), color='black')
    ax.set_xlabel("Replicated Run")
    ax.set_ylabel("Effect Ratio (ER)")
    ax.get_figure().savefig('data/plots/rpl_er.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
Example #13
def compute_metrics(coll_path,
                    Collection,
                    queries_index,
                    qrel,
                    results,
                    model_name,
                    save_res=False):
    """Function that saves the results of retrieval: the top_k documents according to their score for
    a certain model identified by model_name. Then, it computes different metrics for IR using the pytrec_eval
    package""" #HR
    Collection.save_results(queries_index, results, model_name, top_k=1000)

    with open(model_name, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
    if not save_res:
        os.remove(model_name)

    # the metrics dict below reads P_*, map and recall_1000 as well, so the
    # evaluator needs the full measure set, not just "ndcg_cut"
    measures = {"map", "ndcg_cut", "recall", "P"}

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures)

    all_metrics = evaluator.evaluate(run)

    metrics = {
        'P_5': 0,
        'P_10': 0,
        'P_20': 0,
        'ndcg_cut_5': 0,
        'ndcg_cut_10': 0,
        'ndcg_cut_20': 0,
        'ndcg_cut_1000': 0,
        'map': 0,
        'recall_1000': 0
    }

    nb_queries = len(all_metrics)
    for key, values in all_metrics.items():
        for metric in metrics:
            metrics[metric] += values[metric] / nb_queries

    return metrics
Example #14
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))
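
The scope hack flagged in the comment can be avoided by collecting the measure names from the results themselves; a sketch of an equivalent aggregation over the same results dict:

all_measures = set()
for query_measures in results.values():
    all_measures.update(query_measures.keys())

for measure in sorted(all_measures):
    print_line(
        measure,
        'all',
        pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure]
             for query_measures in results.values()]))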
Example #15
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    average_retrieval_performance(
        rpd_eval.run_b_orig_score, {
            'tf_1': runs_rpd.get('rpd_wcr04_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr04_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr04_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr04_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr04_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr04)',
        ylabel='Score',
        outfile='data/plots/rpd_b_arp.pdf')

    average_retrieval_performance(
        rpd_eval.run_a_orig_score, {
            'tf_1': runs_rpd.get('rpd_wcr0405_tf_1').get('scores'),
            'tf_2': runs_rpd.get('rpd_wcr0405_tf_2').get('scores'),
            'tf_3': runs_rpd.get('rpd_wcr0405_tf_3').get('scores'),
            'tf_4': runs_rpd.get('rpd_wcr0405_tf_4').get('scores'),
            'tf_5': runs_rpd.get('rpd_wcr0405_tf_5').get('scores'),
        },
        measures=['P_10', 'ndcg', 'bpref', 'map'],
        xlabel='Reproduced run (wcr0405)',
        ylabel='Score',
        outfile='data/plots/rpd_a_arp.pdf')
Example #16
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10') -> float:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        # iterate to the end so that query_measures ends up holding the last
        # query's scores; its keys are reused below to enumerate measure names
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
Example #17
    def __test(self):
        with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as \
                f_trec_eval:
            trec_eval_output = parse_trec_eval(f_trec_eval)

        measures = set(
            measure if measure in pytrec_eval.supported_measures else
            prefix_match(measure, pytrec_eval.supported_measures)
            for measure in trec_eval_output['all'].keys())

        with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)

        results = evaluator.evaluate(run)

        expected_measures = trec_eval_output['all']

        for measure in expected_measures:
            agg_measure_value = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measure_values[measure]
                    for query_measure_values in results.values()
                ])

            ground_truth_agg_measure_value = \
                trec_eval_output['all'][measure]

            self.assertAlmostEqual(agg_measure_value,
                                   ground_truth_agg_measure_value,
                                   places=3,
                                   msg=measure)
def main():
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
        # if not os.path.exists(model_folder):
        # os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print(
        'load processed data required to retrofit word vectors and perform retrieval tasks'
    )
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)
    """
	SEMANTIC PROCESSING
	"""

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    """
	RETROFITTING
	"""

    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(
            FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)

    if not FLAGS.reranking:
        """
		RETRIEVAL
		"""
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict,
                                            word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(
            ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'], rankings_folder,
            FLAGS.model_name, qrels_folder, FLAGS.qrels_fname)

    else:
        """
		RE-RANKING
		"""
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, docno)
                          for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_gamma_' +
                                 str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' +
                              str(FLAGS.fixed_gamma), qrels_folder,
                              FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(
                FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(
                        run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                        word_embs, FLAGS.sweep,
                        SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.ref_measure,
                        evaluator))
                print(
                    'fold %d: best_train_weight=%.2f, %s =%.4f' %
                    (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure, [
                        qscore[FLAGS.ref_measure]
                        for qscore in test_res.values()
                    ])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))

            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name +
                                            '_best_weight_' +
                                            str(FLAGS.best_weight))
            # compute combined run based on test weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, doc_id)
                          for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_best_weight_' +
                                 str(FLAGS.best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print(
                'evaluate run combined w/ {}-fold cross validation and best weight={}'
                .format(FLAGS.num_folds, FLAGS.best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' +
                              str(FLAGS.best_weight), qrels_folder,
                              FLAGS.qrels_fname)
Example #19
with open(qrel_path, 'r') as f_qrel:
    qrel = pytrec_eval.parse_qrel(f_qrel)

evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg_cut'})

redirects = {}
with open(redirects_path) as f:
    for line in f:
        if not line.startswith('#'):
            subj, pred, obj = line.split(maxsplit=2)
            obj = obj[:obj.rfind('.')].strip()
            redirects[subj] = obj

with open(ir_run_path, "r") as ir_run_file:
    ir_run = pytrec_eval.parse_run(ir_run_file)

model = Word2Vec.load(args.model)

entityv = KeyedVectors(model.vector_size * 2)
entityv_entities = []
entityv_weights = []
wordv = KeyedVectors(model.vector_size * 2)
wordv_entities = []
wordv_weights = []
for entity, vocab in model.wv.vocab.items():
    if entity.startswith('<'):
        entityv_entities.append(entity)
        entityv_weights.append(
            np.concatenate(
                (model.syn1neg[vocab.index], model.wv.syn0[vocab.index])))
Example #20
def main():
    cutoffs = [1000, 100, 50, 20, 10, 5]

    # BASELINE
    for run_name, info in zip(
            list(runs_rpd.keys())[::2],
            list(runs_rpd.values())[::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)

        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            for cutoff in cutoffs:
                rpd_eval.trim(cutoff)
                rpd_eval.trim(cutoff, info['run'])
                info['ktu_' + str(cutoff)] = arp(
                    rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(
            list(runs_rpd.keys())[::2],
            list(runs_rpd.values())[::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_b_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()

    # ADVANCED
    for run_name, info in zip(
            list(runs_rpd.keys())[1::2],
            list(runs_rpd.values())[1::2]):
        rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                                run_b_orig_path=ORIG_B,
                                run_a_orig_path=ORIG_A,
                                run_b_rep_path=None,
                                run_a_rep_path=None)

        rpd_eval.trim()
        rpd_eval.evaluate()

        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            for cutoff in cutoffs:
                rpd_eval.trim(cutoff)
                rpd_eval.trim(cutoff, info['run'])
                # scores = rpl_eval.evaluate(info['run'])
                info['ktu_' + str(cutoff)] = arp(
                    rpd_eval.ktau_union(info['run'])['baseline'])

    df_content = {}
    for run_name, info in zip(
            list(runs_rpd.keys())[1::2],
            list(runs_rpd.values())[1::2]):
        df_content[run_name] = [
            info.get('ktu_' + str(cutoff)) for cutoff in cutoffs[::-1]
        ]

    ax = pd.DataFrame(data=df_content,
                      index=[str(cutoff)
                             for cutoff in cutoffs[::-1]]).plot(style='-*')
    ax.set_xlabel('Cut-off values')
    ax.set_ylabel(r"Kendall's $\tau$")
    ax.get_figure().savefig('data/plots/rpd_a_ktu.pdf',
                            format='pdf',
                            bbox_inches='tight')
    plt.show()
Example #21
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', nargs="?", type=str)
    args = parser.parse_args()

    config = json.load(open(args.config, 'r'))

    IR_models = [
        mz.models.list_available()[i] for i in config["index_mz_models"]
    ]

    with open(config["collection_path"] + '/test/qrels', 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(config["measures"]))

    bm25_res = json.load(
        open(config["collection_path"] + '/test/' + 'BM25.metrics.json', 'r'))

    with open(config["collection_path"] + '/test/' + 'BM25.res', 'r') as f_run:
        bm25_run = pytrec_eval.parse_run(f_run)

    bm25_results = evaluator.evaluate(bm25_run)

    _ = ""

    for key, value in bm25_res.items():
        if key in config["print_measures"]:
            _ += str(value)[:6] + " & "

    print('BM25 & ' + _[:-2] + '\\\\')

    all_res = dict()
    for model_class in IR_models:

        validation_path = config[
            "collection_path"] + '/validation/' + model_class.__name__
        test_path = config["collection_path"] + '/test/' + model_class.__name__

        if os.path.exists(validation_path) and os.path.exists(test_path):
            best_model = ""
            best_metric = 0
            for file in os.listdir(validation_path):
                if '.json' in file:
                    val_res = json.load(open(validation_path + '/' + file,
                                             'r'))
                    if val_res[config["optim_measure"]] > best_metric:
                        best_model = file
                        best_metric = val_res[config["optim_measure"]]

            if best_model != "" and os.path.exists(test_path + '/' +
                                                   best_model):
                test_res = json.load(open(test_path + '/' + best_model, 'r'))
                all_res[model_class.__name__] = [best_model, test_res]

                with open(
                        config["collection_path"] + '/test/' +
                        model_class.__name__ + '/' + best_model[:-12] + 'res',
                        'r') as f_run:
                    run = pytrec_eval.parse_run(f_run)

                results = evaluator.evaluate(run)

                query_ids = list(
                    set(bm25_results.keys()) & set(results.keys()))

                _ = ""

                for key, value in test_res.items():
                    if key in config["print_measures"]:
                        bm25_scores = [
                            bm25_results[query_id][key]
                            for query_id in query_ids
                        ]
                        scores = [
                            results[query_id][key] for query_id in query_ids
                        ]
                        test = scipy.stats.ttest_rel(bm25_scores, scores)
                        _ += str(value)[:6]
                        if test[0] < 0:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{++}}"
                            elif test[1] < 0.05 / len(
                                    config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{+}}"

                        else:
                            if test[1] < 0.01 / len(config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{-\,-}}"
                            elif test[1] < 0.05 / len(
                                    config["print_measures"]):
                                _ += "\\textsuperscript{\\textbf{-}}"

                        _ += " & "

                print(model_class.__name__ + ' & ' + _[:-2] + '\\\\')
Example #22
def main():
    rpd_eval = RpdEvaluator(qrel_orig_path=QREL,
                            run_b_orig_path=ORIG_B,
                            run_a_orig_path=ORIG_A,
                            run_b_rep_path=None,
                            run_a_rep_path=None)

    rpd_eval.trim()
    rpd_eval.evaluate()

    for run_name, info in runs_rpd.items():
        with open(info.get('path')) as run_file:
            info['run'] = pytrec_eval.parse_run(run_file)
            trim(info['run'])
            info['scores'] = rpd_eval.evaluate(info['run'])

    dri_er = {}
    for i in range(1, 6):
        b_scores = runs_rpd['rpd_wcr04_tf_{}'.format(i)]['scores']
        a_scores = runs_rpd['rpd_wcr0405_tf_{}'.format(i)]['scores']
        dri_er['wcr_tf_{}'.format(i)] = {
            'er': rpd_eval.er(b_scores, a_scores),
            'dri': rpd_eval.dri(b_scores, a_scores),
        }

    measures = ['P_10', 'map', 'ndcg']
    marker_color = [('o', 'b'), ('^', 'g'), ('v', 'r')]

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Effect Ratio (ER)')
    ax1.set_ylabel(u'Delta Relative Improvement (ΔRI)')

    for measure, mk in zip(measures, marker_color):
        ax1.plot([dri_er[r]['er'][measure] for r in dri_er.keys()],
                 [dri_er[r]['dri'][measure] for r in dri_er.keys()],
                 marker=mk[0],
                 color=mk[1],
                 linestyle='None',
                 label=measure)

    ax1.tick_params(axis='y', labelcolor='k')
    fig.tight_layout()
    plt.axhline(0, color='grey')
    plt.axvline(1, color='grey')
    plt.legend()
    plt.title('Reproducibility')
    plt.savefig('data/plots/rpd_dri_vs_er.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.show()
Example #23
def main():
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision
    else:
        print("please provide qrels filename")
        return False
    """
	LEXICAL PREPROCESSING
	"""

    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)
    """
	SEMANTIC PREPROCESSING
	"""

    # load required data
    print(
        'load processed data required to perform re-ranking over lexical model w/ semantic model'
    )
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)
    """
	COMPUTE RE-RANKING
	"""

    # set random seed
    np.random.seed(FLAGS.seed)
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids
    qids = list(q.keys())
    # shuffle query ids
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                        str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, docno)
                              for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_gamma_' +
                             str(FLAGS.fixed_gamma) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' +
                          str(FLAGS.fixed_gamma), qrels_folder,
                          FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print("learn optimal weight to combine runs with sweep: {}".format(
            FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []

        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                           doc_embs, word_dict, word_embs,
                                           FLAGS.sweep,
                                           SCORE_NORMALIZERS[FLAGS.normalizer],
                                           FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s =%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append(
                (np.mean([train_score, test_score]), best_train_weight))

        # get (best) weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                        str(FLAGS.best_weight))
        # compute combined run based on test weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, doc_id)
                              for doc_id, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_best_weight_' +
                             str(FLAGS.best_weight) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print(
            'evaluate run combined w/ {}-fold cross validation and best weight={}'
            .format(FLAGS.num_folds, FLAGS.best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' +
                          str(FLAGS.best_weight), qrels_folder,
                          FLAGS.qrels_fname)
Example #24
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--qrel',
        default=
        '/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt'
    )
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')

    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run", args.run)

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    total = len(results.items())
    sum_map = 0.0

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            # print_line(measure, query_id, value)
            pass

    # The aggregation below uses a hard-coded list of measures, so the
    # per-query loop above is only kept for its (commented-out) per-query
    # printing.
    print("==========")
    selected_measures = [
        'map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20', 'recall_5',
        'recall_10', 'recall_15', 'recall_20', 'ndcg'
    ]

    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        # print_line( measure, 'all', eva_values[measure])
    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    print(
        "%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f"
        % (final_auc, final_accuracy, eva_values['map'],
           eva_values['recip_rank'], eva_values['P_5'], eva_values['P_10'],
           eva_values['P_15'], eva_values['P_20'], eva_values['recall_5'],
           eva_values['recall_10'], eva_values['recall_15'],
           eva_values['recall_20'], eva_values['ndcg']))