import pytrec_eval
import numpy as np


def qrel_metrics(qrel_file, run_file, metrics=('ndcg', 'map')):
    """Get metrics (ndcg and map by default) for a run compared to a qrel file.

    Arguments:
        qrel_file -- qrel file with ground truth data
        run_file -- predictions from the run
        metrics -- which metrics to evaluate on; any measure name
                   accepted by the trec_eval tool can be used

    Returns:
        metric_values -- dictionary of metric values (out of 100), rounded to two decimal places
    """
    with open(qrel_file, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(run_file, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)
        
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, set(metrics))
    results = evaluator.evaluate(run)

    metric_values = {}
    for measure in sorted(metrics):
        res = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        metric_values[measure] = np.round(100 * res, 2)
    return metric_values
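A minimal usage sketch (not part of the original example), assuming the standard TREC formats that pytrec_eval.parse_qrel / parse_run expect: each qrel line is "<query_id> <iteration> <doc_id> <relevance>" and each run line is "<query_id> Q0 <doc_id> <rank> <score> <tag>".

# hypothetical file names; any TREC-format qrel/run pair works
scores = qrel_metrics('qrels.txt', 'run.txt', metrics=('ndcg', 'map'))
print(scores)  # e.g. {'map': <float out of 100>, 'ndcg': <float out of 100>}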
Example #2
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10',
                   split: dict = None,
                   split_idx: int = -1) -> float:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        # partial evaluation
        if split is not None and split_idx >= 0:
            for qid in list(run):  # iterate over a copy of the keys while popping
                if qid not in split[split_idx]:
                    _ = run.pop(qid)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        # iterate once so query_measures ends up holding the last query's
        # per-measure dict; its keys give the full set of measure names
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
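The loop over sorted(results.items()) above exists only so that query_measures ends up holding the last query's per-measure dict, whose keys are the measure names. A shorter sketch of the same idea, assuming results is non-empty:

# take the measure names from any single query's result dict
measure_names = sorted(next(iter(results.values())).keys())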
Example #3
    def show(self, metrics):
        result = {}
        for metric in metrics:
            res = pytrec_eval.compute_aggregated_measure(
                metric, [user[metric] for user in self.result.values()])
            result[metric] = res
            # print('{}={}'.format(metric, res))
        return result
Example #4
def cal_ndcg(qrels, trec, k):
    with open(qrels, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(trec, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)
    results = evaluator.evaluate(run)
    # iterate once so query_measures ends up holding the last query's
    # per-measure dict; its keys give the full set of measure names
    for query_id, query_measures in sorted(results.items()):
        pass

    mes = {}
    for measure in sorted(query_measures.keys()):
        mes[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])

    metric = 'ndcg_cut_%d' % k
    if metric not in mes:
        raise ValueError('Depth of NDCG not available: %s' % metric)
    ndcg = mes[metric]

    return ndcg
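A minimal usage sketch (hypothetical paths); k has to be one of the ndcg_cut depths computed by trec_eval (by default 5, 10, 15, 20, 30, 100, 200, 500 and 1000):

ndcg_at_10 = cal_ndcg('qrels.txt', 'run.trec', 10)
print('ndcg_cut_10 = %.4f' % ndcg_at_10)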
Example #5
def perform_reranking(run, qfield, queries, docnos, doc_embs, word_dict,
                      word_embs, sweep, normalizer, ref_measure, evaluator):
    """perform re-ranking of input run w/ semantic model"""
    # sweep over weight values in [0.0, 1.0) with step size equal to sweep
    for weight in np.arange(0.0, 1.0, sweep):
        # generate combined run with current weight
        combined_run = compute_combined_run(run, qfield, queries, docnos,
                                            doc_embs, word_dict, word_embs,
                                            normalizer, weight)
        # evaluate combined run
        results = evaluator.evaluate(combined_run)
        # compute aggregated measure score
        agg_measure_score = pytrec_eval.compute_aggregated_measure(
            ref_measure, [qscore[ref_measure] for qscore in results.values()])
        # yield aggregated measure score and weight
        yield agg_measure_score, weight
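Because perform_reranking yields (aggregated score, weight) pairs, a caller can pick the best interpolation weight with max(), as the re-ranking scripts further below do. A sketch whose arguments mirror the signature above:

best_score, best_weight = max(
    perform_reranking(run, qfield, queries, docnos, doc_embs, word_dict,
                      word_embs, sweep, normalizer, ref_measure, evaluator))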
Example #6
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('qrel')
    parser.add_argument('run')

    args = parser.parse_args()

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))
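A minimal entry point (assumed, not shown in the snippet) so the script can be invoked from the command line with the qrel and run paths as positional arguments:

if __name__ == '__main__':
    main()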
Example #7
    def evaluate(self, metrics):
        if platform.system().lower().startswith("win"):
            print("Cannot evaluate results on the Windows platform.")
            self.final_measures = {"P_20": 0.1}
            return self.final_measures

        evaluator = pytrec_eval.RelevanceEvaluator(self.qrels,
                                                   set(self.all_metrics))
        results = evaluator.evaluate(self.predicted_qrels)

        final_measures = dict()
        for measure in metrics:
            final_measures[measure] = pytrec_eval.compute_aggregated_measure(
                measure,
                [
                    query_measures[measure]
                    for query_measures in results.values()
                ],
            )
        self.final_measures = final_measures
        return self.final_measures
Example #8
    def get_metric(self,
                   qrels: str,
                   trec: str,
                   metric: str = 'ndcg_cut_10') -> float:
        with open(qrels, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)
        with open(trec, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(
            qrel, pytrec_eval.supported_measures)
        results = evaluator.evaluate(run)
        # iterate once so query_measures ends up holding the last query's
        # per-measure dict; its keys give the full set of measure names
        for query_id, query_measures in sorted(results.items()):
            pass
        mes = {}
        for measure in sorted(query_measures.keys()):
            mes[measure] = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measures[measure]
                    for query_measures in results.values()
                ])
        return mes[metric]
Example #9
def evaluate(qrels_df,
             run_df,
             aggregated_measures=None):
    # avoid a mutable default argument: build the aggregated-measures dict per call
    if aggregated_measures is None:
        aggregated_measures = {
            'recall_1000': '',
            'ndcg': '',
            'Rprec': '',
            'P_10': ''
        }
    MEASURES_AGGREGATED = dict(aggregated_measures)

    evaluator = pytrec_eval.RelevanceEvaluator(
        utils.qrels_to_pytrec_eval(qrels_df),
        pytrec_eval.supported_measures)
    results = evaluator.evaluate(utils.run_to_pytrec_eval(run_df))

    for measure in MEASURES_AGGREGATED.keys():
        measure_all = pytrec_eval.compute_aggregated_measure(
            measure, [
                query_measures[measure]
                for query_measures in results.values()
            ])
        MEASURES_AGGREGATED[measure] = round(measure_all, 4)

    return (results, MEASURES_AGGREGATED)
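For reference, pytrec_eval's RelevanceEvaluator expects qrels as a nested dict {query_id: {doc_id: integer relevance}} and evaluate() expects the run as {query_id: {doc_id: float score}}; the utils.qrels_to_pytrec_eval / utils.run_to_pytrec_eval helpers presumably convert the DataFrames into that shape. A hand-built sketch:

import pytrec_eval

qrel = {'q1': {'d1': 1, 'd2': 0}}
run = {'q1': {'d1': 2.3, 'd2': 1.1}}
evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map', 'ndcg'})
print(evaluator.evaluate(run))  # {'q1': {'map': ..., 'ndcg': ...}}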
Example #10
    def __test(self):
        with open(os.path.join(TREC_EVAL_TEST_DIR, ground_truth_filename)) as \
                f_trec_eval:
            trec_eval_output = parse_trec_eval(f_trec_eval)

        measures = set(
            measure if measure in pytrec_eval.supported_measures else
            prefix_match(measure, pytrec_eval.supported_measures)
            for measure in trec_eval_output['all'].keys())

        with open(os.path.join(TREC_EVAL_TEST_DIR, qrel_filename)) as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(os.path.join(TREC_EVAL_TEST_DIR, run_filename)) as f_run:
            run = pytrec_eval.parse_run(f_run)

        evaluator = pytrec_eval.RelevanceEvaluator(qrel, measures, **kwargs)

        results = evaluator.evaluate(run)

        expected_measures = trec_eval_output['all']

        for measure in expected_measures:
            agg_measure_value = pytrec_eval.compute_aggregated_measure(
                measure, [
                    query_measure_values[measure]
                    for query_measure_values in results.values()
                ])

            ground_truth_agg_measure_value = \
                trec_eval_output['all'][measure]

            self.assertAlmostEqual(agg_measure_value,
                                   ground_truth_agg_measure_value,
                                   places=3,
                                   msg=measure)
Example #11
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))

    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # parse and store qrels
    if FLAGS.qrels_fname:
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision
    else:
        print("please provide qrels filename")
        return False
    """
	LEXICAL PREPROCESSING
	"""

    # parse input run
    print('parse input run')
    with open(FLAGS.run_path, 'r') as runf:
        run = pytrec_eval.parse_run(runf)
    """
	SEMANTIC PREPROCESSING
	"""

    # load required data
    print(
        'load processed data required to perform re-ranking over lexical model w/ semantic model'
    )
    with open(data_folder + '/docs.json', 'r') as cf:
        corpus = json.load(cf)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dictionary
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))

    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    # compute doc embeddings
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)
    """
	COMPUTE RE-RANKING
	"""

    # set random seed
    np.random.seed(FLAGS.seed)
    # load queries
    q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
    # get query ids
    qids = list(q.keys())
    # shuffle query ids
    np.random.shuffle(qids)

    if FLAGS.fixed_gamma:
        # perform re-ranking based on a fixed value of gamma
        print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                        str(FLAGS.fixed_gamma))
        # combine rankings using fixed gamma
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
        # store test ranking in combined run
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, docno)
                              for docno, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_gamma_' +
                             str(FLAGS.fixed_gamma) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_gamma_' +
                          str(FLAGS.fixed_gamma), qrels_folder,
                          FLAGS.qrels_fname)
    else:
        # learn optimal weight to combine runs
        print("learn optimal weight to combine runs with sweep: {}".format(
            FLAGS.sweep))
        # set variable to store scores and weights
        scores_and_weights = []

        # initialize kfold with FLAGS.num_folds
        kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
        for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
            print('fold n. {}'.format(fold))
            # restrict queries to train_qids and test_qids
            qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
            qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
            # obtain best combination on training queries
            train_score, best_train_weight = max(
                tf_utils.perform_reranking(run, FLAGS.qfield, qtrain, docnos,
                                           doc_embs, word_dict, word_embs,
                                           FLAGS.sweep,
                                           SCORE_NORMALIZERS[FLAGS.normalizer],
                                           FLAGS.ref_measure, evaluator))
            print('fold %d: best_train_weight=%.2f, %s =%.4f' %
                  (fold, best_train_weight, FLAGS.ref_measure, train_score))
            # compute combined run with best combination on test queries
            test_crun = tf_utils.compute_combined_run(
                run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                best_train_weight)
            # evaluate test run
            test_res = evaluator.evaluate(test_crun)
            # compute aggregated measure score for test queries
            test_score = pytrec_eval.compute_aggregated_measure(
                FLAGS.ref_measure,
                [qscore[FLAGS.ref_measure] for qscore in test_res.values()])
            # store averaged scores w/ best weights
            scores_and_weights.append(
                (np.mean([train_score, test_score]), best_train_weight))

        # get (best) weight that produces the highest averaged score
        best_score, best_weight = max(scores_and_weights)
        print('found best weight=%.2f' % (best_weight))
        # initialize combined (output) run
        crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_best_weight_' +
                                        str(best_weight))
        # compute combined run based on test weight
        comb_run = tf_utils.compute_combined_run(
            run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
            SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
        # store ranking in crun
        for qid, doc_ids_and_scores in comb_run.items():
            crun.add_ranking(qid,
                             [(score, doc_id)
                              for doc_id, score in doc_ids_and_scores.items()])
        # close and store run
        crun.close_and_write(out_path=rankings_folder + '/' +
                             FLAGS.model_name + '_best_weight_' +
                             str(best_weight) + '.txt',
                             overwrite=True)
        print('combined run stored in {}'.format(rankings_folder))
        # evaluate combined run
        print(
            'evaluate run combined w/ {}-fold cross validation and best weight={}'
            .format(FLAGS.num_folds, best_weight))
        tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                          FLAGS.model_name + '_best_weight_' +
                          str(best_weight), qrels_folder,
                          FLAGS.qrels_fname)
Example #12
def main():
    os.chdir(os.path.dirname(os.path.realpath('__file__')))
    # set folders
    corpus_folder = 'corpus/' + FLAGS.corpus_name + '/' + FLAGS.corpus_name
    index_folder = 'corpus/' + FLAGS.corpus_name + '/index'
    # model_folder = 'corpus/' + FLAGS.corpus_name + '/models/' + FLAGS.model_name
    data_folder = 'corpus/' + FLAGS.corpus_name + '/data'
    query_folder = 'corpus/' + FLAGS.corpus_name + '/queries'
    qrels_folder = 'corpus/' + FLAGS.corpus_name + '/qrels'
    rankings_folder = 'corpus/' + FLAGS.corpus_name + '/rankings/' + FLAGS.model_name

    # create folders
    if not os.path.exists(rankings_folder):
        os.makedirs(rankings_folder)
        # if not os.path.exists(model_folder):
        # os.makedirs(model_folder)
    if not os.path.exists(query_folder) or not os.path.exists(qrels_folder):
        print(
            'folders containing queries and qrels are required - please add them'
        )
        return False

    # set random seed - enable reproducibility
    np.random.seed(FLAGS.seed)
    # establish connection with UMLS db
    umls_lookup = umls.UMLSLookup()

    # load required data
    print(
        'load processed data required to retrofit word vectors and perform retrieval tasks'
    )
    with open(data_folder + '/docs.json', 'r') as df:
        corpus = json.load(df)
    with open(data_folder + '/idfs.json', 'r') as wf:
        idfs = json.load(wf)
    with open(data_folder + '/cfs.json', 'r') as cff:
        cfs = json.load(cff)
    with open(data_folder + '/word_dict.json', 'r') as wdf:
        word_dict = json.load(wdf)
    # compute reverse word dict
    reverse_word_dict = dict(zip(word_dict.values(), word_dict.keys()))
    # store docnos and docs as separate lists
    docnos = list(corpus.keys())
    docs = list(corpus.values())
    del corpus  # free memory space

    # pre process relational data
    if not os.path.exists(data_folder + '/term2cui.json'):
        # map terms to cuis using QuickUMLS
        term2cui = tf_utils.get_term2cui(word_dict,
                                         data_folder,
                                         threshold=FLAGS.threshold,
                                         stypes_fname=FLAGS.stypes_fname)
    else:
        # load (term, cui) pairs
        print('load (term, cui) pairs')
        with open(data_folder + '/term2cui.json', 'r') as tcf:
            term2cui = json.load(tcf)
    """
	SEMANTIC PROCESSING
	"""

    # load semantic model
    print('load semantic model')
    with tf.Session() as sess:
        # restore model and get required tensors
        saver = tf.train.import_meta_graph(FLAGS.semantic_model + '.ckpt.meta')
        saver.restore(sess, FLAGS.semantic_model + '.ckpt')
        word_embs = sess.run(tf.get_default_graph().get_tensor_by_name(
            'embeddings/word_embs:0'))
    """
	RETROFITTING
	"""

    if FLAGS.retrofit:
        # get synonyms for each word within vocabulary
        print('get synonyms')
        syns = tf_utils.get_syns(term2cui, word_dict, umls_lookup)
        if FLAGS.syn_weights:
            # convert collection frequencies from list to dict
            cfs = dict(cfs)
        else:
            cfs = None
        # retrofit word vectors
        print('retrofit word vectors for {} iterations'.format(
            FLAGS.iterations))
        word_embs = retrofit(word_embs,
                             syns,
                             reverse_word_dict,
                             FLAGS.iterations,
                             alpha=1.0,
                             beta=FLAGS.beta,
                             cfs=cfs)

    # compute doc embeddings
    print('compute document vectors w/ retrofitted word vectors')
    doc_embs, filt_ids = tf_utils.compute_doc_embs(docs, word_dict, word_embs,
                                                   idfs)

    if not FLAGS.reranking:
        """
		RETRIEVAL
		"""
        print('perform retrieval over the entire collection')
        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # set query embs and ids
        q_embs = []
        q_ids = []
        # loop over queries and generate rankings
        for qid, qtext in q.items():
            # prepare queries for semantic matching
            q_proj = tf_utils.prepare_query(qtext[FLAGS.qfield], word_dict,
                                            word_embs)
            if q_proj is None:
                print('query {} does not contain known terms'.format(qid))
            else:
                q_embs.append(q_proj)
                q_ids.append(qid)
        q_embs = np.array(q_embs)
        # perform search and evaluate model effectiveness
        tf_utils.semantic_search(docnos, doc_embs, q_ids, q_embs,
                                 rankings_folder, FLAGS.model_name)
        scores = tf_utils.evaluate(
            ['Rprec', 'P_5', 'P_10', 'P_20', 'ndcg', 'map'], rankings_folder,
            FLAGS.model_name, qrels_folder, FLAGS.qrels_fname)

    else:
        """
		RE-RANKING
		"""
        print('perform re-ranking over top 1000 documents from a baseline run')
        # parse and store qrels
        with open(qrels_folder + '/' + FLAGS.qrels_fname + '.txt',
                  'r') as qrelf:
            qrels = pytrec_eval.parse_qrel(qrelf)
        # initialize evaluator over qrels
        evaluator = pytrec_eval.RelevanceEvaluator(
            qrels, {'P'})  # evaluate on Precision

        # parse input run
        print('parse input run')
        with open(FLAGS.run_path, 'r') as runf:
            run = pytrec_eval.parse_run(runf)

        # load queries
        q = tf_utils.read_ohsu_queries(query_folder + '/' + FLAGS.query_fname)
        # get query ids
        qids = list(q.keys())
        # shuffle query ids
        np.random.shuffle(qids)

        if FLAGS.fixed_gamma:
            # perform re-ranking based on a fixed value of gamma
            print('perform re-ranking w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name + '_gamma_' +
                                            str(FLAGS.fixed_gamma))
            # combine rankings using fixed gamma
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.fixed_gamma)
            # store test ranking in combined run
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, docno)
                          for docno, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_gamma_' +
                                 str(FLAGS.fixed_gamma) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print('evaluate run combined w/ gamma=%.2f' % (FLAGS.fixed_gamma))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_gamma_' +
                              str(FLAGS.fixed_gamma), qrels_folder,
                              FLAGS.qrels_fname)
        else:
            # learn optimal weight to combine runs
            print("learn optimal weight to combine runs with sweep: {}".format(
                FLAGS.sweep))
            # set variable to store scores and weights
            scores_and_weights = []
            # initialize kfold with FLAGS.num_folds
            kfold = sklearn.model_selection.KFold(n_splits=FLAGS.num_folds)
            for fold, (train_qids, test_qids) in enumerate(kfold.split(qids)):
                print('fold n. {}'.format(fold))
                # restrict queries to train_qids and test_qids
                qtrain = {qids[ix]: q[qids[ix]] for ix in train_qids}
                qtest = {qids[ix]: q[qids[ix]] for ix in test_qids}
                # obtain best combination on training queries
                train_score, best_train_weight = max(
                    tf_utils.perform_reranking(
                        run, FLAGS.qfield, qtrain, docnos, doc_embs, word_dict,
                        word_embs, FLAGS.sweep,
                        SCORE_NORMALIZERS[FLAGS.normalizer], FLAGS.ref_measure,
                        evaluator))
                print(
                    'fold %d: best_train_weight=%.2f, %s =%.4f' %
                    (fold, best_train_weight, FLAGS.ref_measure, train_score))
                # compute combined run with best combination on test queries
                test_crun = tf_utils.compute_combined_run(
                    run, FLAGS.qfield, qtest, docnos, doc_embs, word_dict,
                    word_embs, SCORE_NORMALIZERS[FLAGS.normalizer],
                    best_train_weight)
                # evaluate test run
                test_res = evaluator.evaluate(test_crun)
                # compute aggregated measure score for test queries
                test_score = pytrec_eval.compute_aggregated_measure(
                    FLAGS.ref_measure, [
                        qscore[FLAGS.ref_measure]
                        for qscore in test_res.values()
                    ])
                # store averaged scores w/ best weights
                scores_and_weights.append(
                    (np.mean([train_score, test_score]), best_train_weight))

            # get (best) weight that produces the highest averaged score
            best_score, best_weight = max(scores_and_weights)
            print('found best weight=%.2f' % (best_weight))
            # initialize combined (output) run
            crun = trec_utils.OnlineTRECRun(FLAGS.model_name +
                                            '_best_weight_' +
                                            str(best_weight))
            # compute combined run based on test weight
            comb_run = tf_utils.compute_combined_run(
                run, FLAGS.qfield, q, docnos, doc_embs, word_dict, word_embs,
                SCORE_NORMALIZERS[FLAGS.normalizer], best_weight)
            # store ranking in crun
            for qid, doc_ids_and_scores in comb_run.items():
                crun.add_ranking(
                    qid, [(score, doc_id)
                          for doc_id, score in doc_ids_and_scores.items()])
            # close and store run
            crun.close_and_write(out_path=rankings_folder + '/' +
                                 FLAGS.model_name + '_best_weight_' +
                                 str(best_weight) + '.txt',
                                 overwrite=True)
            print('combined run stored in {}'.format(rankings_folder))
            # evaluate combined run
            print(
                'evaluate run combined w/ {}-fold cross validation and best weight={}'
                .format(FLAGS.num_folds, best_weight))
            tf_utils.evaluate(['map', 'P_10', 'ndcg'], rankings_folder,
                              FLAGS.model_name + '_best_weight_' +
                              str(best_weight), qrels_folder,
                              FLAGS.qrels_fname)
Example #13
def eval_ranking_pred(label_file, pred_file):
    #
    # load directory structure
    #

    labels = []

    with jsonlines.open(label_file, mode='r') as reader:
        for file in reader:
            labels.append(file)


    label_dict = {}
    for label in labels:
        label_dict.update({label.get('guid'): label.get('label')})

    print(len(labels))
    print(len(label_dict))

    with open(pred_file, 'r') as reader:
        content = reader.read().splitlines()
        predictions = [ast.literal_eval(file) for file in content]

    print(len(predictions))

    pred_dict = {}
    pos = []
    for pred in predictions:
        pred_dict.update({pred.get('guid'): max(pred.get('res')) + 100 * np.argmax(pred.get('res')) + 100})  # for binary, 1 is used here instead of 0 (for MSE-loss output)
        pos.append(min(pred.get('res')))  # for binary, 1 is used here instead of 0 (for MSE loss)

    print(min(pos))
    assert abs(min(pos)) < 100


    files = list(label_dict.keys())
    files.sort()

    qrels = {}
    for file in files:
        qrels.update({file.split('_')[0]: {}})
    for file in files:
        # print(file.split('_')[0])
        qrels.get(file.split('_')[0]).update({file.split('_')[1]: label_dict.get(file)})
        # qrels.update({file.split('_')[0]: {file.split('_')[1]: label_dict.get(file)}})
        # label_dict.get(file)

    # print(qrels.get('001'))

    run = {}
    for file in files:
        run.update({file.split('_')[0]: {}})
    for file in files:
        # print(file.split('_')[0])
        if pred_dict.get(file):
            run.get(file.split('_')[0]).update({file.split('_')[1]: pred_dict.get(file)})
        else:
            run.get(file.split('_')[0]).update({file.split('_')[1]: 0})

    # trec eval

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map', 'P_1', 'recall_1', 'P_2', 'recall_2'})

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    def write_line(measure, scope, value):
        return '{:25s}{:8s}{:.4f}'.format(measure, scope, value)

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))

    with open(pred_file.split('.txt')[0] + '_eval_200_3.txt', 'w') as output:
        for measure in sorted(query_measures.keys()):
            output.write(write_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()])) + '\n')
Example #14
def eval_ranking_bm25(label_file, bm25_folder):
    labels = []

    with jsonlines.open(label_file, mode='r') as reader:
        for file in reader:
            labels.append(file)

    label_dict = {}
    for label in labels:
        label_dict.update({label.get('guid'): label.get('label')})

    files = list(label_dict.keys())
    files.sort()

    qrels = {}
    for file in files:
        qrels.update({file.split('_')[0]: {}})
    for file in files:
        print(file.split('_')[0])
        qrels.get(file.split('_')[0]).update({file.split('_')[1]: label_dict.get(file)})
        # qrels.update({file.split('_')[0]: {file.split('_')[1]: label_dict.get(file)}})
        # label_dict.get(file)

    run = {}
    for file in files:
        run.update({file.split('_')[0]: {}})
    for key in list(run.keys()):
        with open(os.path.join(bm25_folder, 'bm25_top50_{}.txt'.format(key)),  # .xml for the CLEF-IP corpus
                  'r') as out:  # (.xml) for CLEF-IP top 50, different splitting as well!
            #text = [text.split('-')[0] + '-' + text.split('-')[1] for text in
            #        [text.split('\n')[0].strip() for text in out.readlines()]]
            text = [text.split('_')[1] for text in
                    [text.split('\n')[0].strip() for text in out.readlines()]]
            for file in files:
                if file.split('_')[1] in text:
                    run.get(key).update(
                        {text[text.index(file.split('_')[1])]: len(text) - text.index(file.split('_')[1])})
                else:
                    run.get(key).update({file.split('_')[1]: 0})

    # trec eval
    evaluator = pytrec_eval.RelevanceEvaluator(
        qrels, {'map', 'P_1', 'recall_1', 'P_2', 'recall_2'})#pytrec_eval.supported_measures

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    def write_line(measure, scope, value):
        return '{:25s}{:8s}{:.4f}'.format(measure, scope, value)

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            print_line(measure, query_id, value)

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    for measure in sorted(query_measures.keys()):
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [query_measures[measure]
                 for query_measures in results.values()]))

    with open(os.path.join(bm25_folder, 'eval_bm25_200_3.txt'), 'w') as output:
        for measure in sorted(query_measures.keys()):
            output.write(write_line(
                measure,
                'all',
                pytrec_eval.compute_aggregated_measure(
                    measure,
                    [query_measures[measure]
                     for query_measures in results.values()])) + '\n')
Example #15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--qrel',
        default=
        '/Users/woffee/www/emse-apiqa/QA2021/data/QA2021_stackoverflow4_qrel.txt'
    )
    parser.add_argument(
        '--run',
        default='/Users/woffee/www/emse-apiqa/QA2021/data/pyltr_pred.txt')

    args = parser.parse_args()

    print("args.qrel:", args.qrel)
    print("args.run", args.run)

    assert os.path.exists(args.qrel)
    assert os.path.exists(args.run)

    final_auc, final_accuracy = calc_auc(args.qrel, args.run)

    with open(args.qrel, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    with open(args.run, 'r') as f_run:
        run = pytrec_eval.parse_run(f_run)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel,
                                               pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    total = len(results.items())
    sum_map = 0.0

    for query_id, query_measures in sorted(results.items()):
        for measure, value in sorted(query_measures.items()):
            # print_line(measure, query_id, value)
            pass

    # Scope hack: use query_measures of last item in previous loop to
    # figure out all unique measure names.
    #
    # TODO(cvangysel): add member to RelevanceEvaluator
    #                  with a list of measure names.
    print("==========")
    selected_measures = [
        'map', 'recip_rank', 'P_5', 'P_10', 'P_15', 'P_20', 'recall_5',
        'recall_10', 'recall_15', 'recall_20', 'ndcg'
    ]

    eva_values = {}
    for measure in selected_measures:
        eva_values[measure] = pytrec_eval.compute_aggregated_measure(
            measure,
            [query_measures[measure] for query_measures in results.values()])
        # print_line( measure, 'all', eva_values[measure])
    for measure in selected_measures:
        print_line(measure, 'all', eva_values[measure])

    print(
        "%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f"
        % (final_auc, final_accuracy, eva_values['map'],
           eva_values['recip_rank'], eva_values['P_5'], eva_values['P_10'],
           eva_values['P_15'], eva_values['P_20'], eva_values['recall_5'],
           eva_values['recall_10'], eva_values['recall_15'],
           eva_values['recall_20'], eva_values['ndcg']))