Example #1
 def ndcg(self, hit_list, total_known_DTI):
     if total_known_DTI == 0:
         return float('nan')
     else:
         if total_known_DTI >= len(hit_list):
             ideal_list = [1 for number in range(len(hit_list))]
         else:
             ideal_list = [1 for number in range(total_known_DTI)] + [
                 0 for number in range(len(hit_list) - total_known_DTI)
             ]
         return rank.dcg_at_k(hit_list, len(hit_list), 1) / rank.dcg_at_k(
             ideal_list, len(hit_list), 1)
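
For orientation, a minimal self-contained sketch of the same ideal-list normalisation follows; the numbers are invented, and the local dcg_at_k assumes that rank.dcg_at_k's method=1 is the rel_i / log2(i + 1) variant:

import numpy as np

def dcg_at_k(r, k, method=1):
    # assumed method=1 definition: rel_i / log2(i + 1)
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return float(np.sum(r / np.log2(np.arange(2, r.size + 2))))
    return 0.

hit_list = [1, 0, 1, 0, 0]        # hypothetical top-5 hit vector
total_known_DTI = 3               # hypothetical number of known interactions
ideal_list = [1, 1, 1, 0, 0]      # ideal ordering of the same 5 slots
print(dcg_at_k(hit_list, 5) / dcg_at_k(ideal_list, 5))   # ~0.70
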
Example #2
def generate_reward(gold_index_list, answer_index_list):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = (ap + reciprocal_rank + ndcg + dcg_five) / 4
    ranks = [1, 3, 5, 10]
    reward_tuple = [reward, ap, reciprocal_rank, ndcg, dcg_five]
    for r in ranks:
        reward_tuple.append(precision_at_k(inp, min(r, len(inp))))
    for r in ranks:
        reward_tuple.append(ndcg_at_k(inp, min(r, len(inp))))
    return reward_tuple
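
The index juggling above is the non-obvious part: gold_index_list carries a relevance value per candidate index, answer_index_list lists candidate indices in ranked order, and inp ends up holding the relevance of each ranked position. A small standalone illustration with invented inputs:

import numpy as np

gold_index_list = np.array([0, 2, 0, 1])   # hypothetical relevance per candidate index
answer_index_list = [3, 0, 1, 2]           # hypothetical ranking of candidate indices
answer_list = list(answer_index_list)

inp = np.zeros(len(answer_list))
for idx, val in enumerate(gold_index_list):
    if val and idx in answer_list:
        inp[answer_list.index(idx)] = val
print(inp)   # [1. 0. 2. 0.]: candidate 3 (relevance 1) ranked first, candidate 1 (relevance 2) third
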
Example #3
def evaluate_results(qids_rs, Y, k):
    values = defaultdict(list)
    for qid, r in qids_rs:
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)
        # real ndcg
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg
        values["ndcg"].append(ndcg)
        # Verified

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR - compute by hand
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # R precision
        # R = min(R, k)  # ok, let's be fair: you can't get more than k
        # we don't need that anymore, since we chop off the remainder
        # before computing R
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        # precision = rm.precision_at_k(pad(scored_result, k), k)
        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variant does not fail if len(r) < k
        p_at_5 = rm.safe_precision_at_k(r, 5)
        values["precision@5"].append(p_at_5)

        p_at_10 = rm.safe_precision_at_k(r, 10)
        values["precision@10"].append(p_at_10)
    return values
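
Two pieces of this loop are worth seeing in isolation: the NDCG value is the observed DCG divided by the DCG of the top-k gold scores (so it assumes the query has at least one relevant gold document), and the hand-rolled MRR is just the reciprocal of the first nonzero rank. A tiny check of the MRR line with invented data:

import numpy as np

r = [0, 0, 1, 0, 1]                          # hypothetical scored result vector
ind = np.asarray(r).nonzero()[0]
mrr = (1. / (ind[0] + 1)) if ind.size else 0.
print(mrr)                                   # 0.333...; first relevant hit at rank 3
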
Example #5
def generate_reward(gold_index_list, answer_index_list, reward_type):
    reward = 0
    ap = 0.
    reciprocal_rank = 0
    answer_list = list(answer_index_list)
    size = len(answer_index_list)
    true = sum(gold_index_list > 0)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = val
    maxk = sum(inp > 0)
    if true:
        ap = average_precision(inp) * (maxk / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    ndcg = ndcg_at_k(inp, min(10, size))
    dcg_five = dcg_at_k(inp, 5)
    reward = rewards[reward_type - 1](inp, ap, reciprocal_rank, ndcg, dcg_five)
    return reward, ap, reciprocal_rank, ndcg, dcg_five
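
rewards here refers to a container defined elsewhere in the module; a hypothetical definition that matches the call signature used above (rew4 in Example #12 has the same shape) could look like:

# Hypothetical module-level reward functions; the formulas are illustrative only.
rewards = [
    lambda inp, ap, rr, ndcg, dcg_five: (ap + rr) / 2,
    lambda inp, ap, rr, ndcg, dcg_five: (ap + rr + ndcg + dcg_five) / 4,
    lambda inp, ap, rr, ndcg, dcg_five: dcg_five,
]
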
Example #6
 def summarize(self):
     """Give summary statistics about the tournament."""
     res = self.run()
     # res = self.results
     # champ should be undefeated
     champ = list(np.where(res.strength == max(res.strength))[0])
     copeland = (res.wins[champ] == self.n_rounds)
     # top-k
     ranks = pd.DataFrame(data=np.transpose([
         res.strength.rank(ascending=False),
         res.wins.rank(ascending=False), res.wins
     ]),
                          columns=["str_rank", "win_rank", "wins"])
     ranks['relevant'] = ranks['str_rank'] <= self.k
     borda = (ranks.win_rank[champ] == ranks.win_rank.min())
     top_k_df = ranks.loc[ranks['str_rank'] <= self.k]
     top_k = sum(top_k_df['wins'] >= self.n_rounds - 2) / self.k
     tau, k_p = scipy.stats.kendalltau(ranks.str_rank, ranks.win_rank)
     rho, sp_p = scipy.stats.spearmanr(ranks.str_rank, ranks.win_rank)
     # assign the result: sort_values is not in-place, and the rank-metrics
     # calls below assume rel_vec is ordered by predicted (win) rank
     ranks = ranks.sort_values(by="win_rank")
     # using rank_metrics
     rel_vec = ranks.relevant.values
     prec = rank_metrics.r_precision(rel_vec)
     prec_at_k = rank_metrics.precision_at_k(rel_vec, self.k)
     avg_prec = rank_metrics.average_precision(rel_vec)
     dcg = rank_metrics.dcg_at_k(rel_vec, self.k)
     ndcg = rank_metrics.ndcg_at_k(rel_vec, self.k)
     df = pd.DataFrame(data=[
         list([
             int(copeland),
             int(borda),
             float(top_k), prec, prec_at_k, avg_prec, dcg, ndcg,
             float(tau),
             float(rho)
         ])
     ],
                       columns=[
                           'undef_champ', 'top_champ', 'top_k_found',
                           'precision', 'precision_at_k', 'avg_prec', 'dcg',
                           'ndcg', 'tau', 'rho'
                       ])
     return df
Example #7
def generate_reward(gold_index_list, answer_index_list, reward_type=1):
    reward = 0
    ap = 0
    reciprocal_rank = 0
    answer_list = list(deepcopy(answer_index_list))
    size = len(answer_index_list)
    true = sum(gold_index_list)
    inp = np.zeros(size)
    for rank, val in enumerate(gold_index_list):
        if val and rank in answer_list:
            inp[answer_list.index(rank)] = 2
    if true:
        ap = average_precision(inp) * (sum(inp > 0) / true)
    reciprocal_rank = mean_reciprocal_rank([inp])
    #ndcg = ndcg_at_k(inp,size)
    #if reward_type==1:
    #    reward = (ap+reciprocal_rank)/2
    #elif reward_type ==2 :
    #    reward = dcg_at_k(inp,size)
    rewards = [(ap + reciprocal_rank) / 2, dcg_at_k(inp, size)]
    return rewards[reward_type - 1], ap, reciprocal_rank, (inp[0] > 0)
Example #8

import numpy as np
import rank_metrics
import sys
relevanceVector = np.loadtxt(open(sys.argv[1] + "/rv/relevanceVector_" +
                                  sys.argv[2]),
                             delimiter=" ")
f = open(sys.argv[1] + '/em/evalMetrics_' + sys.argv[2], 'w')
for k in range(1, 16):
    total_precision_k = 0
    total_dcg_k = 0
    total_ndcg_k = 0
    for row in relevanceVector:
        precision_k = rank_metrics.precision_at_k(row, k)
        dcg_k = rank_metrics.dcg_at_k(row, k, 0)
        ndcg_k = rank_metrics.ndcg_at_k(row, k, 0)
        total_precision_k = total_precision_k + precision_k
        total_dcg_k = total_dcg_k + dcg_k
        total_ndcg_k = total_ndcg_k + ndcg_k
    f.write("precision@" + str(k) + ": " + str(total_precision_k) + "\n")
    f.write("dcg@" + str(k) + ": " + str(total_dcg_k) + "\n")
    f.write("ndcg@" + str(k) + ": " + str(total_ndcg_k) + "\n")

mrr = rank_metrics.mean_reciprocal_rank(relevanceVector)
f.write("Mean Reciprocal Rank: " + str(mrr) + "\n")
maP = rank_metrics.mean_average_precision(relevanceVector)
f.write("Mean Average Precision: " + str(maP) + "\n")
f.close()
Example #9
    def test_dcg(self):

        r = [3, 2, 3, 0, 1, 2]
        self.assertEqual(dcg_at_k(r, 1, method=0), 3)
        self.assertAlmostEqual(dcg_at_k(r, 6, method=0), 6.861, places=3)
Example #10
    def test_two_dcg_for_binary_relevance(self):
        r = [1, 1, 0, 0, 1, 1, 0, 1]

        self.assertEqual(dcg_at_k(r, 5, method=0), dcg_at_k(r, 5, method=1))
        self.assertEqual(dcg_at_k(r, 4, method=0), dcg_at_k(r, 4, method=1))
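
The second test only holds if the dcg_at_k under test treats method 0 as a linear gain (rel_i / log2(i + 1)) and method 1 as an exponential gain ((2**rel_i - 1) / log2(i + 1)): for binary relevance, 2**rel - 1 equals rel, so the two coincide. Under that assumption the 6.861 expectation in test_dcg also checks out. A standalone sketch of the reasoning:

import numpy as np

def dcg_linear(r, k):
    # linear gain: rel_i / log2(i + 1)
    r = np.asarray(r, dtype=float)[:k]
    return float(np.sum(r / np.log2(np.arange(2, r.size + 2))))

def dcg_exponential(r, k):
    # exponential gain: (2**rel_i - 1) / log2(i + 1)
    r = np.asarray(r, dtype=float)[:k]
    return float(np.sum((2 ** r - 1) / np.log2(np.arange(2, r.size + 2))))

binary = [1, 1, 0, 0, 1, 1, 0, 1]
assert abs(dcg_linear(binary, 5) - dcg_exponential(binary, 5)) < 1e-12

graded = [3, 2, 3, 0, 1, 2]
print(round(dcg_linear(graded, 6), 3))   # 6.861 with the linear-gain formulation
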
Example #11
    def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
        """
        :X: [(qid, str)] query id, query string pairs
        :Y: pandas Series with a (qid, docid) index, or [dict]
        :k: Limit the result for all metrics to this value; the models are
        also given a hint of how many results they should return.
        :replacement: 0 means that (query, doc) pairs not present in Y are
        treated as non-relevant; None means such pairs are skipped entirely.
        """
        # rs = []

        # if n_jobs > 1:
        #     return process_and_evaluate(self, X, Y, k, n_jobs)
        values = defaultdict(list)
        for qid, query in X:
            # execute query
            if verbose > 0:
                print(qid, ":", query)
            t0 = timer()
            # if replacement is None, we need to drop after querying
            result = self.query(query, k=(None if replacement is None else k))
            values["time_per_query"].append(timer() - t0)
            # if verbose > 0:
            #     print(result[:k])
            # result = result[:k]  # TRIM HERE
            # soak the generator
            scored_result = [
                harvest(Y, qid, docid, replacement) for docid in result
            ]
            if replacement is None:
                scored_result, notfound = filter_none(scored_result)
                values["gold_not_found"].append(notfound)

            if k is not None:
                # dont let the models cheat by returning more than k
                r = scored_result[:k]
            else:
                # if k is None, consider all
                r = scored_result

            # if verbose > 0:
            #     print(r)

            # gold = np.array(list(Y[qid].values()))
            gold = harvest(Y, qid)
            import sys
            # print(gold, file=sys.stderr)
            topk_indices = argtopk(gold, k)
            print(topk_indices, file=sys.stderr)
            gold_topk = gold[topk_indices]
            # print('Top k in gold standard:', gold_topk, file=sys.stderr)
            R = np.count_nonzero(gold_topk)
            if verbose > 0:
                print("Retrieved {} relevant out of {} possible.".format(
                    np.count_nonzero(r), R))

            # real ndcg
            idcg = rm.dcg_at_k(gold_topk, k)
            ndcg = rm.dcg_at_k(scored_result, k) / idcg
            values["ndcg"].append(ndcg)
            # Verified

            # MAP@k
            ap = rm.average_precision(r)
            values["MAP"].append(ap)

            # MRR - compute by hand
            ind = np.asarray(r).nonzero()[0]
            mrr = (1. / (ind[0] + 1)) if ind.size else 0.
            values["MRR"].append(mrr)

            # R precision
            # R = min(R, k)  # ok, let's be fair: you can't get more than k
            # we don't need that anymore, since we chop off the remainder
            # before computing R
            recall = rm.recall(r, R)
            values["recall"].append(recall)

            # precision = rm.precision_at_k(pad(scored_result, k), k)
            precision = rm.precision(r)
            values["precision"].append(precision)

            f1 = f1_score(precision, recall)
            values["f1_score"].append(f1)

            # Safe variant does not fail if len(r) < k
            p_at_5 = rm.safe_precision_at_k(r, 5)
            values["precision@5"].append(p_at_5)

            p_at_10 = rm.safe_precision_at_k(r, 10)
            values["precision@10"].append(p_at_10)

            # rs.append(r)
            if verbose > 0:
                # print("Precision: {:.4f}".format(precision))
                # print("Recall: {:.4f}".format(recall))
                # print("F1-Score: {:.4f}".format(f1))
                print("AP: {:.4f}".format(ap))
                print("RR: {:.4f}".format(mrr))
                print("NDCG: {:.4f}".format(ndcg))

        return values
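
The method returns per-query lists keyed by metric name; a typical follow-up step (sketched here with invented numbers, not part of the original class) is to average them into one score per metric:

import pandas as pd

# Hypothetical per-query output in the same shape as the `values` dict above.
values = {
    "ndcg": [0.8, 0.5, 1.0],
    "MAP": [0.7, 0.4, 1.0],
    "MRR": [1.0, 0.5, 1.0],
}
summary = pd.DataFrame(values).mean()   # one mean score per metric
print(summary)
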
Example #12
def rew4(inp, ap, reciprocal_rank, ndcg, dcg_five):
    return (dcg_five + dcg_at_k(inp, 3) + dcg_at_k(inp, 1)) / 3