from collections import defaultdict
# Assuming `timer` refers to timeit's default_timer, as is conventional:
from timeit import default_timer as timer

import numpy as np

# `rm` (rank metrics), `harvest`, `argtopk`, `filter_none` and `f1_score`
# are assumed to be provided elsewhere in this package.


def evaluate_results(qids_rs, Y, k):
    """Compute rank metrics for pre-computed (qid, scored result) pairs."""
    values = defaultdict(list)
    for qid, r in qids_rs:
        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)

        # Real NDCG: DCG of the retrieved ranking normalized by the ideal DCG
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg if idcg > 0 else 0.  # guard idcg == 0
        values["ndcg"].append(ndcg)  # Verified

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR, computed by hand: reciprocal rank of the first relevant hit
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # R-precision: capping R at k is no longer needed, since the
        # remainder is already chopped off (gold_topk) before computing R
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variants do not fail if len(r) < k
        values["precision@5"].append(rm.safe_precision_at_k(r, 5))
        values["precision@10"].append(rm.safe_precision_at_k(r, 10))

    return values
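# For reference, a minimal sketch of the DCG@k that the NDCG computation
# above relies on, assuming linear gains and a log2 discount. This is an
# illustration of the metric only, not the actual rm.dcg_at_k
# implementation, which may differ (e.g. exponential gains, or no
# discount at rank 1).
def _dcg_at_k_sketch(r, k):
    """Illustrative DCG@k: sum of gains discounted by log2(rank + 1)."""
    r = np.asarray(r, dtype=float)[:k]
    if r.size == 0:
        return 0.
    return float(np.sum(r / np.log2(np.arange(2, r.size + 2))))

# NDCG@k then divides the DCG of the retrieved ranking by the ideal DCG:
#   _dcg_at_k_sketch([0, 2, 1], 3) / _dcg_at_k_sketch([2, 1, 0], 3)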
def evaluate(self, X, Y, k=20, verbose=0, replacement=0, n_jobs=1):
    """
    :X: [(qid, str)] query id, query string pairs.
    :Y: pandas Series with (qid, docid) index, or [dict].
    :k: Limit the result for all metrics to this value; the models are
        also given a hint of how many results they should return.
    :replacement: 0 means that (query, doc) pairs not present in Y are
        considered non-relevant; None means that those pairs are skipped.
    """
    # n_jobs > 1 would dispatch to process_and_evaluate (currently disabled).
    values = defaultdict(list)
    for qid, query in X:
        # Execute the query and record its wall-clock time.
        if verbose > 0:
            print(qid, ":", query)
        t0 = timer()
        # If replacement is None, we must query without a limit and drop
        # unjudged documents after querying.
        result = self.query(query, k=(None if replacement is None else k))
        values["time_per_query"].append(timer() - t0)

        # Soak the generator: look up the gold relevance of each document.
        scored_result = [harvest(Y, qid, docid, replacement)
                         for docid in result]
        if replacement is None:
            scored_result, notfound = filter_none(scored_result)
            values["gold_not_found"].append(notfound)

        if k is not None:
            # Don't let the models cheat by returning more than k results.
            r = scored_result[:k]
        else:
            # If k is None, consider all results.
            r = scored_result

        gold = harvest(Y, qid)
        gold_topk = gold[argtopk(gold, k)]
        R = np.count_nonzero(gold_topk)
        if verbose > 0:
            print("Retrieved {} relevant out of {} possible."
                  .format(np.count_nonzero(r), R))

        # Real NDCG: DCG of the retrieved ranking normalized by the ideal
        # DCG. (r is scored_result already truncated at k.)
        idcg = rm.dcg_at_k(gold_topk, k)
        ndcg = rm.dcg_at_k(r, k) / idcg if idcg > 0 else 0.  # guard idcg == 0
        values["ndcg"].append(ndcg)  # Verified

        # MAP@k
        ap = rm.average_precision(r)
        values["MAP"].append(ap)

        # MRR, computed by hand: reciprocal rank of the first relevant hit.
        ind = np.asarray(r).nonzero()[0]
        mrr = (1. / (ind[0] + 1)) if ind.size else 0.
        values["MRR"].append(mrr)

        # R-precision: capping R at k is no longer needed, since the
        # remainder is already chopped off (gold_topk) before computing R.
        recall = rm.recall(r, R)
        values["recall"].append(recall)

        precision = rm.precision(r)
        values["precision"].append(precision)

        f1 = f1_score(precision, recall)
        values["f1_score"].append(f1)

        # Safe variants do not fail if len(r) < k.
        values["precision@5"].append(rm.safe_precision_at_k(r, 5))
        values["precision@10"].append(rm.safe_precision_at_k(r, 10))

        if verbose > 0:
            print("AP: {:.4f}".format(ap))
            print("RR: {:.4f}".format(mrr))
            print("NDCG: {:.4f}".format(ndcg))

    return values
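# Usage sketch: `evaluate` returns one list per metric, with one entry per
# query. Callers typically reduce these to a single mean per metric, as in
# this hypothetical helper (not part of the original module):
def mean_scores(values):
    """Average the per-query metric lists returned by evaluate()."""
    return {metric: float(np.mean(scores))
            for metric, scores in values.items()}

# e.g.  mean_scores(model.evaluate(X, Y, k=20))
# ->    {'ndcg': ..., 'MAP': ..., 'MRR': ..., 'recall': ..., ...}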