Code example #1
def group_ndcg_score(truth, prediction, k=None, group_indices=None):
    if group_indices is None:
        return ndcg_score(np.expand_dims(truth, axis=0),
                          np.expand_dims(prediction, axis=0),
                          k=k)
    else:
        avg_ndcg = 0
        cnt = 0
        for sel in group_indices:
            sel_truth = truth[sel]
            sel_prediction = prediction[sel]
            if len(sel) == 1:
                continue
            else:
                try:
                    group_ndcg = ndcg_score(np.expand_dims(sel_truth, axis=0),
                                            np.expand_dims(sel_prediction,
                                                           axis=0),
                                            k=k)
                    avg_ndcg += group_ndcg
                    cnt += 1
                except Exception:
                    # Dump the offending group before re-raising the original error.
                    print(sel_truth)
                    print(sel_prediction)
                    raise
        avg_ndcg /= cnt
        return avg_ndcg
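A minimal usage sketch for the function above, assuming numpy and sklearn.metrics.ndcg_score are imported as in the original module; the data and group indices below are made up for illustration.

import numpy as np
from sklearn.metrics import ndcg_score

# Hypothetical data: five items split into two query groups.
truth = np.array([3, 2, 0, 1, 2])
prediction = np.array([0.9, 0.4, 0.1, 0.2, 0.8])
groups = [np.array([0, 1, 2]), np.array([3, 4])]

print(group_ndcg_score(truth, prediction, k=5, group_indices=groups))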
Code example #2
def main():
    movielens = fetch_movielens()

    train = movielens['train']
    test = movielens['test']
    print(train.shape)
    print(test.shape)

    model = LightFM(learning_rate=0.05, loss='bpr')
    model.fit(train, epochs=10)

    k = 10
    train_precision = precision_at_k(model, train, k=k).mean()
    test_precision = precision_at_k(model, test, k=k).mean()
    print(f'precision_at_{k}(train): {train_precision}')
    print(f'precision_at_{k}(test) : {test_precision}')

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()
    print(f'auc_score(train): {train_auc}')
    print(f'auc_score(test) : {test_auc}')

    y_train_preds = model.predict_rank(train)
    y_test_preds = model.predict_rank(test)
    train_ndcg = ndcg_score(train.toarray(), y_train_preds.toarray())
    test_ndcg = ndcg_score(test.toarray(), y_test_preds.toarray())
    print(f'ndcg_score(train): {train_ndcg}')
    print(f'ndcg_score(test) : {test_ndcg}')

    print('DONE')

    return 0
Code example #3
def show_ndcg():
    print("This {0:.4f}".format(ndcg_score(r1, r2, 4)))
    print("Best {0:.4f}".format(ndcg_score(r1, r2_best, 4)))
    print("Worst {0:.4f}".format(ndcg_score(r1, r2_worst, 4)))
    print(dcg_score(r1, r2))
    print(dcg_score(r1, r2_best))
    print(dcg_score(r1, r2_worst))
Code example #4
def test_ndcg_score():
    # Check perfect ranking
    y_true = [1, 0, 2]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    perfect = ndcg_score(y_true, y_score)
    assert_equal(perfect, 1.0)

    # Check bad ranking with a small K
    y_true = [0, 2, 1]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    short_k = ndcg_score(y_true, y_score, k=1)
    assert_equal(short_k, 0.0)

    # Check a random scoring
    y_true = [2, 1, 0]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    average_ranking = ndcg_score(y_true, y_score, k=2)
    assert_almost_equal(average_ranking, 0.63092975)
Code example #5
File: train copy.py Project: WesleyClode/MBIMC-GAE
def evaluate_metric(args, net, dataset, segment='valid', debug = False):
    # Input: the valid/test data, one purchase record per user. We sort the
    # predicted scores and measure the NDCG on the valid/test set.
    # Users could be processed in batches of 128.
    # For the test data:
    # pred scores every user against every item, so it inevitably takes a while.
    # Each predicted score corresponds to an item, and the test data contains the
    # corresponding items, so we check whether they can be recovered.
    # input: predicted values, test data
    # output: NDCG, recall
    possible_rating_values = dataset.possible_rating_values
    nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(args.device)
    if segment == "test":
        rating_matrix = dataset.test_rating_matrix
        enc_graph = dataset.test_enc_graph
        dec_graph = dataset.test_recall_dec_graph
        user_len = len(list(pd.unique(dataset.test_rating_info["user_id"])))
    elif segment == "valid":
        rating_matrix = dataset.valid_rating_matrix
        enc_graph = dataset.valid_enc_graph
        dec_graph = dataset.valid_recall_dec_graph        
        user_len = len(list(pd.unique(dataset.valid_rating_info["user_id"])))
    else:
        raise NotImplementedError   

    # Evaluate RMSE
    net.eval()
    with th.no_grad():
        pred_ratings, reg_loss, user_out, movie_out, W = net(enc_graph, dec_graph, dataset.user_feature, dataset.movie_feature)
        if args.loss_func == "CE":
            max_rating, max_indices = th.max(pred_ratings, dim=1)
            pred = nd_possible_rating_values[max_indices]
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                nd_possible_rating_values.view(1, -1)).sum(dim=1)
        elif args.loss_func == "MLP":
            real_pred_ratings = pred_ratings[:, 0]

    
    pred = real_pred_ratings.cpu().numpy()
    
    predition = np.reshape(pred, (user_len, movie_out.shape[0]))
    print("pred:",predition[0:2],predition.shape)
    #pred = real_pred_ratings.cpu().numpy()[0:movie_out.shape[0]]

    rating_matrix = rating_matrix.cpu().numpy()

    metric_ndcg = []

    ndcg_20 = ndcg_score(rating_matrix, predition, k=20)
    ndcg_40 = ndcg_score(rating_matrix, predition, k=40)
    ndcg_80 = ndcg_score(rating_matrix, predition, k=80)
    
    metric_ndcg.append(ndcg_20)
    metric_ndcg.append(ndcg_40)
    metric_ndcg.append(ndcg_80)

    if segment == "test":
        print("ndcg@20:",ndcg_20)
        print("ndcg@40:",ndcg_40)
        print("ndcg@80:",ndcg_80)

    return metric_ndcg    
Code example #6
File: util.py Project: zgsy05/sadedegel
def grid_search(relevance, docs, summarize_class, parameter_space):
    scores = {}

    with Live(console=console, screen=True, auto_refresh=False) as live:
        for i, param in enumerate(parameter_space, 1):
            summarizer = summarize_class(**param)

            score_10, score_50, score_80 = [], [], []

            for y_true, d in zip(relevance, docs):
                y_pred = [summarizer.predict(d)]

                score_10.append(
                    ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1)))
                score_50.append(
                    ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5)))
                score_80.append(
                    ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8)))

            scores[(summarizer.__class__.__name__,
                    json.dumps(param))] = np.array(score_10).mean(), np.array(
                        score_50).mean(), np.array(score_80).mean()

            live.update(topk_table(scores, i, len(parameter_space)),
                        refresh=True)

    return scores
Code example #7
    def test_perform_w_new_items(self):
        metric = self.metric

        result_w_new_items = metric.perform(split_w_new_items)

        u1_actual = [[2, 3, 1, 0, 2, 3, 0, 0]]
        u1_ideal = [[3, 3, 2, 2, 1, 0, 0, 0]]

        u1_expected_ndcg = ndcg_score(u1_ideal, u1_actual)
        u1_result_ndcg = float(
            result_w_new_items.query('from_id == "u1"')[str(metric)])

        self.assertAlmostEqual(u1_expected_ndcg, u1_result_ndcg)

        u2_actual = [[3, 0, 4, 3]]
        u2_ideal = [[4, 3, 3, 0]]

        u2_expected_ndcg = ndcg_score(u2_ideal, u2_actual)
        u2_result_ndcg = float(
            result_w_new_items.query('from_id == "u2"')[str(metric)])

        self.assertAlmostEqual(u2_expected_ndcg, u2_result_ndcg)

        sys_expected_ndcg = (u1_expected_ndcg + u2_expected_ndcg) / 2
        sys_result_ndcg = float(
            result_w_new_items.query('from_id == "sys"')[str(metric)])

        self.assertAlmostEqual(sys_expected_ndcg, sys_result_ndcg)
Code example #8
File: test_ranking.py Project: ldirer/scikit-learn
def test_ndcg_score():
    # Check perfect ranking
    y_true = [1, 0, 2]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    perfect = ndcg_score(y_true, y_score)
    assert_equal(perfect, 1.0)

    # Check bad ranking with a small K
    y_true = [0, 2, 1]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    short_k = ndcg_score(y_true, y_score, k=1)
    assert_equal(short_k, 0.0)

    # Check a random scoring
    y_true = [2, 1, 0]
    y_score = [
        [0.15, 0.55, 0.2],
        [0.7, 0.2, 0.1],
        [0.06, 0.04, 0.9]
    ]
    average_ranking = ndcg_score(y_true, y_score, k=2)
    assert_almost_equal(average_ranking, 0.63092975)
Code example #9
def evaluate(filename, k, relevance_field_name=None):
    if relevance_field_name:
        evals = list(
            load_json(
                filename, lambda x: {
                    'name': x['name'],
                    'score': x[relevance_field_name]
                }))
        predicted = [
            thing['name'] for thing in sorted(evals, key=lambda x: -x['score'])
        ]
    else:
        predicted = [
            thing['name']
            for thing in load_json(filename, lambda x: {'name': x['name']})
        ]
    random_predicted = copy.copy(predicted)
    random.shuffle(random_predicted)
    if len(seeds.intersection(predicted)) < 30:
        logging.warning(
            "Not enough seeds included in the list to be evaluated. This evaluation may not be accurate."
        )
    results = {}
    results[f'mAP@{k}'] = ap_at_k(seeds, predicted, k)
    results[f'p@{k}'] = precision(seeds, predicted, k)
    results[f'random_mAP@{k}'] = ap_at_k(seeds, random_predicted, k)
    results[f'random_p@{k}'] = precision(seeds, random_predicted, k)
    if args.ndcg:
        from sklearn.metrics import ndcg_score
        scores = np.array([thing['score'] for thing in evals])
        targets = np.array([thing['name'] in seeds for thing in evals])
        results[f'nDCG@{k}'] = ndcg_score([targets[:k]], [scores[:k]])
        random.shuffle(scores)
        results[f'random_nDCG@{k}'] = ndcg_score([targets[:k]], [scores[:k]])
    return results
Code example #10
def create_candidate_svm(embedding, term, quants, classifier, plot_svm=False, descriptions=None, quant_name=None, pgbar=None, **kwargs):
    #!! term is only used for visualization, and it must stay that way for CLUSTER_DIRECTION_ALGO = "reclassify" !
    bin_labels = np.array(quants, dtype=bool) # Ensure that regardless of quant_measure this is correct binary classification labels
    # (tmp := len(quants)/(2*np.bincount(bin_labels)))[0]/tmp[1] is roughly equal to bin_labels.mean() so balancing is good
    if classifier == "SVM":
        svm = sklearn.svm.LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000)
    elif classifier == "SVM_square":
        svm = sklearn.svm.LinearSVC(dual=False, class_weight="balanced") #squared-hinge instead of hinge (but fastest!)
    elif classifier == "SVM2":
        warnings.warn("Using an SVM Implementation that's slower for this kind of data!")
        svm = sklearn.svm.SVC(kernel="linear", class_weight="balanced", decision_function_shape="ovo")  #slower than LinearSVC, don't use!
        # see https://stackoverflow.com/q/33843981/5122790, https://stackoverflow.com/q/35076586/5122790
    else:
        raise NotImplementedError(f"Demanded classifier {classifier} not implemented!")
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        svm.fit(embedding, bin_labels)
        if w: assert issubclass(w[0].category, (sklearn.exceptions.ConvergenceWarning, DeprecationWarning))
        no_converge = (bool(w) and issubclass(w[0].category, sklearn.exceptions.ConvergenceWarning))
    tn, fp, fn, tp = confusion_matrix(bin_labels, svm.predict(embedding)).ravel()
    res = {"accuracy": (tp + tn) / len(quants), "precision": tp / (tp + fp), "recall": tp / (tp + fn), "did_converge": not no_converge}
    res["f_one"] = 2 * (res["precision"] * res["recall"]) / (res["precision"] + res["recall"])
    #now, in [DESC15:4.2.1], they compare the "ranking induced by \vec{v_t} with the number of times the term occurs in the entity's documents" with Cohen's Kappa.

    #see notebooks/proof_of_concept/get_svm_decisionboundary.ipynb#Checking-projection-methods-&-distance-measures-from-point-to-projection for the ranking
    decision_plane = NDPlane(svm.coef_[0], svm.intercept_[0])  #don't even need the plane class here
    dist = lambda x, plane: np.dot(plane.normal, x) + plane.intercept
    distances = [dist(point, decision_plane) for point in embedding]
    assert np.allclose(distances, svm.decision_function(embedding)) #see https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC.decision_function, https://stats.stackexchange.com/a/14881
    distances /= np.linalg.norm(svm.coef_[0]) #TODO: add the links and this normalification to the distances-notebook
    #sanity check: do most of the points with label=0 have the same sign `np.count_nonzero(np.sign(np.array(distances)[bin_labels])+1)
    # bin_labels, np.array((np.sign(np.array(distances))+1)/2, dtype=bool)
    # quant_ranking = np.zeros(quants.shape); quant_ranking[np.where(quants > 0)] = np.argsort(quants[quants > 0])
    #TODO cohen's kappa has a sample_weight parameter!! DESC15 write they select Kappa "due to its tolerance to class imbalance." -> Does that mean I have to set the weight?!
    kappa_weights = get_setting("KAPPA_WEIGHTS") if get_setting("KAPPA_WEIGHTS") != "None" else None
    res["kappa_rank2rank_dense"]  = cohen_kappa(rankdata(quants, method="dense"), rankdata(distances, method="dense"), weights=kappa_weights) #if there are 14.900 zeros, the next is a 1
    res["kappa_rank2rank_min"] = cohen_kappa(rankdata(quants, method="min"), rankdata(distances, method="dense"), weights=kappa_weights) #if there are 14.900 zeros, the next one is a 14.901
    res["kappa_bin2bin"]    = cohen_kappa(bin_labels, [i > 0 for i in distances], weights=kappa_weights)
    res["kappa_digitized"]  = cohen_kappa(np.digitize(quants, np.histogram_bin_edges(quants)[1:]), np.digitize(distances, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
    res["ndcg_all"] = ndcg_score(np.array([quants]), np.expand_dims(distances,0))
    res["ndcg_onlypos"] = ndcg_score(np.array([quants]), np.expand_dims(distances, 0), k=np.count_nonzero(np.array(quants)))
    nonzero_indices = np.where(np.array(quants) > 0)[0]
    q2, d2 = np.array(quants)[nonzero_indices], np.array(distances)[nonzero_indices]
    with nullcontext(): #warnings.catch_warnings(): #TODO get rid of what causes the nans here!!!
        # warnings.filterwarnings('ignore', r'invalid value encountered in true_divide')
        if quant_name == "count":  # in DESC15 they write "measure the correlation between the ranking induced by \vec{vt} and the number of times t appears in the documents associated with each entity", so maybe compare ranking to count?!
            # res["kappa_count2rank"] = cohen_kappa(quants, rankdata(distances, method="dense"), weights=kappa_weights)
            res["kappa_count2rank_onlypos"] = cohen_kappa(q2, rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_dense"] = cohen_kappa(rankdata(q2, method="dense"), rankdata(d2, method="dense"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_min"] = cohen_kappa(rankdata(q2, method="min"), rankdata(d2, method="min"), weights=kappa_weights)
        res["kappa_rank2rank_onlypos_max"] = cohen_kappa(rankdata(q2, method="max"), rankdata(d2, method="max"), weights=kappa_weights)
        # res["kappa_digitized_onlypos_1"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(quants)[1:]), np.digitize(d2, np.histogram_bin_edges(distances)[1:]), weights=kappa_weights)
        #one ^ has as histogram-bins what it would be for ALL data, two only for the nonzero-ones
        res["kappa_digitized_onlypos_2"] = cohen_kappa(np.digitize(q2, np.histogram_bin_edges(q2)[1:]), np.digitize(d2, np.histogram_bin_edges(d2)[1:]), weights=kappa_weights)
    if plot_svm and descriptions is not None:
        display_svm(embedding, np.array(bin_labels, dtype=int), svm, term=term, descriptions=descriptions, name=term+" "+(", ".join(f"{k}: {round(v, 3)}" for k, v in res.items())), quants=quants, distances=distances, **kwargs)
    if pgbar is not None:
        pgbar.update(1)
    return res, decision_plane, term
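The "ndcg_onlypos" line above truncates the ranking at the number of nonzero ground-truth counts. A small standalone sketch of that idea, with made-up quants and distances, might look like this:

import numpy as np
from sklearn.metrics import ndcg_score

quants = np.array([0, 3, 0, 1, 2, 0])                   # term counts per entity (mostly zero)
distances = np.array([-0.2, 0.9, 0.1, 0.3, 0.7, -0.5])  # signed distances to the decision plane
k = np.count_nonzero(quants)                            # only score as many positions as there are positives
print(ndcg_score(quants[None, :], distances[None, :], k=k))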
Code example #11
File: __main__.py Project: bhbduman/sadedegel
def evaluate(table_format, tag, debug):
    """Evaluate all summarizers in sadedeGel"""
    if not debug:
        warnings.filterwarnings("ignore")

    anno = load_annotated_corpus(False)
    relevance = [[doc['relevance']] for doc in anno]

    summarizers = [
        summ for summ in SUMMARIZERS if any(_tag in summ[1] for _tag in tag)
    ]

    scores = defaultdict(list)

    for word_tokenizer in ['simple', 'bert']:
        click.echo("Word Tokenizer: " +
                   click.style(f"{word_tokenizer}", fg="blue"))
        docs = [Doc.from_sentences(doc['sentences'])
                for doc in anno]  # Reset document because of memoization
        with tokenizer_context(word_tokenizer):
            for name, summarizer in summarizers:
                click.echo(click.style(f"    {name} ", fg="magenta"), nl=False)
                # skip simple tokenizer for clustering models
                if ("cluster" in summarizer or "rank" in summarizer or name == "TFIDF Summarizer") and \
                        word_tokenizer == "simple":
                    click.echo(click.style("SKIP", fg="yellow"))
                    continue

                for i, (y_true, d) in enumerate(zip(relevance, docs)):
                    dot_progress(i, len(relevance))

                    y_pred = [summarizer.predict(d.sents)]

                    score_10 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.1))
                    score_50 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.5))
                    score_80 = ndcg_score(y_true, y_pred, k=ceil(len(d) * 0.8))

                    scores[f"{name} - {word_tokenizer}"].append(
                        (score_10, score_50, score_80))

    table = [[
        algo,
        np.array([s[0] for s in scores]).mean(),
        np.array([s[1] for s in scores]).mean(),
        np.array([s[2] for s in scores]).mean()
    ] for algo, scores in scores.items()]

    # TODO: Sample weight of instances.
    print(
        tabulate(table,
                 headers=[
                     'Method & Tokenizer', 'ndcg(k=0.1)', 'ndcg(k=0.5)',
                     'ndcg(k=0.8)'
                 ],
                 tablefmt=table_format,
                 floatfmt=".4f"))

    if debug:
        click.echo(np.array(table).shape)
Code example #12
File: eval_test.py Project: AnnemijnD/datamining
def eval():

    true_relevance = np.asarray([[10, 0, 0, 1, 5]])
    scores = np.asarray([[1, 0, 0, 0, 1]])
    score = ndcg_score(true_relevance, scores, k=5)

    return score
Code example #13
File: train.py Project: WesleyClode/MBIMC-GAE
def evaluate_metric(args, net, dataset, segment='valid', debug=False):
    possible_rating_values = dataset.possible_rating_values
    nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(
        args.device)
    if segment == "test":
        rating_matrix = dataset.test_rating_matrix
        enc_graph = dataset.test_enc_graph
        dec_graph = dataset.test_recall_dec_graph
        user_len = len(list(pd.unique(dataset.test_rating_info["user_id"])))
    elif segment == "valid":
        rating_matrix = dataset.valid_rating_matrix
        enc_graph = dataset.valid_enc_graph
        dec_graph = dataset.valid_recall_dec_graph
        user_len = len(list(pd.unique(dataset.valid_rating_info["user_id"])))
    else:
        raise NotImplementedError

    # Evaluate RMSE
    net.eval()
    with th.no_grad():
        pred_ratings, reg_loss, user_out, movie_out, W = net(
            enc_graph, dec_graph, dataset.user_feature, dataset.movie_feature)
        if args.loss_func == "CE":
            max_rating, max_indices = th.max(pred_ratings, dim=1)
            pred = nd_possible_rating_values[max_indices]
            real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                                 nd_possible_rating_values.view(1, -1)).sum(
                                     dim=1)
        elif args.loss_func == "MLP":
            real_pred_ratings = pred_ratings[:, 0]

    pred = real_pred_ratings.cpu().numpy()
    predition = np.reshape(pred, (user_len, movie_out.shape[0]))
    #pred = real_pred_ratings.cpu().numpy()[0:movie_out.shape[0]]

    rating_matrix = rating_matrix.cpu().numpy()

    metric_ndcg = []

    ndcg_20 = ndcg_score(rating_matrix, predition, k=20)
    ndcg_40 = ndcg_score(rating_matrix, predition, k=40)
    ndcg_80 = ndcg_score(rating_matrix, predition, k=80)

    metric_ndcg.append(ndcg_20)
    metric_ndcg.append(ndcg_40)
    metric_ndcg.append(ndcg_80)

    if segment == "test":
        print("NDCG test")
        print("ndcg@20:", ndcg_20)
        print("ndcg@40:", ndcg_40)
        print("ndcg@80:", ndcg_80)

    return metric_ndcg
Code example #14
def compute_ndcg_scores(groundTruthRanks, integerValuedQueryRanks):
    ndcg_scores_5 = []
    ndcg_scores_10 = []
    for x, y in zip(groundTruthRanks, integerValuedQueryRanks):
        # Skip single-item lists: NDCG is not meaningful for one document.
        if len(x) > 1:
            true_relevance = np.asarray([x])
            relevance_score = np.asarray([y])
            ndcg_scores_5.append(ndcg_score(true_relevance, relevance_score, k=5))
            ndcg_scores_10.append(ndcg_score(true_relevance, relevance_score, k=10))
    return (sum(ndcg_scores_5) / len(ndcg_scores_5),
            sum(ndcg_scores_10) / len(ndcg_scores_10))
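A hypothetical call, assuming np and ndcg_score are imported at module level as in the snippet's original file:

truth = [[3, 2, 1, 0], [2, 1, 0, 0]]   # graded ground-truth ranks per query
preds = [[4, 3, 2, 1], [1, 3, 2, 4]]   # integer-valued predicted ranks per query

avg_ndcg_5, avg_ndcg_10 = compute_ndcg_scores(truth, preds)
print(avg_ndcg_5, avg_ndcg_10)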
Code example #15
def calc_rank_scores_at_k(y_true, y_pred, top_k_pctiles=[10,20]):
    # Given a list of scalar true and predicted scores, calculate F1 metrics for top K percentile elements.
    # Calculates precision@K, supports@K, ndcg@K
    
    precisions = []
    ndcg_scores = []
    supports = []
    
    for top_pctile in top_k_pctiles:
        pctile = 100 - top_pctile                   # top 10%-tile means 90%-tile CDF
        thr_true  = np.percentile(y_true, pctile)   # find threshold for true labels
        thr_pred  = np.percentile(y_pred, pctile)   # find threshold for predicted labels
        labels_true = y_true >= thr_true            # label +ve class in true_scores
        labels_pred = y_pred >= thr_pred            # label +ve class in predicted scores
        
        f1_metrics = f1_help(labels_true,           # calculate f1 for topK viral
                             labels_pred,           # binary classfiication
                             average='binary',
                             pos_label=1)
        
        num_top_rank = sum(labels_true)
        ndcg = ndcg_score(y_true.reshape((1,-1)),   # calculate ndcg score at K
                          y_pred.reshape((1,-1)),   # the rank scores must be axis 1
                          k=num_top_rank)           # each 'query' is axis 0
        
        precisions.append(f1_metrics[0])
        supports.append(sum(labels_true))
        ndcg_scores.append(ndcg)
    
    return precisions, supports, ndcg_scores
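The NDCG part of the helper above uses K equal to the number of items whose true score falls in the top percentile. A standalone sketch of just that step, with synthetic scores, could be:

import numpy as np
from sklearn.metrics import ndcg_score

rng = np.random.default_rng(0)
y_true = rng.random(50)
y_pred = y_true + rng.normal(scale=0.3, size=50)   # noisy predictions of the true score

thr_true = np.percentile(y_true, 90)               # top-10% threshold on true scores
k = int((y_true >= thr_true).sum())                # number of "positive" items
print(ndcg_score(y_true.reshape(1, -1), y_pred.reshape(1, -1), k=k))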
Code example #16
def evaluate_ground_truth(args):
    """评估ground truth训练集的质量。"""
    author_rank_val = load_author_rank('val')
    author_rank_train = load_author_rank('train_original')
    fields = list(set(author_rank_val) & set(author_rank_train))
    author_rank_val = {k: v for k, v in author_rank_val.items() if k in fields}
    author_rank_train = {
        k: v
        for k, v in author_rank_train.items() if k in fields
    }

    num_authors = OAGCSDataset()[0].num_nodes('author')
    true_relevance = np.zeros((len(fields), num_authors), dtype=np.int32)
    scores = np.zeros_like(true_relevance)
    for i, f in enumerate(fields):
        for r, a in enumerate(author_rank_val[f]):
            if a != -1:
                true_relevance[i, a] = math.ceil((100 - r) / 10)
        author_rank_val[f] = [a for a in author_rank_val[f] if a != -1]
        for r, a in enumerate(author_rank_train[f]):
            scores[i, a] = len(author_rank_train[f]) - r

    for k in (100, 50, 20, 10, 5):
        print(
            'nDCG@{0}={1:.4f}\tPrecision@{0}={2:.4f}\tRecall@{0}={3:.4f}'.
            format(
                k, ndcg_score(true_relevance, scores, k=k, ignore_ties=True),
                sum(
                    precision_at_k(author_rank_val[f], author_rank_train[f], k)
                    for f in fields) / len(fields),
                sum(
                    recall_at_k(author_rank_val[f], author_rank_train[f], k)
                    for f in fields) / len(fields)))
Code example #17
def compute_ndcg_between_vislists(l1: lux.vis.VisList, l2: lux.vis.VisList,
                                  k: int) -> float:
    if len(l1) == len(l2) == 1:
        return 1
    l1_scores = [vis.score for vis in l1]
    map1 = convert_vlist_to_hashmap(l1)

    l2_scores = [vis.score for vis in l2]
    map2 = convert_vlist_to_hashmap(l2)

    # Combine two dictionaries map1,map2 into a single global_map
    global_map = set(map1.keys())
    global_map.update(set(map2.keys()))
    global_map = list(global_map)

    # Somehow our own NDCG calculation always leads to > 1
    # aligned_score1 = list(get_aligned_dict(map1,global_map).values())
    # aligned_score2 = list(get_aligned_dict(map2,global_map).values())
    # return ndcg(aligned_score1,aligned_score2,5)

    # from scipy.stats import stats
    # rank1 = stats.rankdata(aligned_score1)
    # rank2 =stats.rankdata(aligned_score2)
    # return ndcg(rank1,rank2,3)
    aligned_score1 = np.asarray(
        [list(get_aligned_dict(map1, global_map).values())])
    aligned_score2 = np.asarray(
        [list(get_aligned_dict(map2, global_map).values())])
    from sklearn.metrics import ndcg_score

    return ndcg_score(aligned_score1, aligned_score2, k=k)
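A rough sketch of the alignment idea, assuming get_aligned_dict fills missing keys with a score of 0; the score maps below are hypothetical stand-ins for the hashed VisLists:

import numpy as np
from sklearn.metrics import ndcg_score

map1 = {"vis_a": 0.9, "vis_b": 0.5}
map2 = {"vis_b": 0.7, "vis_c": 0.4}

global_keys = sorted(set(map1) | set(map2))
aligned_score1 = np.asarray([[map1.get(key, 0.0) for key in global_keys]])
aligned_score2 = np.asarray([[map2.get(key, 0.0) for key in global_keys]])
print(ndcg_score(aligned_score1, aligned_score2, k=2))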
Code example #18
File: metrics.py Project: Stashq/PiSR1
def get_ndcg_score(model: RecommenderSystem,
                   test_ratings: pd.DataFrame) -> List[float]:

    test_users = set(test_ratings['userId'].values)

    ndcg_scores = []

    for user_id in tqdm(test_users, desc='Testing predictions'):
        pred_movies, pred_scores = model.predict_scores(user_id)

        pred_movies = {
            movie_id: score
            for movie_id, score in zip(pred_movies, pred_scores)
        }

        test_user_ratings = test_ratings.loc[test_ratings['userId'] == user_id]
        test_user_ratings = test_user_ratings.sort_values(by='rating',
                                                          ascending=False)
        test_user_movies = test_user_ratings['movieId'].values

        pred_movies_scores = []

        for movie_id in test_user_ratings['movieId'].values:
            pred_score = pred_movies[movie_id]
            pred_movies_scores.append(pred_score)

        ndcg = ndcg_score([test_user_movies], [pred_movies_scores])
        ndcg_scores.append(ndcg)

    return ndcg_scores
Code example #19
File: utils.py Project: ostwalt/GraphHINGE
def evaluate_ndcg(k, pred_list, label_list, batch_size, list_length):
    preds = np.array_split(pred_list.flatten(),
                           pred_list.shape[0] / list_length)
    labels = np.array_split(label_list.flatten(),
                            pred_list.shape[0] / list_length)
    NDCG = ndcg_score(y_true=labels, y_score=preds, k=k)
    '''
    ndcg=[]
    for pred,label in zip(preds,labels):
        
        idx = np.argsort(-pred)
        accumulation = 0.0
        normalization = 0.0 
        sorted_label = label[np.argsort(-label)]
        for i in range(0,k):
            accumulation += float(label[idx[i]])/ np.log2(i+2.0)
            normalization  += float(sorted_label[i])/ np.log2(i+2.0)
        if normalization == 0:
            ndcg.append(0)
        else:
            ndcg.append(accumulation/normalization)
        
    NDCG=np.mean(ndcg)
    '''
    return NDCG
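A hypothetical call with two lists of length 4 packed into flat arrays, assuming numpy and ndcg_score are imported as in the original utils.py; note that batch_size is accepted but not used by the function above.

pred_list = np.array([0.9, 0.1, 0.4, 0.3, 0.2, 0.8, 0.7, 0.5])
label_list = np.array([1, 0, 0, 0, 0, 1, 1, 0])

print(evaluate_ndcg(k=2, pred_list=pred_list, label_list=label_list,
                    batch_size=2, list_length=4))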
Code example #20
def main():
    # Evaluate model performance
    # Get the "ideal" order of y_test by sorting in descending order.

    args = parse.get_test_args()

    X_train, X_test, X_val, y_train, y_test, y_val, group_vali, group_train = get_data(
        args["data_path"])

    gbm = lgb.Booster(model_file=args["model_path"])

    true_relevance = y_test.sort_values(ascending=False)

    # Get the actual order of y_test by sorting it according to our model's predictions.

    test_pred = gbm.predict(X_test)
    y_test = pd.DataFrame({
        "relevance_score": y_test,
        "predicted_ranking": test_pred
    })

    relevance_score = y_test.sort_values("predicted_ranking", ascending=False)

    # Use computed variables to calculate the nDCG score
    print(
        "nDCG score: ",
        ndcg_score([true_relevance.to_numpy()],
                   [relevance_score["relevance_score"].to_numpy()]),
    )
Code example #21
def test_restriction_local(U: int, I: int) -> None:
    try:
        from sklearn.metrics import ndcg_score
    except ImportError:
        pytest.skip()
    rns = np.random.RandomState(42)
    recommendables: List[np.ndarray] = []
    for _ in range(U):
        recommendables.append(
            rns.choice(np.arange(I), replace=False, size=rns.randint(2, I))
        )
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(
        sps.csr_matrix(X_gt),
        offset=0,
        cutoff=I,
        n_threads=1,
        per_user_recommendable_items=recommendables,
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    sklearn_metrics = defaultdict(list)
    for i in range(scores.shape[0]):
        if X_gt[i, recommendables[i]].sum() == 0:
            continue
        ndcg = ndcg_score(
            X_gt[i, recommendables[i]][None, :], scores[i, recommendables[i]][None, :]
        )
        sklearn_metrics["ndcg"].append(ndcg)

    assert my_score["ndcg"] == pytest.approx(np.mean(sklearn_metrics["ndcg"]), abs=1e-8)
Code example #22
def get_ndcg(surprise_predictions, k_highest_scores=None):
    """ 
    Calculates the ndcg (normalized discounted cumulative gain) from surprise predictions, using sklearn.metrics.ndcg_score and scipy.sparse
  
    Parameters: 
    surprise_predictions (List of surprise.prediction_algorithms.predictions.Prediction): list of predictions
    k_highest_scores (positive integer): Only consider the highest k scores in the ranking. If None, use all. 
  
    Returns: 
    float in [0., 1.]: The averaged NDCG scores over all recommendations
  
    """

    uids = [int(p.uid) for p in surprise_predictions]
    iids = [int(p.iid) for p in surprise_predictions]
    r_uis = [p.r_ui for p in surprise_predictions]
    ests = [p.est for p in surprise_predictions]

    assert (len(uids) == len(iids) == len(r_uis) == len(ests))

    sparse_preds = sparse.coo_matrix((ests, (uids, iids)))
    sparse_vals = sparse.coo_matrix((r_uis, (uids, iids)))

    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()

    return ndcg_score(y_true=dense_vals,
                      y_score=dense_preds,
                      k=k_highest_scores)
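A minimal sketch using a stand-in namedtuple in place of surprise's Prediction objects (it only needs the uid, iid, r_ui and est attributes that get_ndcg reads); scipy.sparse and ndcg_score are assumed to be imported at module level as in the original file.

from collections import namedtuple

Prediction = namedtuple("Prediction", ["uid", "iid", "r_ui", "est"])
preds = [
    Prediction(uid=0, iid=0, r_ui=4.0, est=3.8),
    Prediction(uid=0, iid=1, r_ui=2.0, est=2.5),
    Prediction(uid=1, iid=0, r_ui=5.0, est=4.1),
    Prediction(uid=1, iid=1, r_ui=1.0, est=3.9),
]
print(get_ndcg(preds, k_highest_scores=2))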
Code example #23
def test_restriction_global(U: int, I: int, R: int) -> None:
    rns = np.random.RandomState(42)
    recommendable = rns.choice(np.arange(I), replace=False, size=R)
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(
        sps.csr_matrix(X_gt),
        offset=0,
        cutoff=I,
        n_threads=1,
        recommendable_items=recommendable,
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    sklearn_metrics = defaultdict(list)
    for i in range(scores.shape[0]):
        if X_gt[i, recommendable].sum() == 0:
            continue
        ndcg = ndcg_score(X_gt[i, recommendable][None, :],
                          scores[i, recommendable][None, :])
        sklearn_metrics["ndcg"].append(ndcg)

    assert my_score["ndcg"] == pytest.approx(np.mean(sklearn_metrics["ndcg"]),
                                             abs=1e-8)
Code example #24
def ndcg(
    target_index: int,
    scraps: np.ndarray,
    interactions: np.ndarray,
    benchmark: Benchmark,
    exp_base: int = 1,
    k: int = None,
) -> List[float]:

    _assert_scrap_size(scraps, benchmark)

    if k:
        scraps = scraps[:, :k]

    assert isinstance(exp_base, int)
    assert scraps.shape == interactions.shape

    exp_base = float(exp_base)
    levels = benchmark.dsm.levels[target_index]
    relevance = levels[scraps].astype(float)
    relevance[relevance <= 0] = np.inf
    relevance -= 1
    relevance = np.power(exp_base, -relevance)

    scores = []
    for interaction, ranking in zip(interactions, relevance):
        score = ndcg_score([interaction], [ranking])
        scores.append(score)

    return scores
Code example #25
    def mean_ndcg(self, res, qrels):
        from sklearn.metrics import ndcg_score
        ndcgs = []
        joined = res.merge(qrels, how='left', on=['qid', 'docno'])
        for qid, qid_group in joined.fillna(0).groupby('qid'):
            ndcgs.append(ndcg_score([qid_group["label"].values],
                                    [qid_group["score"].values]))
        return sum(ndcgs) / len(ndcgs)
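A hypothetical run/qrels pair in the shape the method expects (one row per qid/docno), assuming evaluator is an instance of the class this method belongs to:

import pandas as pd

res = pd.DataFrame({"qid": [1, 1, 2, 2],
                    "docno": ["d1", "d2", "d1", "d2"],
                    "score": [0.9, 0.1, 0.3, 0.7]})
qrels = pd.DataFrame({"qid": [1, 1, 2],
                      "docno": ["d1", "d2", "d2"],
                      "label": [1, 0, 1]})

print(evaluator.mean_ndcg(res, qrels))   # `evaluator` is hypothetical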
Code example #26
def evaluate_model(model, valid_data_loader, k, device):
    scores = []
    for inputs, targets in valid_data_loader:
        out = model(inputs.to(device))
        scores.append(ndcg_score(targets.numpy(), out[0].T.cpu().numpy(), k=k))

    return np.mean(scores)
Code example #27
File: stats.py Project: darcyabjones/selectml
    def __call__(
        self,
        y: "npt.ArrayLike",
        preds: "npt.ArrayLike",
    ) -> "StatsOutType":
        from ..sk.metrics import (
            spearmans_correlation,
            pearsons_correlation,
            tau_correlation,
        )
        from sklearn.metrics import ndcg_score

        y_ = clip(np.asarray(y))
        preds_ = clip(np.asarray(preds))

        results: "StatsOutType" = {
            "pearsons": pearsons_correlation(self.as_1d(y_),
                                             self.as_1d(preds_)),
            "spearmans": spearmans_correlation(self.as_1d(y_),
                                               self.as_1d(preds_)),
            "tau": tau_correlation(self.as_1d(y_), self.as_1d(preds_)),
            "ndcg": ndcg_score(self.as_2d(y_).T,
                               self.as_2d(preds_).T),
        }

        return results
Code example #28
def ndcg_at_k(expected_order, actual_scores, k):
    """
    NDCG score provided by the sklearn ndcg_score method:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html

    The expected order is the array of places (integer number >0): lower values
    mean more important items. The actual score is the score computed for each
    element. Higher score means more important item.

    :param expected_order: real ranking from ground-truth
    :param actual_scores: scores to test
    :param k: number of places in the rank to take into account
    :return: float NDCG value between 0 and 1
    """
    if expected_order.shape != actual_scores.shape:
        raise Exception("Shapes must match")
    if len(expected_order.shape) != 1:
        raise Exception("Not tested on higher dimensions")

    expected_order = np.expand_dims(expected_order, axis=0)
    actual_scores = np.expand_dims(actual_scores, axis=0)

    ndcg_true_rel = (np.max(expected_order) - expected_order)
    ndcg_real_scores = actual_scores
    return ndcg_score(ndcg_true_rel, ndcg_real_scores, k=k)
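A hypothetical call: places are integers with lower meaning more relevant, so the conversion above turns them into non-negative gains before calling sklearn.

import numpy as np

expected_order = np.array([2, 1, 3])       # places: the second item is the most relevant
actual_scores = np.array([0.4, 0.9, 0.1])  # higher score = more important

print(ndcg_at_k(expected_order, actual_scores, k=3))  # 1.0: the scores reproduce the expected order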
Code example #29
    def ndcg(gt, preds):
        print('.ndcg')
        gt = torch.from_numpy(gt)
        preds = torch.from_numpy(preds)
        K = [5, 10, 20]
        return [ndcg_score(gt, preds, k=k)
                for k in K]  # check exactly how this is written; revise this concrete implementation
Code example #30
def test_metrics_with_cutoff(U: int, I: int, C: int) -> None:
    rns = np.random.RandomState(42)
    scores = rns.randn(U, I)
    X_gt = (rns.rand(U, I) >= 0.3).astype(np.float64)
    eval = Evaluator(sps.csr_matrix(X_gt), offset=0, cutoff=C, n_threads=2)
    eval_finer_chunk = Evaluator(
        sps.csr_matrix(X_gt), offset=0, cutoff=C, n_threads=2, mb_size=1
    )
    # empty mask
    mock_rec = MockRecommender(sps.csr_matrix(X_gt.shape), scores)
    my_score = eval.get_score(mock_rec)
    my_score_finer = eval_finer_chunk.get_score(mock_rec)
    for key in my_score:
        assert my_score_finer[key] == pytest.approx(my_score[key])

    ndcg = 0.0
    valid_users = 0
    map = 0.0
    precision = 0.0
    recall = 0.0
    item_appearance_count = np.zeros((I,), dtype=np.float64)
    for i in range(U):
        nzs = set(X_gt[i].nonzero()[0])
        if len(nzs) == 0:
            continue
        valid_users += 1
        ndcg += ndcg_score(X_gt[[i]], scores[[i]], k=C)
        recommended = scores[i].argsort()[::-1][:C]
        recall_denom = min(C, len(nzs))
        ap = 0.0
        current_hit = 0
        for i, rec in enumerate(recommended):
            item_appearance_count[rec] += 1.0
            if rec in nzs:
                current_hit += 1
                ap += current_hit / float(i + 1)
        ap /= recall_denom
        map += ap
        recall += current_hit / recall_denom
        precision += current_hit / C
    entropy = (lambda p: -p.dot(np.log(p)))(
        item_appearance_count / item_appearance_count.sum()
    )
    item_appearance_sorted_normalized = (
        np.sort(item_appearance_count) / item_appearance_count.sum()
    )
    lorentz_curve = np.cumsum(item_appearance_sorted_normalized)

    gini_index = 0
    delta = 1 / I
    for i in range(I):
        f = 2 * (((i + 1) / I) - lorentz_curve[i])
        gini_index += delta * f

    assert my_score["ndcg"] == pytest.approx(ndcg / valid_users)
    assert my_score["map"] == pytest.approx(map / valid_users, abs=1e-8)
    assert my_score["precision"] == pytest.approx(precision / valid_users, abs=1e-8)
    assert my_score["recall"] == pytest.approx(recall / valid_users, abs=1e-8)
    assert my_score["entropy"] == pytest.approx(entropy)
    assert my_score["gini_index"] == pytest.approx(gini_index)
Code example #31
def calculate_ndcg_k(b_hat, c_ui_test, k):
    """
    Calculate NDCG of predicted relevance scores.

    Args:
        b_hat (np.array): Array filled with predicted relevance scores for user-item combinations.
        c_ui_test (sp.csr_matrix): Sparse user-item matrix with test interactions.
        k (int): Length of recommended lists.

    Returns:
        float: NDCG score averaged over all users.

    """

    n_users, n_items = c_ui_test.shape
    ndcgs = np.zeros(n_users)
    for u in range(n_users):
        # Take predicted relevance scores for this user.
        scores = b_hat[u, :]
        # True relevance is whether or not user bought the item in the test period.
        true_relevance = np.array(sp.csr_matrix.todense(c_ui_test[u, :])).flatten()
        true_relevance_binary = (true_relevance > 0).astype(float)
        # Reshape to use sklearn function.
        scores = np.reshape(scores, (1, len(scores)))
        true_relevance_binary = np.reshape(
            true_relevance_binary, (1, len(true_relevance_binary))
        )
        ndcg_u = ndcg_score(true_relevance_binary, scores, k=k)  # k must be passed by keyword in recent scikit-learn
        ndcgs[u] = ndcg_u
    ndcg = np.mean(ndcgs)
    return ndcg
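A small deterministic sketch (two users, three items), assuming numpy as np, scipy.sparse as sp and sklearn's ndcg_score are imported as in the original module:

import numpy as np
import scipy.sparse as sp

b_hat = np.array([[0.9, 0.2, 0.5],
                  [0.1, 0.8, 0.3]])                             # predicted relevance scores
c_ui_test = sp.csr_matrix(np.array([[1, 0, 1],
                                    [0, 1, 0]], dtype=float))   # test interactions

print(calculate_ndcg_k(b_hat, c_ui_test, k=2))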