Code example #1
File: rec_eval.py Project: dawenl/cofactor
def MAP_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, mu=None, k=100,
                   vad_data=None):
    '''
    mean average precision@k
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users, mu=mu,
                              vad_data=vad_data)
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    aps = np.zeros(batch_users)
    for i, idx in enumerate(range(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            predicted = idx_topk[i]
            aps[i] = apk(actual, predicted, k=k)
        else:
            aps[i] = np.nan
    return aps
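This snippet (like examples #6 and #15 below) delegates the per-user computation to an apk helper that the excerpt does not show. A minimal sketch under the usual average-precision@k definition (an assumption; the project's actual helper may differ in detail):

# Hypothetical average-precision@k helper, assuming the standard definition;
# the apk used by the snippets in this listing may differ.
def apk(actual, predicted, k=100):
    # actual: indices of relevant items; predicted: item indices ranked by score
    relevant = set(actual)
    score, hits = 0.0, 0
    for rank, item in enumerate(predicted[:k], start=1):
        if item in relevant:
            hits += 1
            score += hits / rank  # precision at this rank
    return score / min(len(relevant), k)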
Code example #2
File: rec_eval.py Project: dawenl/cofactor
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
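The two-step argpartition/argsort pattern above recurs in almost every example in this listing, so a small self-contained illustration may help (values are illustrative only):

import numpy as np
import bottleneck as bn

X_pred = np.array([[0.1, 0.9, 0.4, 0.7, 0.2],
                   [0.5, 0.3, 0.8, 0.2, 0.6]])
k = 3
rows = np.arange(X_pred.shape[0])[:, np.newaxis]
idx_topk_part = bn.argpartition(-X_pred, k, axis=1)  # k largest first, unordered, in O(n)
topk_part = X_pred[rows, idx_topk_part[:, :k]]
idx_part = np.argsort(-topk_part, axis=1)            # fully sort only those k scores
idx_topk = idx_topk_part[rows, idx_part]
print(idx_topk)  # [[1 3 2]
                 #  [2 4 0]]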
Code example #3
File: rec_eval.py Project: dawenl/cofactor
def recall_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                      k=20, normalize=True, mu=None, vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall
Code example #4
File: metrics.py Project: dawn310826/TR-AAE
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)]) + 0.0001
    return DCG / IDCG
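For reference, a tiny call of the variant above (code example #4), assuming the function is in scope together with numpy, bottleneck, and scipy; the values are illustrative:

import numpy as np
from scipy import sparse

X_pred = np.array([[0.9, 0.1, 0.8, 0.3],
                   [0.2, 0.7, 0.1, 0.6]])
heldout = sparse.csr_matrix([[1, 0, 1, 0],
                             [0, 1, 0, 0]])
# both users have all their held-out items in the top-2, so NDCG is ~1
# (slightly below 1 because of the 0.0001 smoothing term in IDCG)
print(NDCG_binary_at_k_batch(X_pred, heldout, k=2))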
Code example #5
def batch_ndcg_at_k(heldout_batch, X_pred, lo, hi, k):
    idx_topk_part = bn.argpartition(-X_pred, k, axis = 1)
    topk_part = X_pred[np.arange(hi - lo)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(hi - lo)[:, np.newaxis], idx_part]
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(hi - lo)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #6
def batch_map_at_k(heldout_batch, X_pred, lo, hi, k):
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(hi - lo)[:, np.newaxis], idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(hi - lo)[:, np.newaxis], idx_part]
    aps = np.zeros(hi - lo)
    for i, idx in enumerate(xrange(lo, hi)):
        actual = heldout_batch[i].nonzero()[1]
        if len(actual) > 0:
            predicted = idx_topk[i]
            aps[i] = apk(actual, predicted, k=k)
        else:
            aps[i] = np.nan
    return aps
Code example #7
def ndcg_recall_on_batch(preds, holdouts, k=100):
    N, M = preds.shape
    total_items = holdouts.getnnz(axis=1)

    top_inds = bn.argpartition(-preds, k, axis=1)[:, :k]
    top_items = preds[np.arange(N)[:, np.newaxis], top_inds]
    ranked_inds = np.argsort(-top_items, axis=1)[:, :k]
    ranked_items = top_inds[np.arange(N)[:, np.newaxis], ranked_inds]

    matches = holdouts[np.expand_dims(np.arange(N), 1), ranked_items]
    dcg = np.sum(matches / np.log2(np.arange(k) + 2), axis=1)
    idcg = vidcg(total_items, k)
    ndcg = dcg / idcg

    recalls = np.sum(matches, axis=1) / np.minimum(k, total_items)

    return ndcg, recalls
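Example #7 above calls a vidcg helper that the excerpt omits. A plausible vectorized IDCG consistent with the list comprehensions used in the other NDCG snippets (the name and exact behavior here are assumptions):

import numpy as np

def vidcg(total_items, k):
    # ideal DCG@k for users who have total_items relevant items each
    tp = 1. / np.log2(np.arange(2, k + 2))
    csum = np.concatenate(([0.], np.cumsum(tp)))  # csum[m] = sum of the first m discounts
    return csum[np.minimum(total_items, k)]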
Code example #8
def NDCG_binary_at_k_batch(train_data,
                           heldout_data,
                           Et,
                           Eb,
                           user_idx,
                           mu=None,
                           k=100,
                           vad_data=None):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data,
                              Et,
                              Eb,
                              user_idx,
                              batch_users,
                              mu=mu,
                              vad_data=vad_data)
    X_pred[X_pred <= 0] = 0

    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #9
    def Knearest(self, smat):
        k = args.k  # note: reads k from a module-level `args`
        print(type(smat))
        res = []
        print(self.custn)

        for j in range(2):
            if j == 0:
                # customer-customer similarity block
                sim_mat = smat[:self.custn, :self.custn]
            else:
                # product-product similarity block
                sim_mat = smat[self.custn:, self.custn:]

            print(sim_mat.shape)
            res_sim = []
            res_index = []

            for i in range(sim_mat.shape[0]):
                x = sim_mat[i].toarray()
                size = x.shape[1]
                # indices of the k largest similarities in this row
                index = bn.argpartition(x[0], size - k)[-k:]
                res_index.append(np.array(index))
                data = x[0][index]
                res_sim.append(np.array(data))
            print(len(res_sim))
            print(len(res_sim[0]))
            res.append([res_sim, res_index])
        return res
Code example #10
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):

    batch_users = X_pred.shape[0]
    #print("batch_users: {}".format(batch_users))
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #11
def ndcg_binary_at_k_batch(x_pred, heldout_batch, k=100):
    batch_users = x_pred.shape[0]
    idx_topk_part = bn.argpartition(-x_pred, k, axis=1)
    topk_part = x_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))

    dcg = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    idcg = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    ndcg = dcg / idcg
    ndcg[np.isnan(ndcg)] = 0
    return ndcg
Code example #12
def Rpre_at_k_batch(X_pred, heldout_batch, length):
    batch_users = X_pred.shape[0]
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)

    X_true_binary = heldout_batch.astype('int')
    true_size = X_true_binary.sum(axis=1)

    for i in range(batch_users):
        if (true_size[i] == 0):
            continue

        idx = bn.argpartition(-X_pred[i, :], length[i][0] - 1)
        X_pred_binary[i, idx[:length[i][0]]] = True

    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)
    Rpre = tmp[:, np.newaxis] / (length + 1e-19)
    return Rpre
Code example #13
File: metrics.py Project: dawn310826/TR-AAE
def MAP_at_k_batch(X_pred, heldout_batch, k=10):
    batch_users = X_pred.shape[0]
    idx = bn.argpartition(-X_pred, k, axis=1)

    X_true_binary = (heldout_batch > 0).toarray()
    tmp = np.zeros(batch_users, dtype=float)

    for i in range(1, k + 1):
        # relevance of the item at partition position i-1 for each user
        # (caveat: argpartition does not order the top-k columns, so this is
        # only an approximation unless those columns are sorted first)
        rel = np.zeros(batch_users, dtype=int)
        for user in range(batch_users):
            if X_true_binary[user, idx[user, i - 1]]:
                rel[user] = 1
        r = Precision_at_k_batch(X_pred, heldout_batch, i) * rel
        tmp = tmp + r

    Map = tmp / (np.minimum(k, X_true_binary.sum(axis=1)) + 0.0001)
    return Map
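Example #13 also calls a Precision_at_k_batch that this excerpt omits. A sketch consistent with the precision functions elsewhere in this listing (assumed interface, not the project's actual code):

import numpy as np
import bottleneck as bn

def Precision_at_k_batch(X_pred, heldout_batch, k):
    batch_users = X_pred.shape[0]
    idx = bn.argpartition(-X_pred, k, axis=1)
    pred_binary = np.zeros_like(X_pred, dtype=bool)
    pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    true_binary = (heldout_batch > 0).toarray()
    hits = np.logical_and(true_binary, pred_binary).sum(axis=1)
    return hits.astype(np.float32) / k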
Code example #14
def Rpre_at_k_batch(X_pred, heldout_batch):
    batch_users = X_pred.shape[0]
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)

    X_true_binary = (heldout_batch > 0).toarray()
    true_size = X_true_binary.sum(axis=1)

    for i in range(batch_users):
        if (true_size[i] == 0):
            continue

        idx = bn.argpartition(-X_pred[i, :], true_size[i] - 1)
        X_pred_binary[i, idx[:true_size[i]]] = True

    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)
    Rpre = tmp / (X_true_binary.sum(axis=1) + 1e-7)
    return Rpre
Code example #15
def MAP_at_k_batch(train_data,
                   heldout_data,
                   Et,
                   Eb,
                   user_idx,
                   mu=None,
                   k=100,
                   vad_data=None,
                   clear_invalid=True):
    '''
    mean average precision@k
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data,
                              Et,
                              Eb,
                              user_idx,
                              batch_users,
                              mu=mu,
                              vad_data=vad_data)
    if clear_invalid:
        X_pred = clear_invalid_project(train_data=train_data,
                                       vad_data=vad_data,
                                       X_pred=X_pred,
                                       lo=user_idx.start,
                                       hi=user_idx.stop)
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    aps = np.zeros(batch_users)
    for i, idx in enumerate(range(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            predicted = idx_topk[i]
            aps[i] = apk(actual, predicted, k=k)
        else:
            aps[i] = np.nan
    return aps
Code example #16
def evaluate_emb(emb, labels):
    """Evaluate embeddings based on Recall@k."""
    d_mat = get_distance_matrix(emb)
    d_mat = d_mat.asnumpy()
    labels = labels.asnumpy()

    names = []
    accs = []
    for k in [1, 2, 4, 8, 16]:
        names.append('Recall@%d' % k)
        correct, cnt = 0.0, 0.0
        for i in range(emb.shape[0]):
            d_mat[i, i] = 1e10
            nns = argpartition(d_mat[i], k)[:k]
            if any(labels[i] == labels[nn] for nn in nns):
                correct += 1
            cnt += 1
        accs.append(correct / cnt)
    return names, accs
Code example #17
def precision_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                         k=20, normalize=True, mu=None, vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                              batch_users, mu=mu, vad_data=vad_data)
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)

    if normalize:
        precision = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    else:
        precision = tmp / k
    return precision
Code example #18
File: train.py Project: CoderHHX/incubator-mxnet
def evaluate_emb(emb, labels):
    """Evaluate embeddings based on Recall@k."""
    d_mat = get_distance_matrix(emb)
    d_mat = d_mat.asnumpy()
    labels = labels.asnumpy()

    names = []
    accs = []
    for k in [1, 2, 4, 8, 16]:
        names.append('Recall@%d' % k)
        correct, cnt = 0.0, 0.0
        for i in range(emb.shape[0]):
            d_mat[i, i] = 1e10
            nns = argpartition(d_mat[i], k)[:k]
            if any(labels[i] == labels[nn] for nn in nns):
                correct += 1
            cnt += 1
        accs.append(correct/cnt)
    return names, accs
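Examples #16 and #18 depend on a get_distance_matrix helper from the surrounding training script. A NumPy sketch of the usual Gram-matrix formulation of pairwise squared Euclidean distances (the real helper operates on MXNet NDArrays; this is an assumption about its behavior):

import numpy as np

def get_distance_matrix(emb):
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, computed for all pairs at once
    sq = np.sum(emb ** 2, axis=1)
    return sq[:, np.newaxis] + sq[np.newaxis, :] - 2.0 * emb @ emb.T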
Code example #19
def translation_results(X, y, vocab, M, lg2_vectors, lg2_vocab):
    """X, y, vocab - The training or test data that you want results for
    M - The translation matrix
    lg2_vectors, lg2_vocab - Foreign language used to find the nearest neighbor
    """

    # Data Prep on Inputs
    X_word, y_word = zip(*vocab)
    X_norm, X_normed = normalize(X)
    y_norm, y_normed = normalize(y)
    lg2_vectors_norm, lg2_vectors_normed = normalize(lg2_vectors)

    # yhat
    yhat = X.dot(M)
    yhat_norm, yhat_normed = normalize(yhat)

    # Nearest Neighbors
    neg_cosine = -yhat_normed.dot(lg2_vectors_normed.T)
    ranked_neighbor_indices = bn.argpartition(neg_cosine, 1, axis=1)
    # Nearest Neighbor
    nearest_neighbor_indices = ranked_neighbor_indices[:, 0]
    yhat_neighbor = lg2_vectors[nearest_neighbor_indices, :]
    yhat_neighbor_norm, yhat_neighbor_normed = normalize(yhat_neighbor)
    yhat_neighbor_word = np.asarray(lg2_vocab)[nearest_neighbor_indices]

    # Results DF
    cols = ['X_norm', 'y_norm', 'yhat_norm', 'yhat_neighbor_norm',
            'X_word', 'y_word', 'yhat_neighbor_word']
    results_df = pd.DataFrame({'X_norm': X_norm,
                               'y_norm': y_norm,
                               'yhat_norm': yhat_norm,
                               'yhat_neighbor_norm': yhat_neighbor_norm,
                               'X_word': X_word,
                               'y_word': y_word,
                               'yhat_neighbor_word': yhat_neighbor_word,})
    results_df = results_df[cols]
    results_df['neighbor_correct'] = results_df.y_word == \
        results_df.yhat_neighbor_word

    return results_df
Code example #20
    def hit_at_k(pred_scores, ground_truth, k=100):
        r"""Compute the hit at k.

        The Hit@k is 1 if a relevant item is in the top *k* scored items, and 0 otherwise.

        Parameters
        ----------
        pred_scores : :obj:`numpy.array`
            The array with the predicted scores. Users are on the rows and items on the columns.
        ground_truth : :obj:`numpy.array`
            Binary array with the ground truth. 1 means the item is relevant for the user
            and 0 not relevant. Users are on the rows and items on the columns.
        k : :obj:`int` [optional]
            The number of top items to consider, by default 100

        Returns
        -------
        :obj:`numpy.array`
            An array containing the *hit@k* value for each user.

        Examples
        --------
        >>> import numpy as np
        >>> from rectorch.metrics import Metrics
        >>> scores = np.array([[4., 3., 2., 1.]])
        >>> ground_truth = np.array([[0, 0, 1., 1.]])
        >>> Metrics.hit_at_k(scores, ground_truth, 3)
        np.array([1.])
        >>> Metrics.hit_at_k(scores, ground_truth, 2)
        np.array([0.])
        """
        assert pred_scores.shape == ground_truth.shape,\
            "'pred_scores' and 'ground_truth' must have the same shape."
        k = min(pred_scores.shape[1], k)
        idx = bn.argpartition(-pred_scores, k - 1, axis=1)
        pred_scores_binary = np.zeros_like(pred_scores, dtype=bool)
        pred_scores_binary[np.arange(pred_scores.shape[0])[:, np.newaxis],
                           idx[:, :k]] = True
        X_true_binary = (ground_truth > 0)
        num = (np.logical_and(
            X_true_binary, pred_scores_binary).sum(axis=1)).astype(np.float32)
        return num > 0
Code example #21
def threaded_multiple_arg_min(vector, s):
    count = multiprocessing.cpu_count()
    n = vector.size
    if s == 0:
        return np.array([])
    if s == 1:
        return np.argmin(vector)
    if s > n:
        return np.argsort(vector)
    if n < 1000:
        return np.argsort(vector)[:s]
    # pick a chunk size large enough that each chunk can hold s candidates
    split_size = max(s * 100, n // 10 // count)
    while (n % split_size) <= s * 10:
        split_size += s * 10 // count + 1
    l = list(range(0, vector.size, split_size))
    r_list = [[] for x in l]
    # partition each chunk for its s smallest values, then merge the candidates
    for i in range(len(l)):
        x = split_size * i
        r_x = bottleneck.argpartition(vector[x:x + split_size], s)[:s] + x
        r_list[i] = list(r_x)
    i_list = []
    for x in r_list:
        i_list += x
    indexes = np.array(i_list)
    # final exact sort over the merged candidate set
    values = vector[indexes]
    new_index = np.argsort(values)[:s]
    return indexes[new_index]
Code example #22
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized Discounted Cumulative Gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #23
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None, clear_invalid=False, cache=False):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = user_idx.stop - user_idx.start
    if cache:
        file_path = os.path.join(constants.PRED_DIR, 'pred_%d_%d.npz' % (user_idx.start, user_idx.stop))
        if os.path.exists(file_path):
            X_pred = np.load(file_path)['X_pred']
        else:
            X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                                      batch_users, mu=mu, vad_data=vad_data)
            np.savez(file_path, X_pred=X_pred)
    else:
        X_pred = _make_prediction(train_data, Et, Eb, user_idx,
                                  batch_users, mu=mu, vad_data=vad_data)

    # clear projects the user already backed in the training data
    # (this step is integrated into the learning process)
    if clear_invalid:
        X_pred = clear_invalid_project(train_data=train_data, vad_data=vad_data,
                                       X_pred=X_pred, lo=user_idx.start, hi=user_idx.stop)
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #24
def Recall_at_k_batch(X_pred, heldout_batch, k=100, input_batch=None):
    if input_batch is not None:
        X_pred[input_batch.nonzero()] = -np.inf
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0)
    try:
        X_true_binary = X_true_binary.toarray()
    except AttributeError:
        pass  # heldout_batch was already a dense array

    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)
    recall = tmp / np.maximum(np.minimum(k, X_true_binary.sum(axis=1)), 1)
    recall = recall.astype(np.float32)
    return recall
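The input_batch branch at the top of example #24 is the standard guard against re-recommending items the user has already seen: setting their scores to -inf removes them from any top-k. A minimal illustration (shapes and values assumed):

import numpy as np
from scipy import sparse

X_pred = np.array([[0.9, 0.8, 0.7, 0.1]])
seen = sparse.csr_matrix([[1, 0, 0, 0]])   # item 0 appeared in the input batch
X_pred[seen.nonzero()] = -np.inf           # item 0 can no longer be ranked
print(np.argsort(-X_pred, axis=1)[:, :2])  # [[1 2]]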
Code example #25
def get_recall(preds, targets, k=10):

    batch_size = preds.shape[0]
    idx = bn.argpartition(-preds, k, axis=1)
    hit = targets[np.arange(batch_size)[:, np.newaxis], idx[:, :k]]

    hit = np.count_nonzero(hit, axis=1)
    hit = np.array(hit)

    hit = np.squeeze(hit)

    recall = np.array([min(n, k) for n in np.count_nonzero(targets, axis=1)])

    recall = hit / recall

    return recall
Code example #26
File: utils.py Project: verachtertr/RecVAE
def ndcg(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
Code example #27
def NDCG_at_k_batch_a(X_pred, heldout_batch, k=100):
    X_pred = X_pred.cpu().detach().numpy()
    nnz = (heldout_batch.to_dense() != 0).sum(dim=1)
    heldout_batch = heldout_batch.to_dense().cpu().detach().numpy()
    batch_users = X_pred.shape[0]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk] *
           tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum() for n in nnz])
    return DCG / IDCG
Code example #28
def get_dealwifi(wifi_info, ntop=8):
    parts = wifi_info.split(';')
    every_wifi = np.array([each.split('|') for each in parts])  # convert to a matrix
    wifi_id = every_wifi[:, 0]
    wifi_value = every_wifi[:, 1].astype(float)
    wifi_state = every_wifi[:, 2]
    if 'true' in wifi_state:
        connection_wifi_name = wifi_id[wifi_state.tolist().index('true')]
    else:
        connection_wifi_name = 'null'
    if len(wifi_id) >= ntop:
        # find the indices of the ntop largest signal values
        top_n_idx = bottleneck.argpartition(-wifi_value, ntop)[:ntop]
        return wf_name_2_idx(wifi_id[top_n_idx]), wf_name_2_idx([connection_wifi_name])
    else:
        sort_index = np.argsort(-wifi_value)

        w_name = wifi_id[sort_index].tolist()
        w_name.extend(['null'] * (ntop - len(wifi_value)))
        return wf_name_2_idx(w_name), wf_name_2_idx([connection_wifi_name])
Code example #29
def precision_at_k_batch(train_data,
                         heldout_data,
                         Et,
                         Eb,
                         user_idx,
                         k=20,
                         normalize=True,
                         mu=None,
                         vad_data=None):
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data,
                              Et,
                              Eb,
                              user_idx,
                              batch_users,
                              mu=mu,
                              vad_data=vad_data)
    # Xavier: first k indexes are corresponding to highest k elements.
    idx = bn.argpartition(-X_pred, k, axis=1)
    # Xavier: a matrix whose elements are zero, in this case, are false.
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    # Xavier: np.arange(batch_users) returns the array [0, 1, ..., batch_users - 1]
    #   [:, np.newaxis] -> reshape the array from (batch_users,) to (batch_users, 1)
    #       ref: https://stackoverflow.com/questions/29241056/how-does-numpy-newaxis-work-and-when-to-use-it
    #   Set the value to True at (userIdx, itemIdx) for every user in the batch
    #       and each of that user's k highest-scoring items
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)

    if normalize:
        precision = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    else:
        precision = tmp / k
    return precision
Code example #30
def ans_output(X_pred, k=500, file_ans=0):
    '''
    write the top-k predicted item indices for each user to file_ans
    '''
    batch_users = X_pred.shape[0]
    print(X_pred.shape)
    idx_topk_part = bn.argpartition(-X_pred, k - 1, axis=1)

    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    if file_ans != 0:
        for i in range(idx_topk.shape[0]):
            for j in range(idx_topk.shape[1]):
                file_ans.write('{0} '.format(idx_topk[i][j]))
            file_ans.write('\n')
Code example #31
File: evaluation.py Project: txrxrxr/EVCF
    def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
        '''
        normalized discounted cumulative gain@k for binary relevance
        ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
        '''
        batch_users = X_pred.shape[0]
        idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
        topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                           idx_topk_part[:, :k]]
        idx_part = np.argsort(-topk_part, axis=1)
        # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
        # topk predicted score
        idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
        # build the discount template
        tp = 1. / np.log2(np.arange(2, k + 2))
        tp = torch.tensor(tp, dtype=torch.float)  # ! in order to do operations with torch tensor

        DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                             idx_topk].cpu() * tp).sum(dim=1)
        IDCG = torch.tensor([(tp[:min(n, k)]).sum()
                             for n in (heldout_batch != 0).sum(dim=1)])
        return DCG / IDCG
Code example #32
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]

    # bn.argpartition returns an index array that partitions each row so that
    # the first k entries correspond to the k smallest elements.
    # Because X_pred is negated here, those first k entries are the k largest scores.

    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)

    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]

    # np.argsort returns the indices that would sort the array;
    # negating gives descending order (highest scores first)
    idx_part = np.argsort(-topk_part, axis=1)

    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # topk predicted score
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # build the discount template
    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG_filter = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                                idx_topk].toarray() > 0)

    DCG = (DCG_filter * tp).sum(axis=1)

    # compute IDCG from the sparse matrix, using the smaller of k and the
    # user's total number of interactions

    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])

    return DCG / IDCG
Code example #33
def Recall_binary_at_k_batch(logits, y_true, k=10):
    """
    Function taken from Variational Autoencoders for Collaborative Filtering
    :param logits: the un-normalised predictions
    :param y_true: the true relevance labels (binary)
    :param k: cut-off value
    :return: normalised recall at k
    """
    n = logits.shape[0]
    dummy_column = np.arange(n).reshape(n, 1)

    idx_topk_part = bn.argpartition(-logits, k, axis=1)[:, :k]
    X_pred_binary = np.zeros_like(logits, dtype=bool)
    X_pred_binary[dummy_column, idx_topk_part] = True

    X_true_binary = (y_true > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))

    assert (recall >= 0).all() and (recall <= 1).all()

    return recall
Code example #34
File: eval_functions.py Project: lwpyh/Long-Tail-GAN
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]

    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()

    tmp = (np.logical_and(X_true_binary,
                          X_pred_binary).sum(axis=1)).astype(np.float32)

    denom = np.minimum(k, X_true_binary.sum(axis=1))

    output = []

    misclassified_tags = []

    for idx in range(np.shape(tmp)[0]):
        if denom[idx] != 0:
            output.append(tmp[idx] / denom[idx])

    return output, misclassified_tags
Code example #35
File: training.py Project: JiaoFusen/csdnSMP
        return a


    pd_train['labels'] = pd_train['labels'].apply(f)

    # change embedding
    embedding = np.array(list(pd_embedding['blog_jieba_vector'].apply(list)))

    print("pd_train:\n", pd_train)
    print("pd_test:\n", pd_test)

    # begin training
    dev_classes = lstm(list(pd_train['embedding_index'])[:100], list(pd_train['labels'])[:100],
                       list(pd_test['embedding_index'])[:100],
                       embedding_dim, embedding, maxlen, labels_len)

    print("dev classes:", dev_classes)

    # bottleneck
    import bottleneck as bl

    result = []
    labels_name = np.array(labels_name)
    for classes in dev_classes:
        # keep the three highest-scoring labels for each dev sample
        result.append(labels_name[bl.argpartition(-classes, 3)[:3]])

    pd_result = pd.DataFrame(result)
    pd_result.to_csv(dp.ResultTxt, sep="\001", header=False, index=False, encoding='utf8')