def MAP_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, mu=None,
                   k=100, vad_data=None):
    '''
    Average precision@k for every user in the ``user_idx`` slice.

    Scores are produced by ``_make_prediction``; the ``k`` highest-scored
    columns of each row are ranked best-first and handed to ``apk`` together
    with the non-zero columns of that user's row in ``heldout_data``.

    Returns an array with one entry per user; users without any held-out
    entry get ``np.nan``.
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)
    rows = np.arange(batch_users)[:, np.newaxis]
    # Partition first (cheap), then fully sort only the k selected columns.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[rows, idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[rows, idx_topk] is the sorted top-k predicted score.
    idx_topk = idx_topk_part[rows, idx_part]

    aps = np.zeros(batch_users)
    # ``range`` rather than ``xrange``: the latter does not exist on Python 3.
    for i, idx in enumerate(range(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            aps[i] = apk(actual, idx_topk[i], k=k)
        else:
            aps[i] = np.nan
    return aps
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None):
    '''
    Per-user NDCG@k for the users in the ``user_idx`` slice.

    ASSUMPTION: every 0 in ``heldout_data`` means zero relevance.

    The DCG sums the held-out values found at the top-k ranked columns,
    weighted by ``1 / log2(rank + 1)``; the ideal DCG uses the number of
    stored held-out entries per user (capped at ``k``). A user with no
    held-out entry yields 0 / 0.
    '''
    n_users = user_idx.stop - user_idx.start
    scores = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                              mu=mu, vad_data=vad_data)
    row_sel = np.arange(n_users)[:, np.newaxis]

    # Unordered top-k candidates, then order just those k columns.
    candidates = bn.argpartition(-scores, k, axis=1)[:, :k]
    order = np.argsort(-scores[row_sel, candidates], axis=1)
    ranked = candidates[row_sel, order]

    # Discount template, one weight per rank.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    gains = heldout_batch[row_sel, ranked].toarray()
    dcg = (gains * discounts).sum(axis=1)
    relevant_counts = heldout_batch.getnnz(axis=1)
    idcg = np.array([discounts[:min(n, k)].sum() for n in relevant_counts])
    return dcg / idcg
def recall_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, k=20,
                      normalize=True, mu=None, vad_data=None):
    """Recall@k for every user in the ``user_idx`` slice.

    The number of held-out positives found among the ``k`` highest-scored
    columns is divided by ``min(k, number of held-out positives)``.

    NOTE: ``normalize`` is accepted but never read by this function.
    """
    n_users = user_idx.stop - user_idx.start
    scores = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                              mu=mu, vad_data=vad_data)

    top_cols = bn.argpartition(-scores, k, axis=1)[:, :k]
    predicted = np.zeros_like(scores, dtype=bool)
    predicted[np.arange(n_users)[:, np.newaxis], top_cols] = True

    relevant = (heldout_data[user_idx] > 0).toarray()
    hits = np.logical_and(relevant, predicted).sum(axis=1).astype(np.float32)
    denominator = np.minimum(k, relevant.sum(axis=1))
    return hits / denominator
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    """Per-row NDCG@k of ``X_pred`` against the sparse ``heldout_batch``.

    A constant ``0.0001`` is added to the ideal DCG, so rows without any
    held-out entry evaluate to 0 instead of dividing by zero.
    """
    n_users = X_pred.shape[0]
    row_sel = np.arange(n_users)[:, np.newaxis]

    # Unordered top-k columns first, then sort only those k scores.
    candidates = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    order = np.argsort(-X_pred[row_sel, candidates], axis=1)
    ranked = candidates[row_sel, order]

    # Discount template: 1 / log2(rank + 1) for rank = 1..k.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    gains = heldout_batch[row_sel, ranked].toarray()
    dcg = (gains * discounts).sum(axis=1)
    ideal = [discounts[:min(n, k)].sum() for n in heldout_batch.getnnz(axis=1)]
    idcg = np.array(ideal) + 0.0001
    return dcg / idcg
def batch_ndcg_at_k(heldout_batch, X_pred, lo, hi, k):
    """NDCG@k for the ``hi - lo`` rows of ``X_pred``.

    ``heldout_batch`` is the sparse ground truth for the same rows. Rows
    with no stored held-out entry produce 0 / 0.
    """
    row_sel = np.arange(hi - lo)[:, np.newaxis]

    candidates = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    order = np.argsort(-X_pred[row_sel, candidates], axis=1)
    ranked = candidates[row_sel, order]

    discounts = 1. / np.log2(np.arange(2, k + 2))
    gains = heldout_batch[row_sel, ranked].toarray()
    dcg = (gains * discounts).sum(axis=1)
    idcg = np.array([discounts[:min(n, k)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return dcg / idcg
def batch_map_at_k(heldout_batch, X_pred, lo, hi, k):
    """Average precision@k for the ``hi - lo`` rows of ``X_pred``.

    For each row the ``k`` highest-scored columns are ranked best-first and
    passed to ``apk`` with the non-zero columns of the matching row of
    ``heldout_batch``. Rows without held-out entries get ``np.nan``.
    """
    n_users = hi - lo
    rows = np.arange(n_users)[:, np.newaxis]

    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[rows, idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[rows, idx_part]

    aps = np.zeros(n_users)
    # The original looped over ``xrange(lo, hi)`` (Python 2 only) but only
    # ever used the batch-local position, so iterate over that directly.
    for i in range(n_users):
        actual = heldout_batch[i].nonzero()[1]
        if len(actual) > 0:
            aps[i] = apk(actual, idx_topk[i], k=k)
        else:
            aps[i] = np.nan
    return aps
def ndcg_recall_on_batch(preds, holdouts, k=100):
    """Return ``(ndcg, recalls)`` at cut-off ``k`` for a batch of predictions.

    ``preds`` must be two-dimensional. ``holdouts`` must support
    ``getnnz(axis=1)`` and fancy indexing. The ideal DCG is delegated to
    ``vidcg``, called with the per-row held-out counts and ``k``.
    """
    n_rows, _ = preds.shape
    row_sel = np.arange(n_rows)[:, np.newaxis]
    relevant_counts = holdouts.getnnz(axis=1)

    # Unordered top-k columns, then order them by descending score.
    candidates = bn.argpartition(-preds, k, axis=1)[:, :k]
    order = np.argsort(-preds[row_sel, candidates], axis=1)
    ranked = candidates[row_sel, order]

    matches = holdouts[row_sel, ranked]
    discounts = np.log2(np.arange(k) + 2)
    dcg = np.sum(matches / discounts, axis=1)
    ndcg = dcg / vidcg(relevant_counts, k)
    recalls = np.sum(matches, axis=1) / np.minimum(k, relevant_counts)
    return ndcg, recalls
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None):
    '''
    Per-user NDCG@k for the users in the ``user_idx`` slice.

    ASSUMPTION: every 0 in ``heldout_data`` means zero relevance.

    Predicted scores that are zero or negative are set to 0 before ranking.
    Users with no held-out entry yield 0 / 0.
    '''
    n_users = user_idx.stop - user_idx.start
    scores = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                              mu=mu, vad_data=vad_data)
    # Clamp in place: non-positive predictions all tie at 0.
    scores[scores <= 0] = 0

    row_sel = np.arange(n_users)[:, np.newaxis]
    candidates = bn.argpartition(-scores, k, axis=1)[:, :k]
    order = np.argsort(-scores[row_sel, candidates], axis=1)
    # scores[row_sel, ranked] is the sorted top-k predicted score.
    ranked = candidates[row_sel, order]

    # Discount template, one weight per rank.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    dcg = (heldout_batch[row_sel, ranked].toarray() * discounts).sum(axis=1)
    idcg = np.array([discounts[:min(n, k)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return dcg / idcg
def Knearest(self, smat):
    """Collect the ``args.k`` largest similarities per row of two sub-blocks.

    ``smat`` is sliced into ``smat[:custn, :custn]`` and
    ``smat[custn:, custn:]`` (``custn`` read from ``self``). For each row of
    each block the indices of the ``k`` largest values and the values
    themselves are gathered (not sorted).

    Returns a two-element list, one ``[values_per_row, indices_per_row]``
    pair per block. Prints the input type, ``self.custn`` and per-block
    sizes as it goes.
    """
    k = args.k
    print(type(smat))
    print(self.custn)

    split = self.custn
    block_slices = (
        (slice(None, split), slice(None, split)),
        (slice(split, None), slice(split, None)),
    )

    res = []
    for row_slice, col_slice in block_slices:
        sim_mat = smat[row_slice, col_slice]
        print(sim_mat.shape)

        res_sim = []
        res_index = []
        for i in range(sim_mat.shape[0]):
            row = sim_mat[i].toarray()[0]
            # Everything from position ``size - k`` onward is >= the rest.
            top = bn.argpartition(row, row.shape[0] - k)[-k:]
            res_index.append(np.array(top))
            res_sim.append(np.array(row[top]))

        print(len(res_sim))
        print(len(res_sim[0]))
        res.append([res_sim, res_index])
    return res
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    """Per-row NDCG@k of ``X_pred`` against the sparse ``heldout_batch``.

    Parameters
    ----------
    X_pred : 2-D array of predicted scores, one row per user.
    heldout_batch : sparse matrix with the same shape; stored entries are
        the relevant items.
    k : cut-off. Values of ``k`` larger than or equal to the number of
        columns are handled by ranking every column (previously the
        partition call raised because ``kth`` was out of bounds).

    Returns
    -------
    1-D array with one NDCG value per row. Rows without any held-out entry
    evaluate to 0 / 0 (NaN), as before.
    """
    batch_users, n_items = X_pred.shape
    k_eff = min(k, n_items)
    # Partitioning at k_eff - 1 puts the k_eff best columns in front and
    # stays in bounds even when k_eff == n_items.
    kth = max(k_eff - 1, 0)
    rows = np.arange(batch_users)[:, np.newaxis]

    idx_topk_part = bn.argpartition(-X_pred, kth, axis=1)[:, :k_eff]
    topk_part = X_pred[rows, idx_topk_part]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[rows, idx_part]

    # Discount template: 1 / log2(rank + 1) for each ranked position.
    tp = 1. / np.log2(np.arange(2, k_eff + 2))
    DCG = (heldout_batch[rows, idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([tp[:min(n, k_eff)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
def ndcg_binary_at_k_batch(x_pred, heldout_batch, k=100):
    """Per-row NDCG@k, with rows lacking held-out entries scored as 0.

    ``x_pred`` is a 2-D score array and ``heldout_batch`` a sparse matrix of
    the same shape whose stored entries are the relevant items.

    Rows whose ideal DCG is 0 are written as 0 directly through a masked
    division, so no divide-by-zero ``RuntimeWarning`` is emitted (the
    previous code produced NaN first and patched it afterwards).
    """
    batch_users = x_pred.shape[0]
    rows = np.arange(batch_users)[:, np.newaxis]

    idx_topk_part = bn.argpartition(-x_pred, k, axis=1)
    topk_part = x_pred[rows, idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[rows, idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))
    dcg = (heldout_batch[rows, idx_topk].toarray() * tp).sum(axis=1)
    idcg = np.array([tp[:min(n, k)].sum()
                     for n in heldout_batch.getnnz(axis=1)], dtype=float)

    ndcg = np.zeros(batch_users, dtype=float)
    np.divide(dcg, idcg, out=ndcg, where=idcg > 0)
    # Keep the original contract for any NaN coming from the inputs.
    ndcg[np.isnan(ndcg)] = 0
    return ndcg
def Rpre_at_k_batch(X_pred, heldout_batch, length):
    """R-precision per row using a caller-supplied cut-off per user.

    For row ``i`` the ``length[i][0]`` highest-scored columns are marked as
    predicted; rows whose ``heldout_batch`` sum is 0 are left unmarked. The
    hit count is divided by ``length`` (plus a tiny epsilon), so the result
    has ``length``'s column shape.
    """
    n_users = X_pred.shape[0]
    predicted = np.zeros_like(X_pred, dtype=bool)
    relevant = heldout_batch.astype('int')
    relevant_counts = relevant.sum(axis=1)

    for row in range(n_users):
        if relevant_counts[row] == 0:
            continue
        cutoff = length[row][0]
        best = bn.argpartition(-X_pred[row, :], cutoff - 1)
        predicted[row, best[:cutoff]] = True

    hits = np.logical_and(relevant, predicted).sum(axis=1).astype(np.float32)
    return hits[:, np.newaxis] / (length + 0.0000000000000000001)
def MAP_at_k_batch(X_pred, heldout_batch, k=10):
    """Average precision@k per row of ``X_pred``.

    For each rank ``i`` in ``1..k`` the precision@i (from
    ``Precision_at_k_batch``) is added for the users whose ``i``-th ranked
    item is relevant; the sum is divided by
    ``min(k, number of relevant items) + 0.0001``.

    Fixes over the previous version:
    * the accumulators were built with ``np.zeros_like(batch_users)``, i.e.
      0-d arrays, so ``rel[user] = 1`` raised ``IndexError``;
    * relevance was always read at position ``k - 1`` of an unsorted
      partition instead of at the ``i``-th ranked item;
    * a leftover debug ``print`` was dropped.

    NOTE(review): assumes ``Precision_at_k_batch`` returns one value per
    row -- confirm against its definition.
    """
    batch_users = X_pred.shape[0]
    rows = np.arange(batch_users)[:, np.newaxis]

    # Rank the top-k columns best-first so position i - 1 is rank i.
    part = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    order = np.argsort(-X_pred[rows, part], axis=1)
    ranked = part[rows, order]

    X_true_binary = (heldout_batch > 0).toarray()
    rel = X_true_binary[rows, ranked].astype(int)

    tmp = np.zeros(batch_users, dtype=float)
    for i in range(1, k + 1):
        tmp = tmp + Precision_at_k_batch(X_pred, heldout_batch, i) * rel[:, i - 1]
    Map = tmp / (np.minimum(k, X_true_binary.sum(axis=1)) + 0.0001)
    return Map
def Rpre_at_k_batch(X_pred, heldout_batch):
    """R-precision per row: precision at cut-off R = number of positives.

    For each row the ``R`` highest-scored columns are marked as predicted,
    where ``R`` is that row's count of positive entries in the sparse
    ``heldout_batch``. Rows with ``R == 0`` are skipped and score 0 thanks
    to the small epsilon in the denominator.
    """
    n_users = X_pred.shape[0]
    predicted = np.zeros_like(X_pred, dtype=bool)
    relevant = (heldout_batch > 0).toarray()
    relevant_counts = relevant.sum(axis=1)

    for row, cutoff in enumerate(relevant_counts[:n_users]):
        if cutoff == 0:
            continue
        best = bn.argpartition(-X_pred[row, :], cutoff - 1)
        predicted[row, best[:cutoff]] = True

    hits = np.logical_and(relevant, predicted).sum(axis=1).astype(np.float32)
    return hits / (relevant.sum(axis=1) + 0.0000001)
def MAP_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, mu=None,
                   k=100, vad_data=None, clear_invalid=True):
    '''
    Average precision@k for every user in the ``user_idx`` slice.

    Scores come from ``_make_prediction``; when ``clear_invalid`` is true
    they are post-processed by ``clear_invalid_project`` before ranking.
    The ``k`` highest-scored columns of each row are ranked best-first and
    passed to ``apk`` with the non-zero columns of that user's row in
    ``heldout_data``.

    Returns an array with one entry per user; users without any held-out
    entry get ``np.nan``.
    '''
    batch_users = user_idx.stop - user_idx.start

    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)
    if clear_invalid:
        X_pred = clear_invalid_project(train_data=train_data,
                                       vad_data=vad_data, X_pred=X_pred,
                                       lo=user_idx.start, hi=user_idx.stop)

    rows = np.arange(batch_users)[:, np.newaxis]
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[rows, idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    # X_pred[rows, idx_topk] is the sorted top-k predicted score.
    idx_topk = idx_topk_part[rows, idx_part]

    aps = np.zeros(batch_users)
    # ``range`` rather than ``xrange``: the latter does not exist on Python 3.
    for i, idx in enumerate(range(user_idx.start, user_idx.stop)):
        actual = heldout_data[idx].nonzero()[1]
        if len(actual) > 0:
            aps[i] = apk(actual, idx_topk[i], k=k)
        else:
            aps[i] = np.nan
    return aps
def evaluate_emb(emb, labels):
    """Evaluate embeddings based on Recall@k for k in 1, 2, 4, 8, 16.

    A sample counts as correct at ``k`` when at least one of its ``k``
    nearest neighbours (by ``get_distance_matrix``, itself excluded) shares
    its label. Returns ``(names, accs)``: metric names and the matching
    fractions of correct samples.
    """
    d_mat = get_distance_matrix(emb).asnumpy()
    labels = labels.asnumpy()
    n_samples = emb.shape[0]

    # Push every sample's self-distance out of reach of the neighbour search.
    for i in range(n_samples):
        d_mat[i, i] = 1e10

    names = []
    accs = []
    for k in (1, 2, 4, 8, 16):
        names.append('Recall@%d' % k)
        correct = 0.0
        for i in range(n_samples):
            neighbours = argpartition(d_mat[i], k)[:k]
            if any(labels[i] == labels[nn] for nn in neighbours):
                correct += 1
        accs.append(correct / float(n_samples))
    return names, accs
def precision_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, k=20,
                         normalize=True, mu=None, vad_data=None):
    """Precision@k for every user in the ``user_idx`` slice.

    Counts the held-out positives among the ``k`` highest-scored columns.
    With ``normalize`` the count is divided by
    ``min(k, number of held-out positives)``; otherwise by ``k``.
    """
    n_users = user_idx.stop - user_idx.start
    scores = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                              mu=mu, vad_data=vad_data)

    top_cols = bn.argpartition(-scores, k, axis=1)[:, :k]
    predicted = np.zeros_like(scores, dtype=bool)
    predicted[np.arange(n_users)[:, np.newaxis], top_cols] = True

    relevant = (heldout_data[user_idx] > 0).toarray()
    hits = np.logical_and(relevant, predicted).sum(axis=1).astype(np.float32)
    if not normalize:
        return hits / k
    return hits / np.minimum(k, relevant.sum(axis=1))
def evaluate_emb(emb, labels):
    """Evaluate embeddings based on Recall@k.

    For each ``k`` in 1, 2, 4, 8, 16 this reports the share of samples
    that have a same-label sample among their ``k`` nearest neighbours.
    Distances come from ``get_distance_matrix``; each sample's distance to
    itself is overwritten with 1e10 so it is never its own neighbour.
    """
    distances = get_distance_matrix(emb)
    distances = distances.asnumpy()
    labels = labels.asnumpy()
    cutoffs = [1, 2, 4, 8, 16]

    names = ['Recall@%d' % k for k in cutoffs]
    accs = []
    for k in cutoffs:
        correct, cnt = 0.0, 0.0
        for i in range(emb.shape[0]):
            distances[i, i] = 1e10
            nearest = argpartition(distances[i], k)[:k]
            matched = False
            for nn in nearest:
                if labels[i] == labels[nn]:
                    matched = True
                    break
            if matched:
                correct += 1
            cnt += 1
        accs.append(correct / cnt)
    return names, accs
def translation_results(X, y, vocab, M, lg2_vectors, lg2_vocab):
    """Build a per-word results table for a learned translation matrix.

    X, y, vocab - the training or test data to report on; ``vocab`` is a
        sequence of (source word, target word) pairs
    M - the translation matrix applied as ``X.dot(M)``
    lg2_vectors, lg2_vocab - foreign-language vectors and words searched
        for the nearest neighbour (highest cosine similarity)

    Returns a DataFrame with the norms of X, y, yhat and the chosen
    neighbour, the words involved, and a ``neighbor_correct`` flag.
    """
    # Data prep on inputs
    X_word, y_word = zip(*vocab)
    X_norm, X_normed = normalize(X)
    y_norm, y_normed = normalize(y)
    lg2_vectors_norm, lg2_vectors_normed = normalize(lg2_vectors)

    # Predicted target vectors
    yhat = X.dot(M)
    yhat_norm, yhat_normed = normalize(yhat)

    # Nearest neighbour: partitioning at kth=1 leaves the smallest negated
    # cosine (the most similar vector) in column 0.
    neg_cosine = -yhat_normed.dot(lg2_vectors_normed.T)
    partitioned = bn.argpartition(neg_cosine, 1, axis=1)
    nearest = partitioned[:, 0]

    yhat_neighbor = lg2_vectors[nearest, :]
    yhat_neighbor_norm, yhat_neighbor_normed = normalize(yhat_neighbor)
    yhat_neighbor_word = np.asarray(lg2_vocab)[nearest]

    # Results table, columns in a fixed order
    cols = ['X_norm', 'y_norm', 'yhat_norm', 'yhat_neighbor_norm',
            'X_word', 'y_word', 'yhat_neighbor_word']
    data = {
        'X_norm': X_norm,
        'y_norm': y_norm,
        'yhat_norm': yhat_norm,
        'yhat_neighbor_norm': yhat_neighbor_norm,
        'X_word': X_word,
        'y_word': y_word,
        'yhat_neighbor_word': yhat_neighbor_word,
    }
    results_df = pd.DataFrame(data, columns=cols)
    results_df['neighbor_correct'] = (
        results_df.y_word == results_df.yhat_neighbor_word)
    return results_df
def hit_at_k(pred_scores, ground_truth, k=100):
    r"""Compute the hit at k.

    The Hit@k is true if a relevant item is among the top *k* scored
    items, and false otherwise.

    Parameters
    ----------
    pred_scores : :obj:`numpy.array`
        The array with the predicted scores. Users are on the rows and
        items on the columns.
    ground_truth : :obj:`numpy.array`
        Array with the ground truth; entries greater than 0 mark relevant
        items. Users are on the rows and items on the columns.
    k : :obj:`int` [optional]
        The number of top items to consider, by default 100. It is capped
        at the number of columns.

    Returns
    -------
    :obj:`numpy.array`
        A boolean array containing the *hit@k* value for each user.

    Examples
    --------
    >>> import numpy as np
    >>> from rectorch.metrics import Metrics
    >>> scores = np.array([[4., 3., 2., 1.]])
    >>> ground_truth = np.array([[0, 0, 1., 1.]])
    >>> Metrics.hit_at_k(scores, ground_truth, 3)
    array([ True])
    >>> Metrics.hit_at_k(scores, ground_truth, 2)
    array([False])
    """
    assert pred_scores.shape == ground_truth.shape,\
        "'pred_scores' and 'ground_truth' must have the same shape."
    top_k = min(pred_scores.shape[1], k)
    user_rows = np.arange(pred_scores.shape[0])[:, np.newaxis]

    # Partitioning at top_k - 1 puts the top_k best columns in front.
    partitioned = bn.argpartition(-pred_scores, top_k - 1, axis=1)
    in_top_k = np.zeros_like(pred_scores, dtype=bool)
    in_top_k[user_rows, partitioned[:, :top_k]] = True

    relevant = ground_truth > 0
    hits = np.logical_and(relevant, in_top_k).sum(axis=1).astype(np.float32)
    return hits > 0
def threaded_multiple_arg_min(vector, s):
    """Return the indices of the ``s`` smallest entries of ``vector``.

    Indices are ordered by ascending value. Special cases kept from the
    previous implementation:

    * ``s == 0`` returns an empty array;
    * ``s == 1`` returns the scalar result of ``np.argmin``;
    * ``s`` at least as large as the vector returns the full ``argsort``.

    Despite the name no threads are used (the threaded variant had already
    been commented out). The hand-rolled chunking was replaced by a single
    partition followed by a sort of the ``s`` candidates: the old
    split-size search never terminated when the vector was not larger than
    ``10 * s`` (for example ``n == 1000`` and ``s == 100``).
    """
    n = vector.size
    if s == 0:
        return np.array([])
    if s == 1:
        return np.argmin(vector)
    if s >= n:
        return np.argsort(vector)

    # The first s positions of the partition hold the s smallest values in
    # arbitrary order; sort only those.
    candidates = bottleneck.argpartition(vector, s)[:s]
    return candidates[np.argsort(vector[candidates])]
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized Discounted Cumulative Gain@k for binary relevance.

    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance.

    ``X_pred`` is a 2-D score array, ``heldout_batch`` a sparse matrix of
    the same shape. Returns one value per row; rows without held-out
    entries evaluate to 0 / 0 (NaN).

    ``k`` is capped at the number of columns, so a cut-off that is not
    smaller than the catalogue no longer makes the partition call fail
    with an out-of-bounds ``kth``.
    '''
    batch_users, n_items = X_pred.shape
    top_k = min(k, n_items)
    user_rows = np.arange(batch_users)[:, np.newaxis]

    # kth = top_k - 1 is always a valid position and still places the
    # top_k best columns first.
    idx_topk_part = bn.argpartition(-X_pred, max(top_k - 1, 0), axis=1)
    idx_topk_part = idx_topk_part[:, :top_k]
    idx_part = np.argsort(-X_pred[user_rows, idx_topk_part], axis=1)
    idx_topk = idx_topk_part[user_rows, idx_part]

    # Discount template, one weight per ranked position.
    tp = 1. / np.log2(np.arange(2, top_k + 2))

    DCG = (heldout_batch[user_rows, idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([tp[:min(n, top_k)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG
def NDCG_binary_at_k_batch(train_data, heldout_data, Et, Eb, user_idx,
                           mu=None, k=100, vad_data=None,
                           clear_invalid=False, cache=False):
    '''
    Per-user NDCG@k for the users in the ``user_idx`` slice.

    ASSUMPTION: every 0 in ``heldout_data`` means zero relevance.

    With ``cache`` the predictions are read from (or, when missing,
    computed and written to) ``pred_<start>_<stop>.npz`` inside
    ``constants.PRED_DIR``. With ``clear_invalid`` the scores are passed
    through ``clear_invalid_project`` before ranking.
    '''
    n_users = user_idx.stop - user_idx.start

    if not cache:
        X_pred = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                                  mu=mu, vad_data=vad_data)
    else:
        file_path = os.path.join(
            constants.PRED_DIR,
            'pred_%d_%d.npz' % (user_idx.start, user_idx.stop))
        if os.path.exists(file_path):
            X_pred = np.load(file_path)['X_pred']
        else:
            X_pred = _make_prediction(train_data, Et, Eb, user_idx, n_users,
                                      mu=mu, vad_data=vad_data)
            np.savez(file_path, X_pred=X_pred)

    if clear_invalid:
        X_pred = clear_invalid_project(train_data=train_data,
                                       vad_data=vad_data, X_pred=X_pred,
                                       lo=user_idx.start, hi=user_idx.stop)

    row_sel = np.arange(n_users)[:, np.newaxis]
    candidates = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    order = np.argsort(-X_pred[row_sel, candidates], axis=1)
    # X_pred[row_sel, ranked] is the sorted top-k predicted score.
    ranked = candidates[row_sel, order]

    # Discount template, one weight per rank.
    discounts = 1. / np.log2(np.arange(2, k + 2))

    heldout_batch = heldout_data[user_idx]
    dcg = (heldout_batch[row_sel, ranked].toarray() * discounts).sum(axis=1)
    idcg = np.array([discounts[:min(n, k)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return dcg / idcg
def Recall_at_k_batch(X_pred, heldout_batch, k=100, input_batch=None):
    """Recall@k per row, optionally masking already-seen items.

    Parameters
    ----------
    X_pred : 2-D float array of scores. NOTE: when ``input_batch`` is
        given, the scores at its non-zero positions are overwritten with
        ``-inf`` in place, so the caller's array is modified.
    heldout_batch : dense array or sparse matrix; entries > 0 are relevant.
    k : cut-off.
    input_batch : optional array/sparse matrix whose non-zero positions
        must never be recommended.

    Returns
    -------
    float32 array: hits in the top-k divided by
    ``max(min(k, number of relevant items), 1)``, so rows without relevant
    items score 0.
    """
    if input_batch is not None:
        X_pred[input_batch.nonzero()] = -np.inf

    batch_users = X_pred.shape[0]
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0)
    # Densify sparse input explicitly instead of a bare ``except:`` around
    # ``toarray()``, which also hid unrelated errors.
    if hasattr(X_true_binary, 'toarray'):
        X_true_binary = X_true_binary.toarray()

    hits = np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)
    hits = hits.astype(np.float32)
    denom = np.maximum(np.minimum(k, X_true_binary.sum(axis=1)), 1)
    return (hits / denom).astype(np.float32)
def get_recall(preds, targets, k=10):
    """Recall@k per row of ``preds`` against the dense ``targets`` array.

    The number of non-zero targets among the ``k`` highest-scored columns
    is divided by ``min(number of non-zero targets, k)``. The batch and
    vocabulary sizes are printed on every call.
    """
    batch_size, voc_size = preds.shape[0], preds.shape[1]
    print("batch_size", batch_size)
    print("voc_size", voc_size)

    top_cols = bn.argpartition(-preds, k, axis=1)[:, :k]
    selected = targets[np.arange(batch_size)[:, np.newaxis], top_cols]
    hit = np.squeeze(np.array(np.count_nonzero(selected, axis=1)))

    positives = np.count_nonzero(targets, axis=1)
    denominator = np.array([min(n, k) for n in positives])
    return hit / denominator
def ndcg(X_pred, heldout_batch, k=100):
    '''
    Normalized discounted cumulative gain@k for binary relevance.

    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance.

    Returns one value per row of ``X_pred``; a row without any stored
    held-out entry evaluates to 0 / 0.
    '''
    n_users = X_pred.shape[0]
    user_rows = np.arange(n_users)[:, np.newaxis]

    # Pick the k best columns without ordering them, then order just those.
    unordered_top = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    by_score = np.argsort(-X_pred[user_rows, unordered_top], axis=1)
    # X_pred[user_rows, ranked_cols] is the sorted top-k predicted score.
    ranked_cols = unordered_top[user_rows, by_score]

    # Discount template: position p (1-based) weighs 1 / log2(p + 1).
    weights = 1. / np.log2(np.arange(2, k + 2))

    found = heldout_batch[user_rows, ranked_cols].toarray()
    dcg = (found * weights).sum(axis=1)
    per_user_nnz = heldout_batch.getnnz(axis=1)
    idcg = np.array([weights[:min(n, k)].sum() for n in per_user_nnz])
    return dcg / idcg
def NDCG_at_k_batch_a(X_pred, heldout_batch, k=100):
    """NDCG@k per row for torch inputs, computed with numpy.

    ``X_pred`` is a torch tensor of scores and ``heldout_batch`` a torch
    tensor supporting ``to_dense()``; both are moved to CPU and converted
    to numpy arrays. The DCG sums the held-out values at the ranked top-k
    columns weighted by ``1 / log2(rank + 1)``; the ideal DCG uses the
    number of non-zero held-out entries per row, capped at ``k``.

    Returns a numpy array with one value per row (0 / 0 for rows without
    held-out entries).

    The statements that followed the first ``return`` in the previous
    version were unreachable and have been removed; the ground truth is
    now densified once instead of twice.
    """
    X_pred = X_pred.cpu().detach().numpy()
    heldout_batch = heldout_batch.to_dense().cpu().detach().numpy()
    nnz = np.count_nonzero(heldout_batch, axis=1)

    batch_users = X_pred.shape[0]
    rows = np.arange(batch_users)[:, np.newaxis]

    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[rows, idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[rows, idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))
    DCG = (heldout_batch[rows, idx_topk] * tp).sum(axis=1)
    IDCG = np.array([tp[:min(n, k)].sum() for n in nnz])
    return DCG / IDCG
def get_dealwifi(wifi_info, ntop=8):
    """Pick the ``ntop`` strongest WiFi entries and the connected one.

    ``wifi_info`` is a ``;``-separated list of ``id|value|state`` records.
    The records are split into a matrix; the entry whose state is
    ``'true'`` is the connected network (``'null'`` when none is).

    Returns ``(wf_name_2_idx(names), wf_name_2_idx([connected_name]))``
    where ``names`` holds the ids of the ``ntop`` largest values, padded
    with ``'null'`` when fewer than ``ntop`` records exist.

    Fixes over the previous version:
    * values are converted to float before use -- negating the raw string
      array raised ``TypeError`` in the short-list branch;
    * the long-list branch now selects the largest values, as its comment
      intended, instead of the smallest;
    * ``ntop`` replaces the hard-coded 8 in the length test and padding;
    * a list of exactly ``ntop`` records no longer asks for an
      out-of-bounds partition position;
    * the local that shadowed the builtin ``str`` was renamed.

    NOTE(review): assumes the value field is numeric -- confirm with the
    data producer.
    """
    records = wifi_info.split(';')
    every_wifi = np.array([each.split('|') for each in records])  # to matrix
    wifi_id = every_wifi[:, 0]
    wifi_value = every_wifi[:, 1]
    wifi_state = every_wifi[:, 2]
    print(wifi_state)
    print(wifi_value)

    if 'true' in wifi_state:
        connection_wifi_name = wifi_id[wifi_state.tolist().index('true')]
    else:
        connection_wifi_name = 'null'

    strengths = wifi_value.astype(float)
    if len(wifi_id) >= ntop:
        # Indices of the ntop largest values (unordered among themselves).
        top_idx = bottleneck.argpartition(-strengths, ntop - 1)[:ntop]
        names = wifi_id[top_idx]
    else:
        sort_index = np.argsort(-strengths)
        names = wifi_id[sort_index].tolist()
        names.extend(['null'] * (ntop - len(names)))
    return wf_name_2_idx(names), wf_name_2_idx([connection_wifi_name])
def precision_at_k_batch(train_data, heldout_data, Et, Eb, user_idx, k=20,
                         normalize=True, mu=None, vad_data=None):
    """Precision@k for every user in the ``user_idx`` slice.

    Returns, per user, the number of held-out positives among the ``k``
    highest-scored items divided by ``min(k, number of positives)`` when
    ``normalize`` is true, or by ``k`` otherwise.
    """
    batch_users = user_idx.stop - user_idx.start
    X_pred = _make_prediction(train_data, Et, Eb, user_idx, batch_users,
                              mu=mu, vad_data=vad_data)

    # After partitioning the negated scores, the first k indices of each
    # row point at that row's k highest scores (in no particular order).
    top_items = bn.argpartition(-X_pred, k, axis=1)[:, :k]

    # Boolean mask, all False to start with. The (batch_users, 1) column of
    # row numbers broadcasts against the (batch_users, k) item indices, so
    # each user's own top-k items are switched on.
    user_column = np.arange(batch_users)[:, np.newaxis]
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[user_column, top_items] = True

    X_true_binary = (heldout_data[user_idx] > 0).toarray()
    overlap = np.logical_and(X_true_binary, X_pred_binary)
    hits = overlap.sum(axis=1).astype(np.float32)

    divisor = np.minimum(k, X_true_binary.sum(axis=1)) if normalize else k
    return hits / divisor
def ans_output(X_pred, k=500, file_ans=0):
    '''
    Write each row's top-k column indices, best first, to ``file_ans``.

    The shape of ``X_pred`` is printed. One line is written per row: every
    index is followed by a single space and the line ends with a newline.
    Nothing is written when ``file_ans`` is left at its default of 0.
    Returns None.
    '''
    n_users = X_pred.shape[0]
    print(X_pred.shape)
    user_rows = np.arange(n_users)[:, np.newaxis]

    # Partitioning at k - 1 places the k best columns first; sort those.
    unordered_top = bn.argpartition(-X_pred, k - 1, axis=1)[:, :k]
    by_score = np.argsort(-X_pred[user_rows, unordered_top], axis=1)
    # X_pred[user_rows, ranked] is the sorted top-k predicted score.
    ranked = unordered_top[user_rows, by_score]

    if file_ans == 0:
        return
    for row in ranked:
        file_ans.write(''.join('{0} '.format(item) for item in row))
        file_ans.write('\n')
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized discounted cumulative gain@k for binary relevance.

    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance.

    ``X_pred`` is a numpy score array; ``heldout_batch`` is a torch tensor
    indexed with the ranked columns. The result is a torch tensor with one
    value per row.
    '''
    n_users = X_pred.shape[0]
    user_rows = np.arange(n_users)[:, np.newaxis]

    unordered_top = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    by_score = np.argsort(-X_pred[user_rows, unordered_top], axis=1)
    # X_pred[user_rows, ranked_cols] is the sorted top-k predicted score.
    ranked_cols = unordered_top[user_rows, by_score]

    # Discount template, converted so it can be combined with torch tensors.
    weights = torch.tensor(1. / np.log2(np.arange(2, k + 2)),
                           dtype=torch.float)

    found = heldout_batch[user_rows, ranked_cols].cpu()
    DCG = (found * weights).sum(dim=1)
    relevant_counts = (heldout_batch != 0).sum(dim=1)
    IDCG = torch.tensor([weights[:min(n, k)].sum() for n in relevant_counts])
    return DCG / IDCG
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    '''
    Normalized discounted cumulative gain@k for binary relevance.

    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance.

    Held-out values are binarised (``> 0``) before the discounted sum, so
    only the presence of an interaction matters for the DCG.
    '''
    n_users = X_pred.shape[0]
    user_rows = np.arange(n_users)[:, np.newaxis]

    # bn.argpartition returns indices such that the first kth positions
    # hold the smallest values. The scores are negated, so the first k
    # positions hold the k largest predictions (unordered).
    unordered_top = bn.argpartition(-X_pred, k, axis=1)[:, :k]

    # np.argsort gives the ordering indices; negating the scores makes the
    # order descending (best first).
    by_score = np.argsort(-X_pred[user_rows, unordered_top], axis=1)
    # X_pred[user_rows, ranked_cols] is the sorted top-k predicted score.
    ranked_cols = unordered_top[user_rows, by_score]

    # Discount template, one weight per rank.
    weights = 1. / np.log2(np.arange(2, k + 2))

    is_hit = heldout_batch[user_rows, ranked_cols].toarray() > 0
    dcg = (is_hit * weights).sum(axis=1)

    # The ideal DCG uses the smaller of the user's number of stored
    # interactions and k.
    idcg = np.array([weights[:min(n, k)].sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return dcg / idcg
def Recall_binary_at_k_batch(logits, y_true, k=10):
    """
    Normalised recall@k, after "Variational Autoencoders for
    Collaborative Filtering".

    :param logits: the un-normalised predictions (2-D array)
    :param y_true: sparse ground truth; entries > 0 are positives
    :param k: cut-off value
    :return: per-row hits in the top-k divided by min(k, number of positives)

    An assertion checks that every returned value lies in [0, 1]; rows
    without positives produce 0 / 0 and therefore fail it.
    """
    n_rows = logits.shape[0]
    row_column = np.arange(n_rows).reshape(n_rows, 1)

    top_cols = bn.argpartition(-logits, k, axis=1)[:, :k]
    predicted = np.zeros_like(logits, dtype=bool)
    predicted[row_column, top_cols] = True

    positives = (y_true > 0).toarray()
    hits = np.logical_and(positives, predicted).sum(axis=1)
    hits = hits.astype(np.float32)
    recall = hits / np.minimum(k, positives.sum(axis=1))
    assert (recall >= 0).all() and (recall <= 1).all()
    return recall
def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    """Recall@k for the rows that have at least one held-out positive.

    Returns ``(output, misclassified_tags)``. ``output`` is a list with one
    recall value (hits in the top-k divided by ``min(k, positives)``) per
    row whose denominator is non-zero, in row order; rows without
    positives are skipped. ``misclassified_tags`` is always an empty list.
    """
    n_users = X_pred.shape[0]
    top_cols = bn.argpartition(-X_pred, k, axis=1)[:, :k]
    predicted = np.zeros_like(X_pred, dtype=bool)
    predicted[np.arange(n_users)[:, np.newaxis], top_cols] = True

    positives = (heldout_batch > 0).toarray()
    hits = np.logical_and(positives, predicted).sum(axis=1)
    hits = hits.astype(np.float32)
    denom = np.minimum(k, positives.sum(axis=1))

    output = [hits[row] / denom[row]
              for row in range(np.shape(hits)[0])
              if denom[row] != 0]
    misclassified_tags = []
    return output, misclassified_tags
return a pd_train['labels'] = pd_train['labels'].apply(f) # change embedding embedding = np.array(list(pd_embedding['blog_jieba_vector'].apply(list))) print("pd_train:\n", pd_train) print("pd_test:\n", pd_test) # begin training dev_classes = lstm(list(pd_train['embedding_index'])[:100], list(pd_train['labels'])[:100], list(pd_test['embedding_index'])[:100], embedding_dim, embedding, maxlen, labels_len) print("dev classes:", dev_classes) # bottleneck import bottleneck as bl result = [] labels_name = np.array(labels_name) for classes in dev_classes: result.append(labels_name[bl.argpartition(-classes, 3)[:3]]) pd_result = pd.DataFrame(result) pd_result.to_csv(dp.ResultTxt, sep="\001", header=False, index=False, encoding='utf8')