def test_deterministic():
    """Indexes built from the same data and the same seed must answer identically."""
    rng = np.random.RandomState(42)
    train = rng.normal(0, 100, (1000, 50))
    queries = rng.normal(0, 100, (1000, 50))

    # Build and query two indexes, each with a fresh but identically seeded
    # random state.
    results = []
    for _ in range(2):
        index = NNDescent(train, random_state=np.random.RandomState(42))
        results.append(index.query(queries))

    (neighbors1, distances1), (neighbors2, distances2) = results
    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
class NNDescent(KNNIndex):
    """KNN index that delegates construction and search to ``LibNNDescent``."""

    # TODO: Make mapping from sklearn metrics to lib metrics

    def build(self, data):
        """Create the underlying ``LibNNDescent`` index over ``data`` with ``self.metric``."""
        self.index = LibNNDescent(data, metric=self.metric, n_neighbors=5)

    def query_train(self, data, k):
        """Query the index with the training data and drop the first result column.

        At most ``data.shape[0] - 1`` neighbours are requested (``k + 1`` when
        that is smaller). The first column of both returned arrays is
        discarded -- presumably because each training point matches itself
        first; TODO confirm.
        """
        n_search = min(data.shape[0] - 1, k + 1)
        found = self.index.query(data, k=n_search, queue_size=1)
        neighbors, distances = found
        return neighbors[:, 1:], distances[:, 1:]

    def query(self, query, k):
        """Return the underlying index's result for the ``k`` neighbours of ``query``."""
        result = self.index.query(query, k=k, queue_size=1)
        return result
def compute_tau(X, V, k=100, nbr_idx=None):
    """Compute a per-sample time scale ``tau`` from neighbour distances and speeds.

    Parameters
    ----------
    X:
        2-D array of sample coordinates (one row per sample).
    V:
        2-D array of vectors, one row per sample; only their norms are used.
    k:
        Number of neighbours to search for when ``nbr_idx`` is not given.
    nbr_idx:
        Optional integer array of neighbour indices (one row per sample).
        When given, no neighbour search is run and Euclidean distances to the
        listed neighbours are computed directly.

    Returns
    -------
    tau:
        Mean neighbour distance (first column excluded) divided by the norm
        of the corresponding row of ``V``.
    v:
        Row-wise norm of ``V``.
    """
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            # The result must be bound to ``dists`` (the name used below);
            # binding it to another name left ``dists`` undefined here.
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)
    else:
        dists = np.zeros(nbr_idx.shape)
        # One vectorised norm per row keeps the extra memory at O(k * dim)
        # while avoiding a Python-level loop over every neighbour.
        for i in range(nbr_idx.shape[0]):
            dists[i] = np.linalg.norm(X[nbr_idx[i]] - X[i], axis=1)

    # The first column is skipped when averaging.
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True, n_int_steps=20, cores=1):
    """Accumulate the per-sample results of ``construct_v`` into one sparse matrix.

    Parameters
    ----------
    func:
        Callable forwarded unchanged to ``construct_v``.
    X:
        2-D array of samples.
    nbrs_idx:
        Optional neighbour index rows. When ``None`` a ``k + 1`` nearest
        neighbour search is run on ``X``.
    dist:
        Optional neighbour distances, forwarded to ``construct_v``. It is
        overwritten by the search result when ``nbrs_idx`` is ``None``.
    k:
        Number of neighbours (one extra is requested from the search).
    distance_free:
        Forwarded to ``construct_v``. When false and ``dist`` is ``None``,
        pairwise distances of ``X`` are computed with ``pdist``.
    n_int_steps:
        Forwarded to ``construct_v``.
    cores:
        ``1`` runs sequentially; any other value uses a thread pool of that size.

    Returns
    -------
    V:
        Sum of all ``construct_v`` results, starting from an empty ``(n, n)``
        CSR matrix.
    nbrs:
        The fitted neighbour-search object, or ``None`` if ``nbrs_idx`` was
        supplied.
    """
    n, _ = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric='euclidean', n_neighbors=k + 1, n_jobs=-1, random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=k + 1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    D = pdist(X) if dist is None and not distance_free else None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(enumerate(nbrs_idx), total=len(nbrs_idx),
                           desc='Constructing diffusion graph from reconstructed vector field'):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)
    else:
        pool = ThreadPool(cores)
        try:
            res = pool.starmap(
                construct_v,
                zip(
                    itertools.repeat(X),
                    np.arange(len(nbrs_idx)),
                    nbrs_idx,
                    itertools.repeat(n_int_steps),
                    itertools.repeat(func),
                    itertools.repeat(distance_free),
                    itertools.repeat(dist),
                    itertools.repeat(D),
                    itertools.repeat(n),
                ),
            )
        finally:
            # Release the worker threads even when a task raised.
            pool.close()
            pool.join()
        # Seeding the reduction with the empty matrix mirrors the sequential
        # branch and keeps an empty ``nbrs_idx`` from raising.
        V = functools.reduce((lambda a, b: a + b), res, V)
    return V, nbrs
def get_Xss_confidence(self):
    """Score each row of ``self.Xss.get_X()`` by its mean distance to neighbours in the data.

    Up to ``self.k`` nearest neighbours (capped at ``n_samples - 1``) of each
    returned point are looked up in ``self.X_data``. The score is
    ``1 - mean_distance / max(mean_distance)``.

    Returns
    -------
    confidence:
        1-D array with one value per row of ``self.Xss.get_X()``.
    """
    X = self.X_data
    # ``.toarray()`` replaces the ``.A`` shortcut, which newer SciPy releases
    # no longer provide on sparse containers.
    X = X.toarray() if sp.issparse(X) else X
    Xss = self.Xss.get_X()

    n_neighbors = min(self.k, X.shape[0] - 1)
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X, metric='euclidean', n_neighbors=n_neighbors, n_jobs=-1, random_state=19491001)
        _, dist = nbrs.query(Xss, k=n_neighbors)
    else:
        alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=-1).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    confidence = 1 - dist_m / dist_m.max()
    return confidence
def test_tree_no_split(small_data, sparse_small_data, metric):
    """Query accuracy must reach 95% when ``leaf_size`` exceeds the number of rows.

    Runs once on the dense fixture and once on the sparse fixture, comparing
    against exact neighbours from ``NearestNeighbors``.
    """
    k = 10
    for data, data_type in zip([small_data, sparse_small_data], ["dense", "sparse"]):
        n_instances = data.shape[0]
        leaf_size = n_instances + 1  # just to be safe
        data_train = data[n_instances // 2:]
        data_test = data[:n_instances // 2]

        nnd = NNDescent(
            data_train,
            metric=metric,
            n_neighbors=data_train.shape[0] - 1,
            random_state=None,
            tree_init=True,
            leaf_size=leaf_size,
        )
        nnd.prepare()
        knn_indices, _ = nnd.query(data_test, k=k, epsilon=0.2)

        true_nnd = NearestNeighbors(metric=metric).fit(data_train)
        true_indices = true_nnd.kneighbors(data_test, k, return_distance=False)

        num_correct = 0.0
        for i in range(true_indices.shape[0]):
            # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
            num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

        percent_correct = num_correct / (true_indices.shape[0] * k)
        assert (
            percent_correct >= 0.95
        ), "NN-descent query did not get 95% for accuracy on nearest neighbors on {} data".format(
            data_type)
def get_Xss_confidence(self, k=50):
    """Score each row of ``self.Xss.get_X()`` with a Gaussian of its mean neighbour distance.

    The column-wise median of the data is appended as an extra reference row
    before the neighbour search and removed from the returned scores.

    Parameters
    ----------
    k:
        Number of neighbours to use (capped at ``n_samples - 1``).

    Returns
    -------
    confidence:
        1-D array, one value per row of ``self.Xss.get_X()``, scaled so that
        the largest value (reference row included) is 1.
    """
    X = self.X_data
    # ``.toarray()`` replaces the ``.A`` shortcut, which newer SciPy releases
    # no longer provide on sparse containers.
    X = X.toarray() if sp.issparse(X) else X
    Xss = self.Xss.get_X()
    Xref = np.median(X, 0)
    Xss = np.vstack((Xss, Xref))

    n_neighbors = min(k, X.shape[0] - 1)
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X, metric="euclidean", n_neighbors=n_neighbors, n_jobs=-1, random_state=19491001)
        _, dist = nbrs.query(Xss, k=n_neighbors)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=-1).fit(X)
        dist, _ = nbrs.kneighbors(Xss)

    dist_m = dist.mean(1)
    # confidence = 1 - dist_m / dist_m.max()
    # Kernel width: 10% of the average extent of the first two columns of X.
    sigma = 0.1 * 0.5 * (np.max(X[:, 0]) - np.min(X[:, 0]) + np.max(X[:, 1]) - np.min(X[:, 1]))
    confidence = gaussian_1d(dist_m, sigma=sigma)
    confidence /= np.max(confidence)
    # Drop the score of the appended reference row.
    return confidence[:-1]
def test_update_w_prepare_query_accuracy(nn_data, metric):
    """Query accuracy must reach 95% after ``update`` is called on a prepared index."""
    nnd = NNDescent(
        nn_data[200:800],
        metric=metric,
        n_neighbors=10,
        random_state=None,
        compressed=False,
    )
    nnd.prepare()

    nnd.update(xs_fresh=nn_data[800:])
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:])
    true_indices = true_nnd.kneighbors(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
def bandwidth_selector(X):
    """Compute an empirical bandwidth for a Gaussian kernel.

    The bandwidth is ``sqrt(2) * mean_distance / 1.5``, where the mean is
    taken over the distances to the ``max(2, int(0.2 * n))`` nearest
    neighbours of every sample with the first column excluded.
    """
    n, m = X.shape
    n_neighbors = max(2, int(0.2 * n))

    use_approximate_search = n > 200000 and m > 2
    if not use_approximate_search:
        tree_kind = "ball_tree" if m > 10 else "kd_tree"
        searcher = NearestNeighbors(n_neighbors=n_neighbors, algorithm=tree_kind, n_jobs=-1)
        searcher = searcher.fit(X)
        distances, _ = searcher.kneighbors(X)
    else:
        from pynndescent import NNDescent

        searcher = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=n_neighbors,
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = searcher.query(X, k=n_neighbors)

    mean_dist = np.mean(distances[:, 1:])
    return np.sqrt(2) * (mean_dist / 1.5)
def fit(self, X, V, k, s=None, tol=1e-4):
    """Fill ``self.P`` with a continuous-time transition rate matrix.

    Column ``i`` receives the values returned by
    ``compute_markov_trans_prob(..., cont_time=True)`` for the neighbours of
    sample ``i`` (entries ``<= tol`` are zeroed), and the diagonal entry is
    set to minus their sum.

    Parameters
    ----------
    X, V:
        Per-sample arrays passed row by row to ``compute_markov_trans_prob``.
    k:
        Number of neighbours; ``k + 1`` are requested from the search. Only
        used when ``self.nbrs_idx`` is ``None``.
    s:
        Forwarded to ``compute_markov_trans_prob``.
    tol:
        Threshold at or below which a value is set to zero.
    """
    self.__reset__()

    # knn clustering
    if self.nbrs_idx is not None:
        # NOTE(review): the array cached below already has its first column
        # removed, yet the loop further down skips column 0 again, so a
        # reused ``self.nbrs_idx`` loses one more neighbour -- confirm this
        # is intended.
        Idx = self.nbrs_idx
    else:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            searcher = NNDescent(X, metric='euclidean', n_neighbors=k + 1, n_jobs=-1, random_state=19491001)
            Idx, _ = searcher.query(X, k=k + 1)
        else:
            tree_kind = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            searcher = NearestNeighbors(n_neighbors=k + 1, algorithm=tree_kind, n_jobs=-1).fit(X)
            _, Idx = searcher.kneighbors(X)
        self.nbrs_idx = Idx[:, 1:]

    # compute transition prob.
    n = X.shape[0]
    self.P = np.zeros((n, n))
    for i in range(n):
        targets = Idx[i, 1:]
        p = compute_markov_trans_prob(X[i], V[i], X[targets], s, cont_time=True)
        p[p <= tol] = 0  # tolerance check
        self.P[targets, i] = p
        self.P[i, i] = -np.sum(p)
def test_transformer_equivalence():
    """``PyNNDescentTransformer`` must return the same neighbours as a direct ``NNDescent`` query."""
    N_NEIGHBORS = 15
    EPSILON = 0.15
    train = nn_data[:400]
    test = nn_data[:200]

    # Note we shift N_NEIGHBORS to conform to sklearn's KNeighborTransformer defn
    nnd = NNDescent(data=train,
                    n_neighbors=N_NEIGHBORS + 1,
                    random_state=42,
                    compressed=False)
    indices, dists = nnd.query(test, k=N_NEIGHBORS, epsilon=EPSILON)

    # Reorder every row by ascending neighbour index so it lines up with the
    # transformer output after ``sorted_indices()``.
    order = np.argsort(indices, axis=1)
    row_ids = np.arange(order.shape[0])[:, None]
    indices_sorted = indices[row_ids, order]
    dists_sorted = dists[row_ids, order]

    # Note we shift N_NEIGHBORS to conform to sklearn' KNeighborTransformer defn
    transformer = PyNNDescentTransformer(
        n_neighbors=N_NEIGHBORS,
        search_epsilon=EPSILON,
        random_state=42,
    ).fit(train, compress_index=False)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flatten())
    assert np.allclose(Xt.data, dists_sorted.flat)
def get_knn_graph(self, data):
    """Return neighbour indices for every row of ``data``.

    ``self.n_neighbors + 1`` neighbours are queried and the first column of
    the result is dropped, leaving ``self.n_neighbors`` columns.
    """
    index = NNDescent(
        data,
        metric="euclidean",
        n_jobs=self.n_jobs,
        random_state=self.random_state,
    )
    # Distances are returned by the query but not needed here.
    neighbor_ids, _ = index.query(data, k=self.n_neighbors + 1)
    return neighbor_ids[:, 1:]
def fit(self, X, V, k, s=None, method="qp", eps=None, tol=1e-4):
    """Fill ``self.P`` with a column-wise transition probability matrix.

    Parameters
    ----------
    X, V:
        Per-sample arrays passed row by row to the transition helpers.
    k:
        Number of nearest neighbours to search for.
    s:
        Forwarded to ``compute_markov_trans_prob`` for ``method="qp"``;
        inverted with ``np.linalg.inv`` for ``method="kernel"``.
    method:
        ``"qp"`` or ``"kernel"``.
    eps:
        Only used with ``method="kernel"``. When given, a density kernel is
        stored in ``self.Kd`` and its column sums divide the drift kernel.
    tol:
        Threshold at or below which a probability is set to zero.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"qp"`` nor ``"kernel"``.
    """
    # pass index
    # the parameter k will be replaced by a connectivity matrix in the future.

    # Reject unknown methods up front: they used to fall into the kernel
    # branch and fail later with a NameError on ``inv_s``.
    if method not in ("qp", "kernel"):
        raise ValueError("method must be 'qp' or 'kernel', got {!r}".format(method))

    self.__reset__()

    # knn clustering
    if X.shape[0] > 200000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X, metric="euclidean", n_neighbors=k, n_jobs=-1, random_state=19491001)
        Idx, _ = nbrs.query(X, k=k)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
        _, Idx = nbrs.kneighbors(X)

    # compute transition prob.
    n = X.shape[0]
    self.P = np.zeros((n, n))

    if method == "kernel":
        inv_s = np.linalg.inv(s)
        # compute density kernel
        if eps is not None:
            self.Kd = np.zeros((n, n))
            inv_eps = 1 / eps
            for i in range(n):
                self.Kd[i, Idx[i]] = compute_density_kernel(X[i], X[Idx[i]], inv_eps)
            D = np.sum(self.Kd, 0)

    for i in range(n):
        y = X[i]
        v = V[i]
        if method == "qp":
            # Column 0 of the neighbour list is skipped; the remaining mass
            # is placed on the diagonal.
            others = Idx[i, 1:]
            p = compute_markov_trans_prob(y, v, X[others], s)
            p[p <= tol] = 0  # tolerance check
            self.P[others, i] = p
            self.P[i, i] = 1 - np.sum(p)
        else:
            nbr = Idx[i]
            # p = compute_kernel_trans_prob(y, v, Y, inv_s)
            # Named ``drift`` so that the ``k`` parameter is not shadowed.
            drift = compute_drift_kernel(y, v, X[nbr], inv_s)
            if eps is not None:
                drift /= D[nbr]
            p = drift / np.sum(drift)
            p[p <= tol] = 0  # tolerance check
            p = p / np.sum(p)
            self.P[nbr, i] = p
def calculate_neighbours(genes, n_neighbours: int, inverse: bool, scale: str, log: bool,
                         description: str = '', return_neigh_sim: bool = False,
                         genes_query_data: pd.DataFrame = None, remove_self: bool = False):
    """
    Calculate neighbours of genes based on cosine distance.

    :param genes: Data frame as in class init, gene names (rows) should match the one in init.
    :param n_neighbours: Number of neighbours to obtain for each gene. This will include self for non-inverse.
    :param inverse: Calculate most similar neighbours (False) or neighbours with inverse profile (True).
    :param scale: Scale expression by gene with 'minmax' (min=0, max=1) or 'mean0std1' (mean=0, std=1) or 'none'.
    :param log: Should expression data be log2(data+pseudocount) transformed before scaling.
    :param description: If an error occurs while making KNN index report this description with the error.
    :param return_neigh_sim: Return tuple with nearest neighbour matrix and similarity matrix data frames,
        as returned by pynndescent, but with distance matrix converted to similarities and with added
        gene names for the index.
    :param genes_query_data: Use this as query. If None use genes.
    :param remove_self: Used only if return_neigh_sim is true. Whether to remove sample from its closest
        neighbours or not. If return_neigh_sim is False this is done automatically. This also removes the
        last column of neighbours if self is not present - thus it should not be used with inverse,
        as self will not be present.
    :return: Dict with keys being gene pair names tuple (smaller name by alphabet is the first tuple value)
        and values representing cosine similarity. Or see return_neigh_sim.
    :raises ValueError: If the index can not be built with or without tree initialisation.
    """
    genes_index, genes_query = NeighbourCalculator.get_index_query(genes=genes, inverse=inverse, scale=scale,
                                                                   log=log,
                                                                   genes_query_data=genes_query_data)
    # Random state was not set during the analysis in the paper so the obtained results might differ slightly
    try:
        index = NNDescent(genes_index, n_jobs=THREADS, metric='cosine', random_state=0)
    except ValueError:
        try:
            # The fallback must use the cosine metric as well; without it the
            # index falls back to the library's default metric while the
            # results are still interpreted as cosine similarities.
            index = NNDescent(genes_index, tree_init=False, n_jobs=THREADS, metric='cosine', random_state=0)
        except ValueError as err:
            raise ValueError('Dataset ' + description + ' can not be processed by pydescent') from err
        warnings.warn(
            'Dataset ' + description + ' index computed without tree initialisation',
            Warning)

    neighbours, distances = index.query(genes_query.tolist(), k=n_neighbours)

    if genes_query_data is None:
        genes_query_data = genes

    if not return_neigh_sim:
        return NeighbourCalculator.parse_neighbours(neighbours=neighbours, distances=distances,
                                                    genes_query=genes_query_data, genes_idx=genes)

    neighbours = NeighbourCalculator.parse_neighbours_matrix(neighbours=neighbours,
                                                             genes_query=genes_query_data,
                                                             genes_idx=genes)
    similarities = pd.DataFrame(NeighbourCalculator.parse_distances_matrix(distances),
                                index=genes_query_data.index)
    if remove_self:
        neighbours, similarities = NeighbourCalculator.remove_self_pynn_matrix(neighbours=neighbours,
                                                                               similarities=similarities)
    return neighbours, similarities
def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):
    """Build a regular grid over an embedding and weight its nearest observations.

    Parameters
    ----------
    X_emb:
        2-D array of embedding coordinates (observations x dimensions).
    xy_grid_nums:
        Number of grid points per dimension before scaling by ``density``.
    density:
        Multiplier applied to every entry of ``xy_grid_nums`` (default 1).
        Non-integer products are truncated to an integer.
    smooth:
        Multiplier applied to the average grid spacing to obtain the Gaussian
        scale (default 0.5).
    n_neighbors:
        Number of neighbours per grid point. Defaults to
        ``max(10, n_obs // 50)``, capped at ``n_obs``.

    Returns
    -------
    X_grid:
        Grid point coordinates, one row per grid point.
    p_mass:
        Sum of the neighbour weights of every grid point.
    neighs:
        Indices of the neighbours of every grid point.
    weight:
        Gaussian weights (``norm.pdf``) of the neighbour distances.
    """
    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs, scale = [], 0
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        # NOTE(review): M is padded using the already lowered m, so the upper
        # margin is slightly wider than the lower one. Kept as is so existing
        # grids do not change.
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        # np.linspace needs an integer sample count; a float ``density``
        # would otherwise make it raise.
        gr = np.linspace(m, M, int(xy_grid_nums[dim_i] * density))
        scale += gr[1] - gr[0]
        grs.append(gr)

    scale = scale / n_dim * smooth
    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        # Never ask for more neighbours than there are observations.
        n_neighbors = min(n_obs, max(10, int(n_obs / 50)))

    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
        from pynndescent import NNDescent

        nn = NNDescent(X_emb, metric='euclidean', n_neighbors=n_neighbors, n_jobs=-1, random_state=19491001)
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if X_emb.shape[1] > 10 else 'kd_tree'
        nn = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1, algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
def test_nn_descent_query_accuracy(nn_data):
    """Euclidean queries must recover at least 95% of the exact ``KDTree`` neighbours."""
    nnd = NNDescent(nn_data[200:], "euclidean", n_neighbors=10, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.2)

    tree = KDTree(nn_data[200:])
    true_indices = tree.query(nn_data[:200], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
def test_nn_descent_query_accuracy_angular(nn_data):
    """Cosine queries must recover at least 95% of the exact ``NearestNeighbors`` result."""
    nnd = NNDescent(nn_data[200:], "cosine", n_neighbors=30, random_state=None)
    knn_indices, _ = nnd.query(nn_data[:200], k=10, epsilon=0.32)

    nn = NearestNeighbors(metric="cosine").fit(nn_data[200:])
    true_indices = nn.kneighbors(nn_data[:200], n_neighbors=10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, (
        "NN-descent query did not get 95% " "accuracy on nearest neighbors"
    )
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    """Run a ``TRNET`` on ``X`` and return its nodes or their nearest samples.

    Parameters
    ----------
    X:
        2-D array of samples.
    n:
        First argument passed to ``TRNET``.
    return_index:
        When false, ``trnet.W`` is returned directly. Otherwise the index of
        the nearest row of ``X`` is returned for every row of ``trnet.W``.
    seed:
        Passed to ``TRNET`` and used as the random state of the approximate
        neighbour search.
    kwargs:
        Forwarded to ``TRNET.run``.
    """
    net = TRNET(n, X, seed)
    net.run(**kwargs)

    if not return_index:
        return net.W

    large_input = X.shape[0] > 200000 and X.shape[1] > 2
    if large_input:
        from pynndescent import NNDescent

        searcher = NNDescent(X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed)
        idx, _ = searcher.query(net.W, k=1)
    else:
        tree_kind = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        searcher = NearestNeighbors(n_neighbors=1, algorithm=tree_kind, n_jobs=-1).fit(X)
        _, idx = searcher.kneighbors(net.W)
    return idx[:, 0]
def test_joblib_dump():
    """An index round-tripped through joblib (in memory) must answer queries identically."""
    rng = np.random.RandomState(42)
    train = rng.normal(0, 100, (1000, 50))
    queries = rng.normal(0, 100, (1000, 50))

    original = NNDescent(train, "euclidean", {}, 10, random_state=None)
    neighbors1, distances1 = original.query(queries)

    # Serialise into an in-memory buffer and read it straight back.
    buffer = io.BytesIO()
    joblib.dump(original, buffer)
    buffer.seek(0)
    restored = joblib.load(buffer)

    neighbors2, distances2 = restored.query(queries)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
def test_sparse_nn_descent_query_accuracy():
    """Sparse euclidean queries must recover at least 95% of the exact ``KDTree`` neighbours."""
    nnd = NNDescent(
        sparse_nn_data[200:], "euclidean", n_neighbors=10, random_state=None
    )
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10)

    tree = KDTree(sparse_nn_data[200:].toarray())
    true_indices = tree.query(sparse_nn_data[:200].toarray(), 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors",
    )
def p_ij_sym(x, perp, verbose=False):
    """Build a symmetrised sparse affinity matrix from approximate nearest neighbours.

    Parameters
    ----------
    x:
        2-D array of points (one row per point).
    perp:
        Perplexity passed to ``find_beta``; ``min(n - 1, int(3 * perp))``
        neighbours are queried per point and the first one is discarded.
    verbose:
        Print progress information when true.

    Returns
    -------
    Sparse ``(n, n)`` matrix equal to ``0.5 * (P + P.T)``, where row ``i`` of
    ``P`` holds ``p_i`` evaluated on the neighbour distances of point ``i``.
    """
    num_pts = x.shape[0]
    k = min(num_pts - 1, int(3 * perp))
    if verbose:
        print('Indexing')
    index = NNDescent(x)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy releases.
    neighbors = np.empty((num_pts, k - 1), dtype=int)
    p_ij = np.empty((num_pts, k - 1))
    for i, xi in enumerate(x):
        if verbose:
            print('Calculating probabilities: {cur}/{tot}'.format(
                cur=i+1, tot=num_pts), end='\r')
        nn, dists = index.query([xi], k)
        beta = find_beta(dists[0, 1:], perp)
        neighbors[i] = nn[0, 1:]
        p_ij[i] = p_i(dists[0, 1:], beta)

    row_indices = np.repeat(np.arange(num_pts), k - 1)
    # The shape is given explicitly: when inferred from the data the matrix is
    # not square whenever the highest-numbered points are nobody's neighbour,
    # and adding its transpose then fails.
    p = csr_matrix((p_ij.ravel(), (row_indices, neighbors.ravel())),
                   shape=(num_pts, num_pts))
    return 0.5 * (p + p.transpose())
def test_transformer_equivalence():
    """``PyNNDescentTransformer`` must return the same neighbours as a direct ``NNDescent`` query."""
    N_NEIGHBORS = 15
    QUEUE_SIZE = 5.0
    train = nn_data[:400]
    test = nn_data[:200]

    nnd = NNDescent(data=train, n_neighbors=N_NEIGHBORS, random_state=42)
    indices, dists = nnd.query(test, k=N_NEIGHBORS, queue_size=QUEUE_SIZE)

    # Reorder every row by ascending neighbour index so it lines up with the
    # transformer output after ``sorted_indices()``.
    order = np.argsort(indices, axis=1)
    row_ids = np.arange(order.shape[0])[:, None]
    indices_sorted = indices[row_ids, order]
    dists_sorted = dists[row_ids, order]

    transformer = PyNNDescentTransformer(
        n_neighbors=N_NEIGHBORS,
        search_queue_size=QUEUE_SIZE,
        random_state=42,
    )
    transformer = transformer.fit(train)
    Xt = transformer.transform(test).sorted_indices()

    assert np.all(Xt.indices == indices_sorted.flat)
    assert np.allclose(Xt.data, dists_sorted.flat)
def test_generate_triplets(self):
    """Generated triplets must have the expected count and closer similar pairs.

    For every triplet the scaled squared distance of the (anchor, similar)
    pair has to be smaller than that of the (anchor, outlier) pair.
    """
    key = random.PRNGKey(42)
    n_points = 1000
    n_inliers = 10
    n_outliers = 5
    n_random = 3
    n_extra = min(n_inliers + 50, n_points)

    # Currently testing it only for 'euclidean' distance. The test for other
    # cases breaks due to issues with the knn search NNDescent package, but
    # it works fine when tested in a colab.
    for distance in ['euclidean']:
        inputs = np.random.normal(size=(n_points, 100))

        index = NNDescent(inputs, metric=distance)
        index.prepare()
        neighbors = index.query(inputs, n_extra)[0]
        # Prepend each point's own index as the first column.
        own_index = np.arange(n_points).reshape([-1, 1])
        neighbors = np.concatenate((own_index, neighbors), 1)

        distance_fn = trimap.get_distance_fn(distance)
        _, _, sig = trimap.find_scaled_neighbors(inputs, neighbors, distance_fn)
        triplets, _ = trimap.generate_triplets(
            key,
            inputs,
            n_inliers=n_inliers,
            n_outliers=n_outliers,
            n_random=n_random,
            distance=distance,
        )

        anchors, similars, outliers = triplets[:, 0], triplets[:, 1], triplets[:, 2]

        similar_pairs_distances = distance_fn(inputs[anchors], inputs[similars])**2
        similar_pairs_distances /= (sig[anchors] * sig[similars])

        outlier_pairs_distances = distance_fn(inputs[anchors], inputs[outliers])**2
        outlier_pairs_distances /= (sig[anchors] * sig[outliers])

        npt.assert_array_less(similar_pairs_distances, outlier_pairs_distances)

        n_knn_triplets = inputs.shape[0] * n_inliers * n_outliers
        n_random_triplets = inputs.shape[0] * n_random
        npt.assert_equal(triplets.shape, [n_knn_triplets + n_random_triplets, 3])
def test_sparse_nn_descent_query_accuracy_angular():
    """Sparse cosine queries must recover at least 95% of the exact ``NearestNeighbors`` result."""
    nnd = NNDescent(sparse_nn_data[200:], "cosine", n_neighbors=50, random_state=None)
    knn_indices, _ = nnd.query(sparse_nn_data[:200], k=10, epsilon=0.36)

    nn = NearestNeighbors(metric="cosine").fit(sparse_nn_data[200:].toarray())
    true_indices = nn.kneighbors(sparse_nn_data[:200].toarray(),
                                 n_neighbors=10,
                                 return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert_greater_equal(
        percent_correct,
        0.95,
        "Sparse NN-descent query did not get 95% " "accuracy on nearest neighbors",
    )
def module_computing():
    """Return the sorted 5th-nearest-neighbour distances of the processed data as JSON.

    The request's ``data`` form field is parsed and its selected nodes are
    resolved, but the neighbour computation itself reads
    ``uploads/processed_data.csv`` under ``APP_STATIC`` and uses every column
    except the last one.
    """
    payload = json.loads(request.form.get('data'))
    selected_nodes = payload['nodes']
    data, cols = get_selected_data(selected_nodes)
    module_info = payload['module_info']
    # data_new = call_module_function(data, cols, module_info)
    # data_new['kmeans_cluster'] = KMeans(n_clusters=4, random_state=0).fit(data_new).labels_
    # data_new = data_new.to_json(orient='records')
    # return jsonify(module_result=data_new)
    # return data_new

    # kNN graph
    from pynndescent import NNDescent

    frame = pd.read_csv(APP_STATIC+"/uploads/processed_data.csv")
    # Every column but the last one is treated as an activation.
    activations = frame.iloc[:, :-1]

    k = 5
    index = NNDescent(activations, n_neighbors=5, metric='euclidean')
    _, dist = index.query(activations, k=k)

    # Sort each distance column independently, then keep the k-th column.
    sorted_dist = np.sort(dist, axis=0)
    s_dist = list(sorted_dist[:, k - 1].astype("str"))
    print(s_dist)
    return jsonify(s_dist=s_dist)
def test_pickle_unpickle():
    """An index round-tripped through pickle must answer queries identically."""
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(
        x1,
        "euclidean",
        {},
        10,
        random_state=None,
    )
    neighbors1, distances1 = index1.query(x2)

    tmp_file = "test_tmp.pkl"
    try:
        # Context managers close both handles before the file is deleted; an
        # open handle makes the removal fail on some platforms.
        with open(tmp_file, "wb") as fh:
            pickle.dump(index1, fh)
        with open(tmp_file, "rb") as fh:
            index2 = pickle.load(fh)
    finally:
        # Do not leave the scratch file behind when dumping or loading fails.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
def test_joblib_dump():
    """An index round-tripped through a joblib file must answer queries identically."""
    seed = np.random.RandomState(42)

    x1 = seed.normal(0, 100, (1000, 50))
    x2 = seed.normal(0, 100, (1000, 50))

    index1 = NNDescent(
        x1,
        "euclidean",
        {},
        10,
        random_state=None,
    )
    neighbors1, distances1 = index1.query(x2)

    tmp_file = "test_tmp.dump"
    try:
        joblib.dump(index1, tmp_file)
        index2 = joblib.load(tmp_file)
    finally:
        # Do not leave the scratch file behind when dumping or loading fails.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    neighbors2, distances2 = index2.query(x2)

    np.testing.assert_equal(neighbors1, neighbors2)
    np.testing.assert_equal(distances1, distances2)
def test_one_dimensional_data(nn_data, metric):
    """Query accuracy must reach 95% on single-column data without tree initialisation."""
    nnd = NNDescent(
        nn_data[200:, :1],
        metric=metric,
        n_neighbors=20,
        random_state=None,
        tree_init=False,
    )
    nnd.prepare()

    knn_indices, _ = nnd.query(nn_data[:200, :1], k=10, epsilon=0.2)

    true_nnd = NearestNeighbors(metric=metric).fit(nn_data[200:, :1])
    true_indices = true_nnd.kneighbors(nn_data[:200, :1], 10, return_distance=False)

    num_correct = 0.0
    for i in range(true_indices.shape[0]):
        # np.isin replaces the deprecated np.in1d (same result for 1-D rows).
        num_correct += np.sum(np.isin(true_indices[i], knn_indices[i]))

    percent_correct = num_correct / (true_indices.shape[0] * 10)
    assert percent_correct >= 0.95, ("NN-descent query did not get 95% "
                                     "accuracy on nearest neighbors")
def fate_bias(
    adata,
    group,
    basis="umap",
    inds=None,
    speed_percentile=5,
    dist_threshold=None,
    source_groups=None,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    **kwargs,
):
    """Calculate the lineage (fate) bias of states whose trajectory are predicted.

    Fate bias is currently calculated as the percentage of points along the predicted cell fate trajectory whose
    distance to their 0-th nearest neighbors on the data are close enough (determined by median 1-st nearest neighbors
    of all observed cells and the dist_threshold) to any cell from each group specified by `group` key. The details are
    described as following:

    Cell fate predicted by our vector field method sometimes end up in regions that are not sampled with cells. We thus
    developed a heuristic method to iteratively walk backward the integration path to assign cell fate. We first
    identify the regions with small velocity in the tail of the integration path (determined by `speed_percentile`),
    then we check whether the distance of 0-th nearest points on the observed data to all those points are far away
    from the observed data (determined by `dist_threshold`). If they are not all close to data, we then walk backwards
    along the trajectory by one time step until the distance of any currently visited integration path's data points'
    0-th nearest points to the observed cells is close enough. In order to calculate the cell fate probability, we
    diffuse one step further of the identified nearest neighbors from the integration to identify more nearest observed
    cells, especially those from terminal cell types in case nearby cells first identified are all close to some
    random progenitor cells. Then we use group information of those observed cells to define the fate probability.

    `fate_bias` calculates a confidence score for the calculated fate probability with a simple metric, defined as
        :math:`1 - (sum(distances > dist_threshold * median_dist) + walk_back_steps) / (len(indices) + walk_back_steps)`

    The `distance` is currently visited integration path's data points' 0-th nearest points to the observed cells.
    `median_dist` is median distance of their 1-st nearest cell distance of all observed cells. `walk_back_steps` is
    the steps walked backward along the integration path until all currently visited integration points's 0-th nearest
    points to the observed cells satisfy the distance threshold. `indices` are the time indices of integration points
    that is regarded as the regions with `small velocity` (note when walking backward, those corresponding points do
    not necessarily have small velocity anymore).

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            AnnData object that contains the predicted fate trajectories in the `uns` attribute.
        group: `str`
            The column key that corresponds to the cell type or other group information for quantifying the bias of
            cell state.
        basis: `str` or None (default: `umap`)
            The embedding data space where cell fates were predicted and cell fates bias will be quantified.
        inds: `list` or `float` or None (default: `None`)
            The indices of the time steps that will be used for calculating fate bias. If inds is None, the last a few
            steps of the fate prediction based on the `speed_percentile` will be used. If inds is a float (between 0
            and 1), it will be regarded as a percentage, and the last percentage of steps will be used for fate bias
            calculation. Otherwise inds need to be a list of integers of the time steps.
        speed_percentile: `float` (default: `5`)
            The percentile of speed that will be used to determine the terminal cells (or sink region on the
            prediction path where speed is smaller than this speed percentile).
        dist_threshold: `float` or `None` (default: `None`)
            A multiplier of the median nearest cell distance on the embedding to determine cells that are outside the
            sampled domain of cells. If the mean distance of identified "terminal cells" is above this number, we will
            look backward along the trajectory (by decreasing all indices by 1) until it finds cells satisfying this
            threshold. By default it is set to be 1 to ensure only considering points that are very close to observed
            data points.
        source_groups: `list` or `None` (default: `None`)
            The groups that corresponds to progenitor groups. They has to have at least one intersection with the
            groups from the `group` column. If group is not `None`, any identified "source_groups" cells that happen
            to be in those groups will be ignored and the probability of cell fate of those cells will be reassigned
            to the group that has the highest fate probability among other non source_groups group cells.
        metric: `str` or callable, default='euclidean'
            The distance metric passed to the pynndescent search.
            NOTE(review): the `NearestNeighbors` fallback used for small data does not receive `metric` or
            `metric_kwds` -- confirm whether that is intended.
        metric_kwds : dict, default=None
            Additional keyword arguments for the metric function.
        cores: `int` (default: 1)
            The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
            :obj:`joblib.parallel_backend` context. ``-1`` means using all processors.
        seed: `int` (default `19491001`)
            Random seed to ensure the reproducibility of each run.
        kwargs:
            Additional arguments that will be passed to the pynndescent search.

    Returns
    -------
        fate_bias: `pandas.DataFrame`
            A DataFrame that stores the confidence and the fate bias for each cell state (row) to each cell group
            (column).

    Raises
    ------
        ValueError
            If `group`, the basis key or the fate key is missing from `adata`, or if `source_groups` shares no entry
            with the `group` column.
    """
    if dist_threshold is None:
        dist_threshold = 1

    if group not in adata.obs.keys():
        raise ValueError(f"The group {group} you provided is not a key of .obs attribute.")
    clusters = adata.obs[group]

    basis_key = "X_" + basis if basis is not None else "X"
    fate_key = "fate_" + basis if basis is not None else "fate"

    if basis_key not in adata.obsm.keys():
        raise ValueError(f"The basis {basis_key} you provided is not a key of .obsm attribute.")
    if fate_key not in adata.uns.keys():
        raise ValueError(
            f"The {fate_key} key is not existed in the .uns attribute of the adata object. You need to run "
            f"dyn.pd.fate(adata, basis='{basis}') before calculate fate bias."
        )

    if source_groups is not None:
        if isinstance(source_groups, str):
            source_groups = [source_groups]
        # Keep the user's input for the error message; the intersection is
        # empty in exactly the case the message is shown.
        requested_groups = source_groups
        source_groups = list(set(source_groups).intersection(clusters))
        if len(source_groups) == 0:
            raise ValueError(
                f"the {requested_groups} you provided doesn't intersect with any groups in the {group} column."
            )

    X = adata.obsm[basis_key] if basis_key != "X" else adata.X

    if X.shape[0] > 5000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric=metric,
            metric_kwds=metric_kwds,
            n_neighbors=30,
            n_jobs=cores,
            random_state=seed,
            **kwargs,
        )
        _, distances = nbrs.query(X, k=30)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg, n_jobs=cores).fit(X)
        distances, _ = nbrs.kneighbors(X)

    def _query_knn(points):
        """Return ``(indices, distances)`` of ``points`` for either search backend."""
        if hasattr(nbrs, "query"):
            return nbrs.query(points, k=30)
        found_dist, found_idx = nbrs.kneighbors(points)
        return found_idx, found_dist

    median_dist = np.median(distances[:, 1])

    pred_dict = {}
    cell_predictions, cell_indx = adata.uns[fate_key]["prediction"], adata.uns[fate_key]["init_cells"]
    t = adata.uns[fate_key]["t"]
    confidence = np.zeros(len(t))

    for i, prediction in tqdm(enumerate(cell_predictions), desc="calculating fate distributions"):
        cur_t, n_steps = t[i], len(t[i])

        # ensure to identify sink where the speed is very slow if inds is not provided.
        # if inds is the percentage, use the last percentage of steps to check for cell fate bias.
        # otherwise inds need to be a list.
        if inds is None:
            avg_speed = np.linalg.norm(np.diff(prediction, 1), axis=0) / np.diff(cur_t)
            sink_checker = np.where(avg_speed[::-1] > np.percentile(avg_speed, speed_percentile))[0]
            indices = np.arange(n_steps - max(min(sink_checker), 10), n_steps)
        elif isinstance(inds, float):
            # ``inds is float`` compared against the type object itself and
            # never matched, so a fractional ``inds`` was never honoured.
            indices = np.arange(int(n_steps - inds * n_steps), n_steps)
        else:
            # An array is needed for the ``indices - 1`` walk-back arithmetic.
            indices = np.asarray(inds)

        knn, distances = _query_knn(prediction[:, indices].T)

        # if final steps too far away from observed cells, ignore them
        walk_back_steps = 0
        while True:
            is_close = distances.flatten() < dist_threshold * median_dist
            if is_close.any():
                # let us diffuse one step further to identify cells from terminal cell types in case
                # cells with indices are all close to some random progenitor cells.
                knn, _ = _query_knn(X[knn.flatten(), :])

                # ``knn`` holds row positions of X, hence positional lookup.
                flat_knn = knn.flatten()
                fate_prob = clusters.iloc[flat_knn].value_counts() / len(flat_knn)
                if source_groups is not None:
                    source_p = fate_prob[source_groups].sum()
                    if 1 > source_p > 0:
                        # Move the source groups' mass to the most likely
                        # remaining group.
                        fate_prob[source_groups] = 0
                        fate_prob[fate_prob.idxmax()] += source_p

                pred_dict[i] = fate_prob

                n_far = np.count_nonzero(~is_close)
                confidence[i] = 1 - (n_far + walk_back_steps) / (len(is_close) + walk_back_steps)
                break

            walk_back_steps += 1

            if (indices - 1 < 0).any():
                # Ran off the start of the trajectory: report NaN for all groups.
                pred_dict[i] = clusters.iloc[knn.flatten()].value_counts() * np.nan
                break

            knn, distances = _query_knn(prediction[:, indices - 1].T)
            # Only the nearest observed cell is kept after walking back.
            knn, distances = knn[:, 0], distances[:, 0]
            indices = indices - 1

    bias = pd.DataFrame(pred_dict).T
    conf = pd.DataFrame({"confidence": confidence}, index=bias.index)
    bias = pd.merge(conf, bias, left_index=True, right_index=True)

    if cell_indx is not None:
        bias.index = cell_indx

    return bias
def cell_wise_confidence(
    adata,
    X_data=None,
    V_data=None,
    ekey="M_s",
    vkey="velocity_S",
    neighbors_from_basis=False,
    method="jaccard",
):
    """Calculate the cell-wise velocity confidence metric.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        X_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single
            cells). When provided it is used as is; otherwise the expression is read from `adata` (see `ekey`) and
            passed through `inverse_norm`.
        V_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of
            single cells). Note that X, V_mat need to have the exact dimensionalities. When `None`,
            `adata.layers[vkey]` is used.
        ekey: `str` (optional, default `M_s`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, it is the
            smoothed expression `M_s`. The special value `X` reads `adata.X` instead of a layer.
        vkey: 'str' (optional, default `velocity_S`)
            The dictionary key that corresponds to the estimated velocity values in layers attribute.
        neighbors_from_basis: `bool` (optional, default `False`)
            Whether to construct nearest neighbors (30 per cell) from the space spanned by `X`, instead of using the
            graph stored in `adata.obsp["connectivities"]`.
        method: `str` (optional, default `jaccard`)
            Which method will be used for calculating the cell wise velocity confidence metric. One of `jaccard`,
            `hybrid`, `cosine`, `consensus` or `correlation`.
            By default it uses `jaccard` index, which measures how well each velocity vector meets the geometric
            constraints defined by the local neighborhood structure. Jaccard index is calculated as the fraction of
            the number of the intersected set of nearest neighbors from each cell at current expression state (X) and
            that from the future expression state (X + V) over the number of the union of these two sets.
            `hybrid` multiplies the jaccard index by the mean `consensus` score between a cell and the cells in its
            intersected neighbor set. `cosine`, `consensus` and `correlation` average the respective score between a
            cell's velocity and the velocities of its neighbors in `adata.uns["neighbors"]["indices"]`. The `cosine`
            or `correlation` method is similar to that used by scVelo (https://github.com/theislab/scvelo).

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            Returns an updated `~anndata.AnnData` with `.obs[method + "_velocity_confidence"]` as the cell-wise
            velocity confidence.

    Raises
    ------
        NotImplementedError
            If `method` is `divergence`, which is recognized but has no implementation.
        Exception
            If `method` is not one of the supported methods.
    """
    neighbor_methods = ("cosine", "consensus", "correlation")

    # Validate the method before doing any (potentially expensive) neighbor work. Previously "divergence" fell
    # through a bare `pass` and crashed later with an unbound `confidence` variable.
    if method == "divergence":
        raise NotImplementedError(
            "The divergence method for cell-wise velocity confidence calculation is not implemented yet")
    if method not in ("jaccard", "hybrid") + neighbor_methods:
        raise Exception(
            "The input {} method for cell-wise velocity confidence calculation is not implemented"
            " yet".format(method))

    if method in neighbor_methods:
        # These methods need an explicit kNN index table; derive it from the connectivity graph when missing.
        if "indices" not in adata.uns["neighbors"].keys():
            adata.uns["neighbors"]["indices"], _ = adj_to_knn(
                adata.obsp["connectivities"],
                n_neighbors=adata.uns["neighbors"]["params"]["n_neighbors"])

    if X_data is not None:
        X = X_data
    elif ekey == "X":
        # Temporarily force the "log1p" setting for the inverse transform of adata.X. The key is only rebound
        # (never mutated in place), so the previous value needs no copy; try/finally guarantees it is restored
        # even if inverse_norm raises.
        norm_method = adata.uns["pp"]["norm_method"]
        adata.uns["pp"]["norm_method"] = "log1p"
        try:
            X = inverse_norm(adata, adata.X)
        finally:
            adata.uns["pp"]["norm_method"] = norm_method
    else:
        X = inverse_norm(adata, adata.layers[ekey])
    V = adata.layers[vkey] if V_data is None else V_data

    if not neighbors_from_basis:
        check_and_recompute_neighbors(adata, result_prefix="")
        n_neigh = adata.uns["neighbors"]["params"]["n_neighbors"]
        X_neighbors = adata.obsp["connectivities"]
    else:
        n_neigh = 30
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=n_neigh + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n_neigh + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        # NOTE(review): column 0 is used as the source cell and the remaining columns as its neighbors; this
        # assumes each cell is returned as its own nearest neighbor -- confirm for data with duplicate points.
        row = np.repeat(nbrs_idx[:, 0], n_neigh)
        col = nbrs_idx[:, 1:].flatten()
        X_neighbors = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )

    # The stored parameter may be an array rather than a scalar; use its first entry in that case.
    n_neigh = n_neigh[0] if isinstance(n_neigh, np.ndarray) else n_neigh
    n_pca_components = adata.obsm["X"].shape[1]

    # Keep only the columns selected by get_finite_inds (presumably those with finite velocity values).
    finite_inds = get_finite_inds(V, 0)
    X, V = X[:, finite_inds], V[:, finite_inds]

    if method == "jaccard":
        jac, _, _ = jaccard(X, V, n_pca_components, n_neigh, X_neighbors)
        confidence = jac
    elif method == "hybrid":
        # this is inspired from the locality preservation paper
        jac, intersect_, _ = jaccard(X, V, n_pca_components, n_neigh,
                                     X_neighbors)

        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc=
                "calculating hybrid method (jaccard + consensus) based cell wise confidence",
        ):
            neigh_ids = _nonzero_columns(intersect_, i)
            confidence[i] = jac[i] * _mean_neighbor_score(
                V, i, neigh_ids, consensus)
    else:
        # cosine / consensus / correlation ("correlation" is equivalent to scVelo): average a pairwise score
        # between each cell's velocity and the velocities of its stored nearest neighbors.
        scorers = {
            "cosine":
            lambda v_i, v_j: einsum_correlation(
                v_i[None, :], v_j, type="cosine")[0, 0],
            "consensus":
            consensus,
            "correlation":
            lambda v_i, v_j: einsum_correlation(
                v_i[None, :], v_j, type="pearson")[0, 0],
        }
        score = scorers[method]

        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc=f"calculating {method} based cell wise confidence",
        ):
            confidence[i] = _mean_neighbor_score(V, i, indices[i], score)

    adata.obs[method + "_velocity_confidence"] = confidence

    return adata


def _dense_row(mat, i):
    """Return row `i` of a dense or scipy-sparse matrix as a flat 1-D array."""
    row = mat[i]
    if issparse(row):
        row = row.toarray()
    return np.asarray(row).flatten()


def _nonzero_columns(mat, i):
    """Return the column indices of the non-zero entries in row `i` of `mat`."""
    row = mat[i]
    if issparse(row):
        row = row.toarray()
    # Take the indices along the LAST axis: a densified sparse row is 2-D (1 x n), for which the first axis of
    # np.nonzero/np.where only yields zeros (the row index) rather than the neighbor ids.
    return np.nonzero(np.asarray(row))[-1]


def _mean_neighbor_score(V, i, neigh_ids, score):
    """Average `score(v_i, v_j)` over the neighbors `j` in `neigh_ids` of cell `i`."""
    v_i = _dense_row(V, i)  # densify once instead of once per neighbor
    return np.mean([score(v_i, _dense_row(V, j)) for j in neigh_ids])