def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors, device, n_jobs=-2):
    # Function to construct mutually nearest neighbors between cells
    # from two timepoints
    if device == "gpu":
        from cuml import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    elif device == "cpu":
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean', n_jobs=n_jobs)

    print('t+1 neighbors of t...')
    nbrs.fit(data_df.loc[t1_cells, :].values)
    t1_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t2_cells, :].values, mode='distance')

    print('t neighbors of t+1...')
    nbrs.fit(data_df.loc[t2_cells, :].values)
    t2_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t1_cells, :].values, mode='distance')

    # Mutually nearest neighbors
    mnn = t2_nbrs.multiply(t1_nbrs.T)
    mnn = mnn.sqrt()
    return mnn
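# Illustrative usage sketch for `_construct_mnn` (not part of the original
# module): builds MNN links between two synthetic "timepoints". All names
# in this demo (`_demo_construct_mnn`, `demo_df`, ...) are hypothetical.
def _demo_construct_mnn():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    demo_df = pd.DataFrame(rng.normal(size=(200, 20)),
                           index=[f"cell_{i}" for i in range(200)])
    t1 = list(demo_df.index[:100])   # cells from timepoint t
    t2 = list(demo_df.index[100:])   # cells from timepoint t+1
    mnn = _construct_mnn(t1, t2, demo_df, n_neighbors=10, device="cpu")
    # `mnn` is a sparse (len(t1) x len(t2)) matrix; entry (i, j) is nonzero
    # only when cells i and j select each other across the two timepoints.
    print(mnn.shape, mnn.nnz)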
def reduce_dimensionality(self, embeddings):
    """Reduce dimensionality of embeddings using UMAP and train a UMAP model.

    Args:
        embeddings (cupy.ndarray): The extracted embeddings using the
            sentence transformer module.

    Returns:
        umap_embeddings: The reduced embeddings
    """
    m_cos = NearestNeighbors(n_neighbors=15, metric="cosine")
    m_cos.fit(embeddings)
    knn_graph_cos = m_cos.kneighbors_graph(embeddings, mode="distance")
    u1 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
    umap_embeddings = u1.fit_transform(embeddings, knn_graph=knn_graph_cos)
    return umap_embeddings
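# Standalone sketch of the same precomputed-kNN-to-UMAP flow shown above,
# assuming the cuML classes the method relies on (`NearestNeighbors`, `UMAP`).
# All names in this demo are hypothetical.
def _demo_reduce_dimensionality():
    import cupy as cp
    from cuml.manifold import UMAP
    from cuml.neighbors import NearestNeighbors

    embeddings = cp.random.random((1000, 384), dtype=cp.float32)  # fake embeddings
    nn = NearestNeighbors(n_neighbors=15, metric="cosine")
    nn.fit(embeddings)
    knn_graph = nn.kneighbors_graph(embeddings, mode="distance")
    reducer = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
    # Passing the precomputed graph lets UMAP skip its internal neighbor search.
    reduced = reducer.fit_transform(embeddings, knn_graph=knn_graph)
    print(reduced.shape)  # (1000, 5)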
def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """
    Parameters
    ----------
    X : `ndarray` or tuple of (X, y)
    n_clusters : int, default=5
        Number of clusters; also used as the fallback for `n_neighbors`.
    n_neighbors : int, optional
        The top K closest datapoints you want the algorithm to return.
        Defaults to `n_clusters` when not given. Currently, this value
        must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:

        - 'connectivity' : will return the connectivity matrix with ones
          and zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors
          according to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how to generate cluster predictions from the
        neighbors graph.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to the :meth:`fit` method.

        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.
    """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    framework = kwargs.pop('framework')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(framework)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors as KNN
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
        knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
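# Hedged usage sketch for `fast_knn` (hypothetical demo): assumes
# `_check_cuml('sklearn')` selects the scikit-learn backend and that the
# `predict` method attached by the monkey-patching above accepts an array.
def _demo_fast_knn():
    import numpy as np

    X_demo = np.random.RandomState(1).normal(size=(300, 8))
    knn = fast_knn(X_demo, n_clusters=3, cluster_mode='spectral',
                   framework='sklearn')
    labels = knn.predict(X_demo)  # provided by the patched `nn_predict`
    graph = knn.kneighbors_graph(X_demo, mode='distance')
    print(labels.shape, graph.shape)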
def diffusion(
    adata: AnnData,
    n_components=10,
    knn=30,
    alpha=0,
    multiscale: bool = True,
    n_eigs: int = None,
    device="cpu",
    n_pcs=50,
    copy=False,
):
    """\
    Wrapper to generate diffusion maps using Palantir.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Number of diffusion components.
    knn
        Number of nearest neighbors for graph construction.
    alpha
        Normalization parameter for the diffusion operator.
    multiscale
        Whether to get the multiscale diffusion space
        (calls palantir.utils.determine_multiscale_space).
    n_eigs
        If multiscale is True, how many components to retain.
    device
        Run method on either `cpu` or on `gpu`.
    n_pcs
        Number of PC components.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns AnnData, else it updates fields of `adata`:

        `.obsm['X_diffusion']`
            if `multiscale = False`, diffusion space.
        `.obsm['X_diffusion_multiscale']`
            if `multiscale = True`, multiscale diffusion space.
        `.uns['diffusion']`
            dict containing results from Palantir.
    """
    logg.info("Running Diffusion maps ", reset=True)

    data_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)

    if device == "cpu":
        from palantir.utils import run_diffusion_maps

        res = run_diffusion_maps(
            data_df, n_components=n_components, knn=knn, alpha=alpha)
    # code converted to GPU, not reproducible!
    elif device == "gpu":
        logg.warn(
            "GPU implementation uses eigsh from cupy.sparse, which is not "
            "currently reproducible and can give unstable results!"
        )
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
        from cupyx.scipy.sparse.linalg import eigsh

        # Determine the kernel
        N = data_df.shape[0]
        if not issparse(data_df):
            from cuml.neighbors import NearestNeighbors

            nn = NearestNeighbors(n_neighbors=knn, metric="euclidean")
            X_contiguous = np.ascontiguousarray(data_df.values)
            nn.fit(X_contiguous)
            kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
            kNN.setdiag(0)
            kNN.eliminate_zeros()

            # Adaptive k: scale each cell by the distance to its
            # (knn / 3)-th neighbor
            adaptive_k = int(np.floor(knn / 3))
            adaptive_std = np.zeros(N)
            for i in np.arange(len(adaptive_std)):
                adaptive_std[i] = np.sort(
                    kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

            # Kernel
            x, y, dists = find(kNN)

            # X, y specific stds
            dists = dists / adaptive_std[x]
            W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])

            # Diffusion components
            kernel = W + W.T
        else:
            kernel = data_df

        # Markov normalization
        D = np.ravel(kernel.sum(axis=1))

        if alpha > 0:
            # L_alpha
            D[D != 0] = D[D != 0] ** (-alpha)
            mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
            kernel = mat.dot(kernel).dot(mat)
            D = np.ravel(kernel.sum(axis=1))

        D[D != 0] = 1 / D[D != 0]
        kernel = csr_matrix_gpu(kernel)
        D = csr_matrix_gpu(
            (cp.array(D), (cp.arange(N), cp.arange(N))), shape=(N, N))
        T = D.dot(kernel)

        # Eigenvalue decomposition
        D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000)
        D, V = D.get(), V.get()

        inds = np.argsort(D)[::-1]
        D = D[inds]
        V = V[:, inds]

        # Normalize
        for i in range(V.shape[1]):
            V[:, i] = V[:, i] / np.linalg.norm(V[:, i])

        # Create the results dictionary
        res = {"T": T.get(), "EigenVectors": V, "EigenValues": D}
        res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
        if not issparse(data_df):
            res["EigenVectors"].index = data_df.index
        res["EigenValues"] = pd.Series(res["EigenValues"])
        res["kernel"] = kernel.get()

    if multiscale:
        logg.info("    determining multiscale diffusion space")
        from palantir.utils import determine_multiscale_space

        adata.obsm["X_diffusion_multiscale"] = determine_multiscale_space(
            res, n_eigs=n_eigs).values
        logstr = "    .obsm['X_diffusion_multiscale'], multiscale diffusion space.\n"
    else:
        adata.obsm["X_diffusion"] = res["EigenVectors"].iloc[:, 1:].values
        logstr = "    .obsm['X_diffusion'], diffusion space.\n"

    adata.uns["diffusion"] = res

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n" + logstr
        + "    .uns['diffusion'] dict containing diffusion maps results."
    )
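# Minimal usage sketch for `diffusion` (hypothetical demo): any AnnData with
# PCA coordinates in `.obsm['X_pca']` works; the processed PBMC dataset
# shipped with Scanpy is used here for convenience.
def _demo_diffusion():
    import scanpy as sc

    adata = sc.datasets.pbmc3k_processed()
    diffusion(adata, n_components=10, knn=30, multiscale=True, device="cpu")
    print(adata.obsm["X_diffusion_multiscale"].shape)
    print(adata.uns["diffusion"].keys())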
def augmented_affinity_matrix(
    data_df,
    timepoints,
    timepoint_connections,
    n_neighbors=30,
    n_jobs=-2,
    pc_components=1000,
    device="cpu",
):
    """Computes the kNN affinity matrix augmented with mutually nearest
    neighbors across timepoints

    :param data_df: Normalized data frame. Data frame should be sorted
        according to the timepoints
    :param timepoints: Pandas series indicating timepoints for each cell
        in data_df
    :param timepoint_connections: Links between timepoints
    :param n_neighbors: Number of nearest neighbors for graph construction
    :param n_jobs: Nearest neighbors will be computed in parallel using n_jobs
    :param pc_components: Minimum number of principal components to use.
        Specify `None` to use pre-computed components
    :return: Affinity matrix augmented with mutually nearest neighbors
    """
    # Timepoints and data_df should be in the same order
    timepoints = timepoints[data_df.index]
    cell_order = data_df.index

    # Time point cells and indices
    tp_cells = pd.Series()
    tp_offset = pd.Series()
    offset = 0
    for i in timepoints.unique():
        tp_offset[i] = offset
        tp_cells[i] = list(timepoints.index[timepoints == i])
        offset += len(tp_cells[i])

    # Run PCA to denoise the dropouts
    if pc_components is None:
        pca_projections = data_df
    else:
        pca_projections, _ = utils.run_pca(
            data_df, device, n_components=pc_components)

    # Nearest neighbor graph construction and affinity matrix
    print('Nearest neighbor computation...')

    # --------------------------------------------------------------------------
    # nbrs = NearestNeighbors(n_neighbors=n_neighbors,
    #                         metric='euclidean', n_jobs=-2)
    # nbrs.fit(pca_projections.values)
    # dists, _ = nbrs.kneighbors(pca_projections.values)
    # adj = nbrs.kneighbors_graph(pca_projections.values, mode='distance')
    # # Scaling factors for affinity matrix construction
    # ka = np.int(n_neighbors / 3)
    # scaling_factors = pd.Series(dists[:, ka], index=cell_order)
    # # Affinity matrix
    # nn_aff = _convert_to_affinity(adj, scaling_factors, True)
    # --------------------------------------------------------------------------

    if device == "cpu":
        temp = sc.AnnData(data_df.values)
        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors)
        # maintain backwards compatibility with Scanpy `sc.pp.neighbors`
        try:
            kNN = temp.uns['neighbors']['distances']
        except KeyError:
            kNN = temp.obsp['distances']
    elif device == "gpu":
        from cuml.neighbors import NearestNeighbors

        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        X_contiguous = np.ascontiguousarray(data_df.values)
        nn.fit(X_contiguous)
        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
        kNN.setdiag(0)
        kNN.eliminate_zeros()

    # Adaptive k
    adaptive_k = int(np.floor(n_neighbors / 3))
    scaling_factors = np.zeros(data_df.shape[0])
    for i in np.arange(len(scaling_factors)):
        scaling_factors[i] = np.sort(
            kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]
    scaling_factors = pd.Series(scaling_factors, index=cell_order)

    # Affinity matrix
    nn_aff = _convert_to_affinity(kNN, scaling_factors, device, True)

    # Mutually nearest neighbor affinity matrix
    # Initialize MNN affinity matrix
    N = len(cell_order)
    full_mnn_aff = csr_matrix(([0], ([0], [0])), [N, N])
    for i in timepoint_connections.index:
        t1, t2 = timepoint_connections.loc[i, :].values
        print(f'Constructing affinities between {t1} and {t2}...')

        # MNN matrix and distance to the ka-th neighbor
        t1_cells = tp_cells[t1]
        t2_cells = tp_cells[t2]
        mnn = _construct_mnn(t1_cells, t2_cells, pca_projections,
                             n_neighbors, device, n_jobs)

        # MNN scaling factors
        # Distance to the adaptive neighbor
        ka_dists = pd.Series(0.0, index=t1_cells + t2_cells)
        # T1 scaling factors
        ka_dists[t1_cells] = _mnn_ka_distances(mnn, n_neighbors)
        # T2 scaling factors
        ka_dists[t2_cells] = _mnn_ka_distances(mnn.T, n_neighbors)

        # Scaling factors
        mnn_scaling_factors = pd.Series(0.0, index=cell_order)
        mnn_scaling_factors[t1_cells] = _mnn_scaling_factors(
            ka_dists[t1_cells], scaling_factors, device)
        mnn_scaling_factors[t2_cells] = _mnn_scaling_factors(
            ka_dists[t2_cells], scaling_factors, device)

        # MNN affinity matrix
        full_mnn_aff = full_mnn_aff + \
            _mnn_affinity(mnn, mnn_scaling_factors,
                          tp_offset[t1], tp_offset[t2], device)

    # Symmetrize the affinity matrices and return
    aff = nn_aff + nn_aff.T + full_mnn_aff + full_mnn_aff.T
    return aff, nn_aff + nn_aff.T
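# Hedged usage sketch for `augmented_affinity_matrix` (hypothetical demo):
# cells must be sorted by timepoint, `timepoint_connections` holds one
# (t1, t2) pair per row, and `pc_components=None` skips the PCA step so the
# toy data can be used directly. Assumes the module's private helpers
# (`_convert_to_affinity`, `_mnn_*`) and `sc` (Scanpy) are in scope.
def _demo_augmented_affinity_matrix():
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    cells = [f"c{i}" for i in range(400)]
    data_df = pd.DataFrame(rng.normal(size=(400, 50)), index=cells)
    timepoints = pd.Series(['day0'] * 200 + ['day3'] * 200, index=cells)
    timepoint_connections = pd.DataFrame({0: ['day0'], 1: ['day3']})
    aff, nn_adj = augmented_affinity_matrix(
        data_df, timepoints, timepoint_connections,
        n_neighbors=20, pc_components=None, device="cpu",
    )
    # `aff` adds the cross-timepoint MNN affinities on top of the symmetrized
    # within-timepoint kNN affinities returned as the second value.
    print(aff.shape, nn_adj.shape)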