Example #1
def _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors, device, n_jobs=-2):
    # Function to construct mutually nearest neighbors between cells
    # from two timepoints
    
    if device == "gpu":
        from cuml import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean')
    elif device == "cpu":
        from sklearn.neighbors import NearestNeighbors
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                metric='euclidean', n_jobs=n_jobs)
    
    print('t+1 neighbors of t...')
    nbrs.fit(data_df.loc[t1_cells, :].values)
    t1_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t2_cells, :].values, mode='distance')

    print('t neighbors of t+1...')
    nbrs.fit(data_df.loc[t2_cells, :].values)
    t2_nbrs = nbrs.kneighbors_graph(
        data_df.loc[t1_cells, :].values, mode='distance')

    # Mutually nearest neighbors: the elementwise product is nonzero only
    # where both directions agree; sqrt recovers the geometric mean distance
    mnn = t2_nbrs.multiply(t1_nbrs.T)
    mnn = mnn.sqrt()
    return mnn
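
A minimal CPU usage sketch for _construct_mnn (the cell names and random data are made up for illustration; scikit-learn, numpy and pandas are assumed to be installed):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
t1_cells = [f"t1_cell{i}" for i in range(50)]   # hypothetical cells at time t
t2_cells = [f"t2_cell{i}" for i in range(60)]   # hypothetical cells at time t+1
data_df = pd.DataFrame(rng.normal(size=(110, 20)), index=t1_cells + t2_cells)

mnn = _construct_mnn(t1_cells, t2_cells, data_df, n_neighbors=10, device="cpu")
print(mnn.shape, mnn.nnz)  # (50, 60); nonzeros mark mutually nearest pairs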
Example #2
    def reduce_dimensionality(self, embeddings):
        """Reduce dimensionality of embeddings using UMAP and train a UMAP model

        Args:
            embeddings (cupy.ndarray): Embeddings extracted by the
                sentence-transformer module.

        Returns:
            umap_embeddings: The reduced embeddings
        """
        m_cos = NearestNeighbors(n_neighbors=15, metric="cosine")
        m_cos.fit(embeddings)
        knn_graph_cos = m_cos.kneighbors_graph(embeddings, mode="distance")
        u1 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
        umap_embeddings = u1.fit_transform(embeddings, knn_graph=knn_graph_cos)

        return umap_embeddings
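
The same cosine-kNN-into-UMAP pattern outside the class, sketched under the assumption of a RAPIDS/cuML environment (the synthetic embeddings array stands in for real sentence-transformer output):

import cupy as cp
from cuml.manifold import UMAP
from cuml.neighbors import NearestNeighbors

embeddings = cp.random.random((1000, 384), dtype=cp.float32)  # stand-in embeddings
m_cos = NearestNeighbors(n_neighbors=15, metric="cosine")
m_cos.fit(embeddings)
knn_graph_cos = m_cos.kneighbors_graph(embeddings, mode="distance")
u1 = UMAP(n_neighbors=15, n_components=5, min_dist=0.0)
umap_embeddings = u1.fit_transform(embeddings, knn_graph=knn_graph_cos)
print(umap_embeddings.shape)  # (1000, 5)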
Example #3
def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """
  Parameters
  ----------
  X : `ndarray` or tuple of (X, y)
  n_neighbors: int (default = 5)
    The top K closest datapoints you want the algorithm to return.
    Currently, this value must be < 1024.
  graph_mode : {'distance', 'connectivity'}, default='distance'
    This mode decides which values `kneighbors_graph` will return:
      - 'connectivity' : will return the connectivity matrix with ones and
        zeros (for 'SpectralClustering').
      - 'distance' : will return the distances between neighbors according
        to the given metric (for 'DBSCAN').
  cluster_mode: {'vote', 'spectral', 'isomap'}, default='vote'
      This mode decides how to generate cluster prediction from the
      neighbors graph:
      - 'dbscan' :
      - 'spectral' :
      - 'isomap' :
      - 'kmeans' :
  algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
      Algorithm used to compute the nearest neighbors:
      - 'ball_tree' will use :class:`BallTree`
      - 'kd_tree' will use :class:`KDTree`
      - 'brute' will use a brute-force search.
      - 'auto' will attempt to decide the most appropriate algorithm
        based on the values passed to :meth:`fit` method.
      Note: fitting on sparse input will override the setting of
      this parameter, using brute force.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    framework = kwargs.pop('framework')
    random_state = kwargs.pop('random_state')
    n_clusters = int(kwargs.pop('n_clusters'))
    if n_neighbors is None:
        kwargs['n_neighbors'] = n_clusters
        n_neighbors = n_clusters
    ## graph mode
    graph_mode = str(kwargs.pop('graph_mode')).strip().lower()
    assert graph_mode in ('distance', 'connectivity')
    ## cluster mode
    cluster_mode = str(kwargs.pop('cluster_mode')).strip().lower()
    ## fine-tuning the kwargs
    use_cuml = _check_cuml(framework)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors as KNN
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)
    ## Transform mode
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        knn.n_samples_fit_ = X.shape[0]
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
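
A usage sketch for fast_knn on synthetic data; note the snippet relies on module-level helpers (_check_cuml, nn_predict, nn_transform, nn_fit_transform, nn_kneighbors_graph) and imports (types, typing, scikit-learn's NearestNeighbors) that are assumed to be defined elsewhere in the module:

import numpy as np

X = np.random.RandomState(1).normal(size=(200, 8)).astype(np.float32)
knn = fast_knn(X, n_clusters=4, cluster_mode='spectral', framework='sklearn')
labels = knn.predict(X)          # cluster labels via the attached nn_predict
graph = knn.kneighbors_graph(X)  # sparse graph in the configured graph_mode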
Example #4
def diffusion(
    adata: AnnData,
    n_components=10,
    knn=30,
    alpha=0,
    multiscale: bool = True,
    n_eigs: int = None,
    device="cpu",
    n_pcs=50,
    copy=False,
):
    """\
    Wrapper to generate diffusion maps using Palantir.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Number of diffusion components.
    knn
        Number of nearest neighbors for graph construction.
    alpha
        Normalization parameter for the diffusion operator.
    multiscale
        Whether to get multiscale diffusion space
        (calls palantir.utils.determine_multiscale_space).
    n_eigs
        if multiscale is True, how many components to retain.
    device
        Run method on either `cpu` or on `gpu`.
    n_pcs
        Number of PC components.
    copy
        Return a copy instead of writing to adata.
    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns the AnnData, else it updates the fields of `adata`:

        `.obsm['X_diffusion']`
            if `multiscale = False`, diffusion space.
        `.obsm['X_diffusion_multiscale']`
            if `multiscale = True`, multiscale diffusion space.
        `.uns['diffusion']`
            dict containing results from Palantir.
    """

    logg.info("Running Diffusion maps ", reset=True)

    data_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)

    if device == "cpu":
        from palantir.utils import run_diffusion_maps

        res = run_diffusion_maps(data_df,
                                 n_components=n_components,
                                 knn=knn,
                                 alpha=alpha)
    # GPU implementation; note that results are not reproducible (see warning below)
    elif device == "gpu":
        logg.warn(
            "GPU implementation uses eigsh from cupy.sparse, which is not currently reproducible and can give unstable results!"
        )
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
        from cupyx.scipy.sparse.linalg import eigsh

        # Determine the kernel
        N = data_df.shape[0]
        if not issparse(data_df):
            from cuml.neighbors import NearestNeighbors

            nn = NearestNeighbors(n_neighbors=knn, metric="euclidean")
            X_contiguous = np.ascontiguousarray(data_df.values)
            nn.fit(X_contiguous)

            kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
            kNN.setdiag(0)
            kNN.eliminate_zeros()

            # Adaptive k: the distance to the (knn/3)-th neighbor sets a
            # per-cell kernel bandwidth
            adaptive_k = int(np.floor(knn / 3))
            adaptive_std = np.zeros(N)

            for i in np.arange(len(adaptive_std)):
                adaptive_std[i] = np.sort(
                    kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

            # Kernel
            x, y, dists = find(kNN)

            # X, y specific stds
            dists = dists / adaptive_std[x]
            W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])

            # Diffusion components
            kernel = W + W.T
        else:
            kernel = data_df

        # Markov
        D = np.ravel(kernel.sum(axis=1))

        if alpha > 0:
            # L_alpha
            D[D != 0] = D[D != 0]**(-alpha)
            mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
            kernel = mat.dot(kernel).dot(mat)
            D = np.ravel(kernel.sum(axis=1))

        D[D != 0] = 1 / D[D != 0]
        kernel = csr_matrix_gpu(kernel)
        D = csr_matrix_gpu((cp.array(D), (cp.arange(N), cp.arange(N))),
                           shape=(N, N))
        T = D.dot(kernel)
        # Eigenvalue decomposition
        D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000)
        D, V = D.get(), V.get()

        inds = np.argsort(D)[::-1]
        D = D[inds]
        V = V[:, inds]

        # Normalize
        for i in range(V.shape[1]):
            V[:, i] = V[:, i] / np.linalg.norm(V[:, i])

        # Create the results dictionary
        res = {"T": T.get(), "EigenVectors": V, "EigenValues": D}
        res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
        if not issparse(data_df):
            res["EigenVectors"].index = data_df.index
        res["EigenValues"] = pd.Series(res["EigenValues"])
        res["kernel"] = kernel.get()

    if multiscale:
        logg.info("    determining multiscale diffusion space")
        from palantir.utils import determine_multiscale_space

        adata.obsm["X_diffusion_multiscale"] = determine_multiscale_space(
            res, n_eigs=n_eigs).values
        logstr = "    .obsm['X_diffusion_multiscale'], multiscale diffusion space.\n"
    else:
        adata.obsm["X_diffusion"] = res["EigenVectors"].iloc[:, 1:].values
        logstr = "    .obsm['X_diffusion'], diffusion space.\n"

    adata.uns["diffusion"] = res

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added \n" + logstr +
              "    .uns['diffusion'] dict containing diffusion maps results.")
Example #5
def augmented_affinity_matrix(
    data_df,
    timepoints,
    timepoint_connections,
    n_neighbors=30,
    n_jobs=-2,
    pc_components=1000,
    device="cpu",
):
    """Function for max min sampling of waypoints

    :param data_df: Normalized data frame. Data frame should be sorted according to the timepoints
    :param timepoints: Pandas series indicating timepoints for each cell in data_df
    :param timepoint_connections: Links between timepoints
    :param n_neighbors: Number of nearest neighbors for graph construction
    :param n_jobs: Nearest Neighbors will be computed in parallel using n_jobs.
    :param pc_components: Minimum number of principal components to use. Specify `None` to use pre-computed components
    :return: Tuple of (augmented affinity matrix, affinity matrix without the MNN augmentation)
    """

    # Timepoints and data_df should be in the same order
    timepoints = timepoints[data_df.index]
    cell_order = data_df.index

    # Time point cells and indices
    tp_cells = pd.Series(dtype=object)
    tp_offset = pd.Series(dtype=int)
    offset = 0
    for i in timepoints.unique():
        tp_offset[i] = offset
        tp_cells[i] = list(timepoints.index[timepoints == i])
        offset += len(tp_cells[i])

    # Run PCA to denoise the dropouts
    if pc_components is None:
        pca_projections = data_df
    else:
        pca_projections, _ = utils.run_pca(data_df, device, n_components=pc_components)

    # Nearest neighbor graph construction and affinity matrix
    print('Nearest neighbor computation...')

    if device == "cpu":
        temp = sc.AnnData(data_df.values)
        sc.pp.neighbors(temp, n_pcs=0, n_neighbors=n_neighbors)
        # maintaining backwards compatibility to Scanpy `sc.pp.neighbors`
        try:
            kNN = temp.uns['neighbors']['distances']
        except KeyError:
            kNN = temp.obsp['distances']
    elif device == "gpu":
        from cuml.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean")
        X_contiguous = np.ascontiguousarray(data_df.values)
        nn.fit(X_contiguous)

        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
        kNN.setdiag(0)
        kNN.eliminate_zeros()

    # Adaptive k
    adaptive_k = int(np.floor(n_neighbors / 3))
    scaling_factors = np.zeros(data_df.shape[0])

    for i in np.arange(len(scaling_factors)):
        scaling_factors[i] = np.sort(kNN.data[kNN.indptr[i]:kNN.indptr[i + 1]])[adaptive_k - 1]

    scaling_factors = pd.Series(scaling_factors, index=cell_order)

    # Affinity matrix
    nn_aff = _convert_to_affinity(kNN, scaling_factors, device, True)

    # Mutually nearest neighbor affinity matrix
    # Initialize the MNN affinity matrix
    N = len(cell_order)
    full_mnn_aff = csr_matrix(([0], ([0], [0])), shape=(N, N))
    for i in timepoint_connections.index:
        t1, t2 = timepoint_connections.loc[i, :].values
        print(f'Constructing affinities between {t1} and {t2}...')

        # MNN matrix and the distance to the adaptive (ka-th) neighbor
        t1_cells = tp_cells[t1]
        t2_cells = tp_cells[t2]
        mnn = _construct_mnn(t1_cells, t2_cells, pca_projections,
                             n_neighbors, device, n_jobs)

        # MNN Scaling factors
        # Distance to the adaptive neighbor
        ka_dists = pd.Series(0.0, index=t1_cells + t2_cells)
        # T1 scaling factors
        ka_dists[t1_cells] = _mnn_ka_distances(mnn, n_neighbors)
        # T2 scaling factors
        ka_dists[t2_cells] = _mnn_ka_distances(mnn.T, n_neighbors)

        # Scaling factors
        mnn_scaling_factors = pd.Series(0.0, index=cell_order)
        mnn_scaling_factors[t1_cells] = _mnn_scaling_factors(
            ka_dists[t1_cells], scaling_factors, device)
        mnn_scaling_factors[t2_cells] = _mnn_scaling_factors(
            ka_dists[t2_cells], scaling_factors, device)

        # MNN affinity matrix
        full_mnn_aff = full_mnn_aff + \
            _mnn_affinity(mnn, mnn_scaling_factors,
                          tp_offset[t1], tp_offset[t2], device)

    # Symmetrize the affinity matrix and return
    aff = nn_aff + nn_aff.T + full_mnn_aff + full_mnn_aff.T
    return aff, nn_aff + nn_aff.T
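
A toy invocation of augmented_affinity_matrix (all names are illustrative; the module's private helpers such as _convert_to_affinity, _mnn_ka_distances and _mnn_affinity are assumed to be defined alongside, and pc_components=None skips the PCA step):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
cells = [f"cell{i}" for i in range(100)]
data_df = pd.DataFrame(rng.normal(size=(100, 30)), index=cells)
timepoints = pd.Series(["t0"] * 50 + ["t1"] * 50, index=cells)
timepoint_connections = pd.DataFrame({"from": ["t0"], "to": ["t1"]})

aff, nn_aff = augmented_affinity_matrix(
    data_df, timepoints, timepoint_connections,
    n_neighbors=10, pc_components=None, device="cpu")
print(aff.shape)  # (100, 100) symmetric augmented affinity matrix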