Example #1
    def fit(self, X, V, k, s=None, tol=1e-4):
        self.__reset__()
        # kNN search for nearest neighbor indices
        if self.nbrs_idx is None:
            if X.shape[0] > 200000 and X.shape[1] > 2: 
                from pynndescent import NNDescent

                nbrs = NNDescent(X, metric='euclidean', n_neighbors=k + 1, n_jobs=-1,
                                  random_state=19491001)
                Idx, _ = nbrs.query(X, k=k+1)
            else:
                alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
                nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
                _, Idx = nbrs.kneighbors(X)

            self.nbrs_idx = Idx[:, 1:]
        else:
            Idx = self.nbrs_idx
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        for i in range(n):
            y = X[i]
            v = V[i]
            Y = X[Idx[i, 1:]]
            p = compute_markov_trans_prob(y, v, Y, s, cont_time=True)
            p[p <= tol] = 0  # tolerance check
            self.P[Idx[i, 1:], i] = p
            self.P[i, i] = -np.sum(p)
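
The large-versus-small data fallback between pynndescent and scikit-learn above recurs in almost every snippet on this page. A minimal sketch of that shared pattern, factored into a hypothetical helper (the 200,000-row threshold, the euclidean metric and the fixed random seed are taken from the snippets; the helper name itself is not from any of these codebases):

from sklearn.neighbors import NearestNeighbors


def k_nearest_neighbors(X, k, include_self=True):
    # hypothetical helper: indices and distances of the k nearest neighbors of every row of X
    n_neighbors = k + 1 if include_self else k
    if X.shape[0] > 200000 and X.shape[1] > 2:
        # approximate search for very large, higher-dimensional data
        from pynndescent import NNDescent

        index = NNDescent(X, metric="euclidean", n_neighbors=n_neighbors,
                          n_jobs=-1, random_state=19491001)
        idx, dist = index.query(X, k=n_neighbors)
    else:
        # exact search for smaller data; kd_tree only pays off in low dimensions
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=alg, n_jobs=-1).fit(X)
        dist, idx = nbrs.kneighbors(X)
    return idx, dist
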
Example #2
    def get_Xss_confidence(self):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=min(self.k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(self.k, X.shape[0] - 1))
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=min(self.k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        confidence = 1 - dist_m / dist_m.max()

        return confidence
Example #3
def graphize_vecfld(func, X, nbrs_idx=None, dist=None, k=30, distance_free=True, n_int_steps=20, cores=1):
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2: 
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric='euclidean', n_neighbors=k+1, n_jobs=-1, random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=k+1)
        else:
            alg = 'ball_tree' if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=k+1, algorithm=alg, n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    if dist is None and not distance_free:
        D = pdist(X)
    else:
        D = None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in tqdm(enumerate(nbrs_idx), desc='Constructing diffusion graph from reconstructed vector field'):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist, D, n)

    else:
        pool = ThreadPool(cores)
        res = pool.starmap(construct_v, zip(itertools.repeat(X), np.arange(len(nbrs_idx)), nbrs_idx, itertools.repeat(n_int_steps),
                                            itertools.repeat(func), itertools.repeat(distance_free),
                                            itertools.repeat(dist), itertools.repeat(D), itertools.repeat(n)))
        pool.close()
        pool.join()
        V = functools.reduce((lambda a, b: a + b), res)

    return V, nbrs
Example #4
def compute_tau(X, V, k=100, nbr_idx=None):
    if nbr_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k,
                n_jobs=-1,
                random_state=19491001,
            )
            _, dists = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
            dists, _ = nbrs.kneighbors(X)

    else:
        dists = np.zeros(nbr_idx.shape)
        for i in range(nbr_idx.shape[0]):
            for j in range(nbr_idx.shape[1]):
                x = X[i]
                y = X[nbr_idx[i, j]]
                dists[i, j] = np.sqrt((x - y).dot(x - y))
    d = np.mean(dists[:, 1:], 1)
    v = np.linalg.norm(V, axis=1)
    tau = d / v
    return tau, v
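
compute_tau estimates a per-cell time scale as the mean distance to the nearest neighbors divided by the local speed ||V||. A toy call on random data (shapes only; compute_tau is the function defined above):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))   # 500 cells in a 3-dimensional space
V = rng.normal(size=(500, 3))   # matching velocity vectors

tau, v = compute_tau(X, V, k=50)
# tau[i] = mean distance from cell i to its nearest neighbors / ||V[i]||
print(tau.shape, v.shape)  # (500,) (500,)
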
Example #5
    def get_Xss_confidence(self, k=50):
        X = self.X_data
        X = X.A if sp.issparse(X) else X
        Xss = self.Xss.get_X()
        Xref = np.median(X, 0)
        Xss = np.vstack((Xss, Xref))

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=min(k, X.shape[0] - 1),
                             n_jobs=-1,
                             random_state=19491001)
            _, dist = nbrs.query(Xss, k=min(k, X.shape[0] - 1))
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=min(k, X.shape[0] - 1),
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, _ = nbrs.kneighbors(Xss)

        dist_m = dist.mean(1)
        # confidence = 1 - dist_m / dist_m.max()
        sigma = 0.1 * 0.5 * (np.max(X[:, 0]) - np.min(X[:, 0]) +
                             np.max(X[:, 1]) - np.min(X[:, 1]))
        confidence = gaussian_1d(dist_m, sigma=sigma)
        confidence /= np.max(confidence)
        return confidence[:-1]
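
gaussian_1d is not defined in this snippet. Assuming it is a plain (possibly unnormalized) Gaussian of the mean neighbor distance, the confidence decays smoothly from 1 for steady states that sit on top of observed cells toward 0 for remote ones; any constant factor cancels because the result is divided by its maximum. A plausible sketch of that helper:

import numpy as np


def gaussian_1d(x, mu=0.0, sigma=1.0):
    # assumed form: one-dimensional Gaussian kernel evaluated at x
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2)
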
Example #6
def bandwidth_selector(X):
    """
    This function computes an empirical bandwidth for a Gaussian kernel.
    """
    n, m = X.shape
    if n > 200000 and m > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(
            X,
            metric="euclidean",
            n_neighbors=max(2, int(0.2 * n)),
            n_jobs=-1,
            random_state=19491001,
        )
        _, distances = nbrs.query(X, k=max(2, int(0.2 * n)))
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=max(2, int(0.2 * n)),
                                algorithm=alg,
                                n_jobs=-1).fit(X)
        distances, _ = nbrs.kneighbors(X)

    d = np.mean(distances[:, 1:]) / 1.5
    return np.sqrt(2) * d
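
The returned value is meant to serve as the bandwidth (sigma) of a Gaussian kernel; it is derived from the mean distance to roughly the nearest 20% of points. A quick usage sketch on random data:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 2))

sigma = bandwidth_selector(X)
# e.g. an RBF affinity between the first two points with that bandwidth
w = np.exp(-np.sum((X[0] - X[1]) ** 2) / (2 * sigma ** 2))
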
Example #7
    def fit(self,
            X,
            V,
            k,
            s=None,
            method="qp",
            eps=None,
            tol=1e-4):  # pass index
        # the parameter k will be replaced by a connectivity matrix in the future.
        self.__reset__()
        # kNN search for nearest neighbor indices
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric="euclidean",
                             n_neighbors=k,
                             n_jobs=-1,
                             random_state=19491001)
            Idx, _ = nbrs.query(X, k=k)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg,
                                    n_jobs=-1).fit(X)
            _, Idx = nbrs.kneighbors(X)
        # compute transition prob.
        n = X.shape[0]
        self.P = np.zeros((n, n))
        if method == "kernel":
            inv_s = np.linalg.inv(s)
            # compute density kernel
            if eps is not None:
                self.Kd = np.zeros((n, n))
                inv_eps = 1 / eps
                for i in range(n):
                    self.Kd[i, Idx[i]] = compute_density_kernel(
                        X[i], X[Idx[i]], inv_eps)
                D = np.sum(self.Kd, 0)
        for i in range(n):
            y = X[i]
            v = V[i]
            if method == "qp":
                Y = X[Idx[i, 1:]]
                p = compute_markov_trans_prob(y, v, Y, s)
                p[p <= tol] = 0  # tolerance check
                self.P[Idx[i, 1:], i] = p
                self.P[i, i] = 1 - np.sum(p)
            else:
                Y = X[Idx[i]]
                # p = compute_kernel_trans_prob(y, v, Y, inv_s)
                ker = compute_drift_kernel(y, v, Y, inv_s)
                if eps is not None:
                    ker /= D[Idx[i]]
                p = ker / np.sum(ker)
                p[p <= tol] = 0  # tolerance check
                p = p / np.sum(p)
                self.P[Idx[i], i] = p
Example #8
def prepare_velocity_grid_data(
    X_emb,
    xy_grid_nums,
    density=None,
    smooth=None,
    n_neighbors=None,
):

    n_obs, n_dim = X_emb.shape
    density = 1 if density is None else density
    smooth = 0.5 if smooth is None else smooth

    grs, scale = [], 0
    for dim_i in range(n_dim):
        m, M = np.min(X_emb[:, dim_i]), np.max(X_emb[:, dim_i])
        m = m - 0.01 * np.abs(M - m)
        M = M + 0.01 * np.abs(M - m)
        gr = np.linspace(m, M, xy_grid_nums[dim_i] * density)
        scale += gr[1] - gr[0]
        grs.append(gr)

    scale = scale / n_dim * smooth

    meshes_tuple = np.meshgrid(*grs)
    X_grid = np.vstack([i.flat for i in meshes_tuple]).T

    # estimate grid velocities
    if n_neighbors is None:
        n_neighbors = np.max([10, int(n_obs / 50)])

    if X_emb.shape[0] > 200000 and X_emb.shape[1] > 2:
        from pynndescent import NNDescent

        nn = NNDescent(X_emb,
                       metric='euclidean',
                       n_neighbors=n_neighbors,
                       n_jobs=-1,
                       random_state=19491001)
        neighs, dists = nn.query(X_grid, k=n_neighbors)
    else:
        alg = "ball_tree" if X_emb.shape[1] > 10 else 'kd_tree'
        nn = NearestNeighbors(n_neighbors=n_neighbors,
                              n_jobs=-1,
                              algorithm=alg)
        nn.fit(X_emb)
        dists, neighs = nn.kneighbors(X_grid)

    weight = norm.pdf(x=dists, scale=scale)
    p_mass = weight.sum(1)

    return X_grid, p_mass, neighs, weight
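
The returned neighbor indices and Gaussian weights are typically used in a follow-up step that averages per-cell velocities onto the grid. A minimal sketch of that step (V_emb is the per-cell velocity in the same embedding; the weighted-average formula is an assumption about how the outputs are meant to be combined, not something shown in this snippet):

import numpy as np

rng = np.random.default_rng(0)
X_emb = rng.normal(size=(2000, 2))   # 2-D embedding of 2000 cells
V_emb = rng.normal(size=(2000, 2))   # velocities projected into the same embedding

X_grid, p_mass, neighs, weight = prepare_velocity_grid_data(X_emb, xy_grid_nums=[30, 30])

# Gaussian-weighted average of neighboring cell velocities at every grid point
V_grid = (V_emb[neighs] * weight[:, :, None]).sum(1) / np.maximum(p_mass, 1e-12)[:, None]
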
Example #9
def trn(X, n, return_index=True, seed=19491001, **kwargs):
    trnet = TRNET(n, X, seed)
    trnet.run(**kwargs)
    if not return_index:
        return trnet.W
    else:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X, metric="euclidean", n_neighbors=1, n_jobs=-1, random_state=seed)
            idx, _ = nbrs.query(trnet.W, k=1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=1, algorithm=alg, n_jobs=-1).fit(X)
            _, idx = nbrs.kneighbors(trnet.W)

        return idx[:, 0]
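
trn downsamples a dataset to n representative points via a topology-representing network (the TRNET class is not shown here) and, with return_index=True, maps each representative back to the index of its closest original point. A usage sketch:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 2))

idx = trn(X, n=200)       # indices of 200 representative cells
X_downsampled = X[idx]    # the downsampled dataset itself
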
Example #10
def cluster_field(adata,
                  basis="pca",
                  embedding_basis=None,
                  normalize=True,
                  method="leiden",
                  cores=1,
                  copy=False,
                  **kwargs):
    """Cluster cells based on vector field features.

    We would like to see whether the vector field can be used to better define cell states/types. This can be assessed
    by characterizing critical points (attractors/saddles/repellers, etc.) and characteristic curves (nullclines,
    separatrices). However, calculating those is not easy; for example, a strict definition of an attractor is a state
    where the velocity is 0 and all eigenvalues of the Jacobian matrix at that point are negative. Under this strict
    definition, the attractors we find may sometimes be far away from our sampled cell states, which makes them less
    meaningful, although this can largely be avoided by removing the density correction during the velocity projection.
    This is not unexpected, as the learned vector field is defined via a set of basis functions based on Gaussian
    kernels, and it is therefore hard to satisfy that strict definition.

    Fortunately, we can handle this better with a different set of ideas. Instead of relying on the critical points of
    classical dynamical-systems methods, we can use machine learning approaches that extract geometric features of
    streamlines to "cluster the vector field space" and thereby define cell states/types. This requires calculating
    potential (ordered pseudotime), speed, curliness, divergence, acceleration, curvature, etc. Because the Jacobian
    matrix can be computed analytically, these quantities of the vector field function can be obtained conveniently and
    efficiently.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`.
        adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
        the object should include both unspliced and spliced gene expression of cells.
    basis: `str` or None (default: `pca`)
        The space that will be used for calculating vector field features. Valid names include, for example, `pca`,
        `umap`, etc.
    embedding_basis: `str` or None (default: `None`)
        The embedding basis that will be combined with the vector field feature space for clustering.
    normalize: `bool` (default: `True`)
        Whether to mean center and scale each feature across all cells so that each feature has zero mean and unit
        variance.
    method: `str` (default: `leiden`)
        The method that will be used for clustering, one of `{'kmeans', 'hdbscan', 'louvain', 'leiden'}`. If `louvain`
        or `leiden` is used, you need to have `cdlib` installed.
    cores: `int` (default: 1)
        The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.
    copy: `bool` (default: `False`)
        Whether to return a new deep copy of `adata` instead of updating `adata` object passed in arguments.
    kwargs:
        Any additional arguments that will be passed to either kmeans, hdbscan, louvain or leiden clustering algorithms.

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            An updated AnnData object with the cluster labels stored in `.obs` (e.g. `field_leiden`) and the feature
            kNN graph in `.obsp['vf_feature_knn']`, returned only when `copy` is True; otherwise the input object is
            updated in place and None is returned.
    """

    logger = LoggerManager.gen_logger("dynamo-cluster_field")
    logger.log_time()
    adata = copy_adata(adata) if copy else adata

    if method in ["louvain", "leiden"]:
        try:
            from cdlib import algorithms

            "leiden" in dir(algorithms)

        except ImportError:
            raise ImportError(
                "You need to install the excellent package `cdlib` if you want to use louvain or leiden "
                "for clustering.")

    feature_key = [
        "speed_" + basis,
        basis + "_ddhodge_potential",
        "divergence_" + basis,
        "acceleration_" + basis,
        "curvature_" + basis,
    ]

    if feature_key[0] not in adata.obs.keys():
        from ..vectorfield import speed

        speed(adata, basis=basis)
    if feature_key[1] not in adata.obs.keys():
        from ..ext import ddhodge

        ddhodge(adata, basis=basis)
    if feature_key[2] not in adata.obs.keys():
        from ..vectorfield import divergence

        divergence(adata, basis=basis)
    if feature_key[3] not in adata.obs.keys():
        from ..vectorfield import acceleration

        acceleration(adata, basis=basis)
    if feature_key[4] not in adata.obs.keys():
        from ..vectorfield import curvature

        curvature(adata, basis=basis)

    feature_data = adata.obs.loc[:, feature_key].values
    if embedding_basis is None:
        embedding_basis = basis
    X = np.hstack((feature_data, adata.obsm["X_" + embedding_basis]))

    if normalize:
        # X = (X - X.min(0)) / X.ptp(0)
        X = (X - X.mean(0)) / X.std(0)

    if method in ["hdbscan", "kmeans"]:
        if method == "hdbscan":
            key = "field_hdbscan"
            hdbscan(adata, X_data=X, result_key=key, **kwargs)
        elif method == "kmeans":
            from sklearn.cluster import KMeans

            key = "field_kmeans"

            kmeans = KMeans(random_state=0, **kwargs).fit(X)
            adata.obs[key] = kmeans.labels_.astype("str")

        # clusters need to be categorical variables
        adata.obs[key] = adata.obs[key].astype("category")

    elif method in ["louvain", "leiden"]:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=31,
                n_jobs=cores,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=31)
        else:
            nbrs = NearestNeighbors(n_neighbors=31, n_jobs=cores).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        row = np.repeat(nbrs_idx[:, 0], 30)
        col = nbrs_idx[:, 1:].flatten()
        graph = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )
        adata.obsp["vf_feature_knn"] = graph

        if method == "leiden":
            leiden(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_leiden",
            )
        elif method == "louvain":
            louvain(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_louvain",
            )
        elif method == "infomap":
            infomap(
                adata,
                adj_matrix_key="vf_feature_knn",
                result_key="field_infomap",
            )

    logger.finish_progress(progress_name="clustering_field")

    if copy:
        return adata
    return None
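
A hypothetical call, assuming `adata` is an AnnData object on which a vector field has already been reconstructed in the pca basis (the feature keys such as speed_pca are computed on the fly when missing):

cluster_field(adata, basis="pca", embedding_basis="umap", method="leiden")

# cluster labels are stored in adata.obs["field_leiden"], and the kNN graph built
# on the vector-field features in adata.obsp["vf_feature_knn"]
print(adata.obs["field_leiden"].value_counts())
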
Example #11
def score_cells(
    adata,
    genes=None,
    layer=None,
    basis=None,
    n_neighbors=30,
    beta=0.1,
    iteration=5,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    return_score=True,
    **kwargs,
):
    """Score cells based on a set of genes.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            AnnData object that contains the reconstructed vector field function in the `uns` attribute.
        genes: `list` or None (default: None)
            The gene names whose expression will be used for scoring cells. By default (when genes is set to None),
            the genes used for pca (var.use_for_pca) will be used. Note that the genes provided need to match the
            capitalization convention of the gene names in the adata object.
        layer: `str` or None (default: `None`)
            Which layer of the data will be used to compute the per-cell expression score. When None or 'X', `.X`
            is used.
        basis: `str` or None (default: `None`)
            The embedding used to build the nearest neighbor graph for smoothing the score. When None, `X_pca` is used.
        n_neighbors: `int` (default: `30`)
            Number of nearest neighbors.
        beta: `float` (default: `0.1`)
            The weight applied to the current query cell.
        iteration: `int` (default: `5`)
            Number of smoothing iterations.
        metric: `str` or callable, default='euclidean'
            The distance metric to use for the tree. The default metric is 'euclidean'. See the documentation of
            :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a
            distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only
            "nonzero" elements may be considered neighbors.
        metric_kwds : dict, default=None
            Additional keyword arguments for the metric function.
        cores: `int` (default: 1)
            The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors.
        seed: `int` (default `19491001`)
            Random seed to ensure the reproducibility of each run.
        return_score: `bool` (default: `True`)
            Whether to return the score. If False, save the smoothed score to the `cell_score` column in the `.obs`
            attribute and also to the dictionary corresponding to the `score_cells` key in the `.uns` attribute.
        kwargs:
            Additional arguments that will be passed to each nearest neighbor search algorithm.

    Returns
    -------
        Depending on `return_score`, either returns the smoothed cell scores or updates the adata object in place
        with the score information and returns None.
    """

    if basis is None and "X_pca" not in adata.obsm.keys():
        raise ValueError(f"Your adata doesn't have 'X_pca' basis in .obsm.")
    elif basis is not None and "X_" + basis not in adata.obsm.keys():
        raise ValueError(
            f"Your adata doesn't have the {basis} you inputted in .obsm attribute of your adata."
        )

    if genes is None and "use_for_pca" not in adata.var.keys():
        raise ValueError(
            "Your adata doesn't have the 'use_for_pca' column in .var.")

    if genes is None:
        genes = adata.var_names[adata.var.use_for_pca]
    else:
        # match the capitalization convention of the gene names stored in adata
        if adata.var_names[0].isupper():
            genes = list(adata.var_names.intersection(genes))
        elif adata.var_names[0][0].isupper() and adata.var_names[0][1:].islower():
            genes = list(adata.var_names.intersection([i.capitalize() for i in genes]))
        else:
            genes = list(adata.var_names.intersection([i.lower() for i in genes]))

    if len(genes) < 1:
        raise ValueError(
            f"Your inputted gene list doesn't overlap any gene in your adata object."
        )

    X_basis = adata.obsm["X_pca"] if basis is None else adata.obsm["X_" + basis]

    if X_basis.shape[0] > 5000 and X_basis.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X_basis,
                         metric=metric,
                         metric_kwds=metric_kwds,
                         n_neighbors=n_neighbors,
                         n_jobs=cores,
                         random_state=seed,
                         **kwargs)
        knn, distances = nbrs.query(X_basis, k=n_neighbors)
    else:
        alg = "ball_tree" if X_basis.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=n_neighbors,
                                algorithm=alg,
                                n_jobs=cores).fit(X_basis)
        distances, knn = nbrs.kneighbors(X_basis)

    X_data = adata[:, genes].X if layer in [None, "X"] else adata[:, genes].layers[layer]

    prev_score = X_data.mean(1).A1 if issparse(X_data) else X_data.mean(1)
    cur_score = np.zeros(prev_score.shape)

    for _ in range(iteration):
        for i in range(len(prev_score)):
            xn = prev_score[knn[i]]
            cur_score[i] = (beta * xn[0]) + ((1 - beta) * xn[1:].mean(axis=0))
        prev_score = cur_score.copy()  # copy so the next iteration reads this iteration's finished scores

    smoothed_score = cur_score

    if return_score:
        return smoothed_score
    else:
        adata.uns["score_cells"] = {
            "smoothed_score": smoothed_score,
            "genes": genes,
            "layer": layer,
            "basis": basis
        }
        adata.obs["cell_score"] = smoothed_score
Example #12
    def fit(
        self,
        X,
        V,
        M_diff,
        neighbor_idx=None,
        n_recurse_neighbors=None,
        k=30,
        epsilon=None,
        adaptive_local_kernel=False,
        tol=1e-4,
        sparse_construct=True,
        sample_fraction=None,
    ):
        # compute connectivity
        if neighbor_idx is None:
            if X.shape[0] > 200000 and X.shape[1] > 2:
                from pynndescent import NNDescent

                nbrs = NNDescent(
                    X,
                    metric="euclidean",
                    n_neighbors=k,
                    n_jobs=-1,
                    random_state=19491001,
                )
                neighbor_idx, _ = nbrs.query(X, k=k)
            else:
                alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
                nbrs = NearestNeighbors(n_neighbors=k, algorithm=alg, n_jobs=-1).fit(X)
                _, neighbor_idx = nbrs.kneighbors(X)

        if n_recurse_neighbors is not None:
            self.Idx = append_iterative_neighbor_indices(neighbor_idx, n_recurse_neighbors)
        else:
            self.Idx = neighbor_idx

        # apply kNN downsampling to accelerate calculation (adapted from velocyto)
        if sample_fraction is not None:
            neighbor_idx = self.Idx
            # candidate indices below run from column 1 to shape[1] - 2, so p needs shape[1] - 2 entries
            p = np.linspace(0.5, 1, neighbor_idx.shape[1] - 2)
            p = p / p.sum()

            sampling_ixs = np.stack(
                (
                    np.random.choice(
                        np.arange(1, neighbor_idx.shape[1] - 1),
                        size=int(sample_fraction * (neighbor_idx.shape[1] + 1)),
                        replace=False,
                        p=p,
                    )
                    for i in range(neighbor_idx.shape[0])
                ),
                0,
            )
            self.Idx = self.Idx[np.arange(neighbor_idx.shape[0])[:, None], sampling_ixs]

        n = X.shape[0]
        if sparse_construct:
            self.P = sp.lil_matrix((n, n))
        else:
            self.P = np.zeros((n, n))

        # compute density kernel
        if epsilon is not None:
            if sparse_construct:
                self.Kd = sp.lil_matrix((n, n))
            else:
                self.Kd = np.zeros((n, n))
            inv_eps = 1 / epsilon
            for i in range(n):
                self.Kd[i, self.Idx[i]] = compute_density_kernel(X[i], X[self.Idx[i]], inv_eps)
            self.Kd = sp.csc_matrix(self.Kd)
            D = np.sum(self.Kd, 0)

        # compute transition prob.
        if np.isscalar(M_diff):
            inv_s = 1 / M_diff
        else:
            inv_s = np.linalg.inv(M_diff)
        for i in tqdm(range(n), desc="compute transiton matrix"):
            y = X[i]
            v = V[i]
            Y = X[self.Idx[i]]
            if adaptive_local_kernel:
                ker = compute_drift_local_kernel(y, v, Y, inv_s)
            else:
                ker = compute_drift_kernel(y, v, Y, inv_s)
            if epsilon is not None:
                ker = ker / D[0, self.Idx[i]]
            else:
                ker = np.matrix(ker)
            p = ker / np.sum(ker) if np.sum(ker) > 0 else np.ones_like(ker) / n
            p[p <= tol] = 0  # tolerance check
            p = p / np.sum(p)
            self.P[self.Idx[i], i] = p.A[0]

        self.P = sp.csc_matrix(self.P)
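
The method belongs to a transition-matrix class whose name is not shown in this snippet; given positions X and velocities V it fills self.P with a column-stochastic sparse matrix over the kNN graph. A toy call under that assumption:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
V = rng.normal(size=(300, 2))

# `model` stands for an instance of the (unnamed) class this fit method belongs to
model.fit(X, V, M_diff=0.5 * np.eye(2), k=30, adaptive_local_kernel=True)
P = model.P  # sparse (300, 300) matrix; column i holds the transition probabilities out of cell i
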
Example #13
def graphize_velocity(V, X, nbrs_idx=None, k=30, normalize_v=False, E_func=None):
    """
        The function generates a graph based on the velocity data. The flow from the i-th to the j-th
        node is returned as the edge matrix E[i, j], with E[i, j] = -E[j, i].

    Arguments
    ---------
        V: :class:`~numpy.ndarray`
            The velocities for all cells.
        X: :class:`~numpy.ndarray`
            The coordinates for all cells.
        nbrs_idx: list (optional, default None)
            a list of neighbor indices for each cell. If None a KNN will be performed instead.
        k: int (optional, default 30)
            The number of neighbors for the KNN search.
        normalize_v: bool (optional, default False)
            Whether or not to normalize the velocity vectors.
        E_func: str, function, or None (optional, default None)
            A variance stabilizing function for reducing the variance of the flows.
            If a string is passed, there are two options:
                'sqrt': the numpy.sqrt square root function;
                'exp': the numpy.exp exponential function.

    Returns
    -------
        E: :class:`~numpy.ndarray`
            The edge matrix.
        nbrs_idx: list
            Neighbor indices.
    """
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if n > 200000 and d > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, _ = nbrs.query(X, k=k + 1)
        else:
            alg = "ball_tree" if d > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=alg, n_jobs=-1).fit(X)
            _, nbrs_idx = nbrs.kneighbors(X)

    if type(E_func) is str:
        if E_func == "sqrt":
            E_func = np.sqrt
        elif E_func == "exp":
            E_func = np.exp
        else:
            raise NotImplementedError("The specified edge function is not implemented.")

    # E = sp.csr_matrix((n, n))      # Making E a csr_matrix will slow down this process. Try lil_matrix maybe?
    E = np.zeros((n, n))
    for i in range(n):
        x = flatten(X[i])
        idx = nbrs_idx[i]
        if len(idx) > 0 and idx[0] == i:  # excluding the node itself from the neighbors
            idx = idx[1:]
        vi = flatten(V[i])
        if normalize_v:
            vi_norm = np.linalg.norm(vi)
            if vi_norm > 0:
                vi /= vi_norm

        # normalized differences
        U = X[idx] - x
        U_norm = np.linalg.norm(U, axis=1)
        U_norm[U_norm == 0] = 1
        U /= U_norm[:, None]

        for jj, j in enumerate(idx):
            vj = flatten(V[j])
            if normalize_v:
                vj_norm = np.linalg.norm(vj)
                if vj_norm > 0:
                    vj /= vj_norm
            u = flatten(U[jj])
            v = np.mean((vi.dot(u), vj.dot(u)))

            if E_func is not None:
                v = np.sign(v) * E_func(np.abs(v))
            E[i, j] = v
            E[j, i] = -v

    return E, nbrs_idx
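
A toy call; as the docstring states, the resulting edge matrix is antisymmetric, with E[i, j] > 0 indicating net flow from cell i toward cell j along the line connecting them:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
V = rng.normal(size=(200, 2))

E, nbrs_idx = graphize_velocity(V, X, k=15, normalize_v=True, E_func="sqrt")
assert np.allclose(E, -E.T)   # flow from i to j equals minus the flow from j to i
net_outflow = E.sum(1)        # per-cell net outgoing flow
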
Example #14
def extract_structural_backbone(t, data, s, max_angle=90, relaxation=0):
    """
    Construct simplified graphs connecting data points that have been projected to density ridges.

    Two graphs are constructed for different purposes. The first graph (g_simple) is constructed with two major steps:
    1. Construct a nearest neighbor graph on both the ridge positions and the raw data positions and combine them.
    2. Simplify the graph so that each point is only connected to up to 2^ridge_dimensionality points, using a set of
    filtering criteria.

    The second graph (g_mst) has two extra steps: 3. If the graph is not fully connected, connect the components by
    nearest neighbors between all pairs of components. 4. Construct a minimum spanning tree of the graph.

    Parameters
    -----------
    t : 2D array
        Density ridge positions. Typically projected to density ridges with quasildr.dridge.Scms.
    data : 2D array
        Original data points.
    s : object
        `quasildr.dridge.Scms` object which was used to produce `t`.
    max_angle : float, optional
        Maximum angle in degrees for filtering graph edges. Default is 90.
    relaxation : float, optional
        The relaxation parameter used to produce `t`. See the `quasildr.dridge.Scms.scms` documentation.

    Returns
    -----------
    g_simple : sparse matrix
        Simplified graph constructed without explicit shape or connectivity constraints (from step 2).
    g_mst : sparse matrix
        A tree-shaped graph connecting all points (from step 4).
    ridge_dims : 1D array
        Estimated local ridge dimensionality for each point.
    """
    h, _, _, _, _ = s._nlocal_inv_cov(t)
    eigvals, eigvecs = np.linalg.eigh(-h)
    eigvals = -eigvals
    ridge_dims = ((eigvals[:, 0] - eigvals[:, 1]) / (eigvals[:, 0] - eigvals[:, -1]) < relaxation) + 1


    gknn, _ = neighbors.neighbors(t, n_neighbors=50, smoothknn=False)
    gknnZ, _ = neighbors.neighbors(data, n_neighbors=50, smoothknn=False)
    gknn = gknn + gknn.T + gknnZ + gknnZ.T

    gknn.setdiag(0)
    gknn.eliminate_zeros()

    # remove edge connecting structures of different dimensionality
    gknn.data[ridge_dims[gknn.nonzero()[0]] != ridge_dims[gknn.nonzero()[1]]] = 0
    gknn.eliminate_zeros()


    # filter edges
    edges = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
    edges_norm = edges / (np.linalg.norm(edges, axis=1)[:, np.newaxis])

    angles = np.zeros(len(gknn.nonzero()[0]))
    angles_edge = np.zeros(len(gknn.nonzero()[0]))
    for d in np.unique(ridge_dims):
        if d == 1:
            ind = ridge_dims[gknn.nonzero()[0]] == 1
            angles[ind] = np.arccos(
                np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * eigvecs[gknn.nonzero()[1][ind], :, 0], axis=1),
                        -1, 1)) / np.pi * 180
            angles_edge[ind] = np.arccos(
                np.clip(np.sum(eigvecs[gknn.nonzero()[0][ind], :, 0] * edges_norm[ind, :], axis=1), -1,
                        1)) / np.pi * 180
            angles[ind] = np.minimum(angles[ind], 180 - angles[ind])
            angles_edge[ind] = np.minimum(angles_edge[ind], 180 - angles_edge[ind])
            gknn.data[(angles > max_angle) * (angles_edge > max_angle)] = 0
        else:
            # calculate mean principal angles
            ind = ridge_dims[gknn.nonzero()[0]] == d
            angles[ind] = np.mean(
                subspace_angles(eigvecs[gknn.nonzero()[0][ind], :, :d], eigvecs[gknn.nonzero()[1][ind], :, :d]), axis=1)
            gknn.data[(angles > max_angle)] = 0


    gknn.eliminate_zeros()
    # gknn = gknn + gknn.T
    n_components, labels = scipy.sparse.csgraph.connected_components(gknn)

    # simplify graph by connecting only closest nodes in the subspace
    edges_vecs = t[gknn.nonzero()[1], :] - t[gknn.nonzero()[0], :]
    edges_dist = np.linalg.norm(edges_vecs, axis=1)

    rowinds = []
    colinds = []
    # orient eigen vectors in the same directions
    for d in range(eigvecs.shape[2]):
        eigvecs[eigvecs[:, 0, d] < 0, :, d] *= -1

    for d in np.unique(ridge_dims):
        if d == 1:
            ind = ridge_dims[gknn.nonzero()[0]] == 1
            proj_vecs = np.sum(edges_vecs[ind, :] * eigvecs[gknn.nonzero()[0][ind], :, 0], axis=1)[:,
                        np.newaxis] * eigvecs[gknn.nonzero()[0][ind], :, 0]
        else:
            ind = ridge_dims[gknn.nonzero()[0]] == d
            proj_vecs = matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d], \
                                        matrix_multiply(eigvecs[gknn.nonzero()[0][ind], :, :d].transpose((0, 2, 1)),
                                                        edges_vecs[ind, :, np.newaxis]))

        proj_dists = edges_dist[ind]
        gknn.data[ind] = proj_dists
        gknn_directions = []
        for k in range(d):
            gknn_directions.append(gknn.copy())
            gknn_directions[k].data[ind] = proj_vecs[:, k].squeeze()

        from itertools import product
        for directions in list(product([-1, 1], repeat=len(gknn_directions))):
            for i in np.where(ridge_dims == d)[0]:
                dist_data = gknn[i, :].data
                direction_datas = [gd[i, :].data for gd in gknn_directions]
                conditions = [(directions[i] * direction_datas[i]) > 0 for i in range(len(directions))]
                conditions = np.all(np.vstack(conditions).T, axis=1)
                if np.any(conditions):
                    min_dist = np.min(dist_data[conditions])
                    if directions[0] < 0:
                        rowinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])
                        colinds.append(i)
                    else:
                        rowinds.append(i)
                        colinds.append(gknn[i, :].nonzero()[1][dist_data == min_dist][0])

    gedges = np.vstack([rowinds, colinds]).T
    gedges = np.unique(gedges, axis=0)
    g_simple = csr_matrix((np.repeat(1, gedges.shape[0]), (gedges[:, 0], gedges[:, 1])), shape=gknn.shape)

    n_components, labels = scipy.sparse.csgraph.connected_components(g_simple)

    # meta graph connecting each component.
    components_dimensionality = []
    for i in range(n_components):
        components_dimensionality.append(ridge_dims[labels == i][0])

    # To connect or not to connect

    fc_metaedges = []
    fc_edge_indices = []
    for i in range(n_components - 1):
        i_inds = np.where(labels == i)[0]
        if len(i_inds) > 1000:
            index_group_i = NNDescent(t[i_inds, :])
            index_group_data_i = NNDescent(data[i_inds, :])
        else:
            index_group_i = NearestNeighbors(n_neighbors=1).fit(t[i_inds, :])
            index_group_data_i = NearestNeighbors(n_neighbors=1).fit(data[i_inds, :])

        # for g_mst
        for j in range(i + 1, n_components):
            j_inds = np.where(labels == j)[0]
            if len(i_inds) > 1000:
                nn, _ = index_group_i.query(t[j_inds, :], k=1)
                _, dist = index_group_data_i.query(data[j_inds, :], k=1)
            else:
                _, nn = index_group_i.kneighbors(t[j_inds, :])
                dist, _ = index_group_data_i.kneighbors(data[j_inds, :])

            mindist = np.min(dist)
            fc_metaedges.append([i, j])
            fc_edge_indices.append([i_inds[nn[dist == mindist]][0], j_inds[np.where(dist == mindist)[0]][0]])


    if len(fc_edge_indices) > 0:
        fc_edge_indices = np.vstack(fc_edge_indices)
        g_fc_connections = csr_matrix(
            (np.repeat(2, fc_edge_indices.shape[0]), (fc_edge_indices[:, 0], fc_edge_indices[:, 1])), shape=gknn.shape)
        g_fc = g_simple + g_fc_connections
    else:
        g_fc = g_simple
    g_fc.data = np.linalg.norm(t[g_fc.nonzero()[0], :] - t[g_fc.nonzero()[1], :], axis=1)
    g_mst = minimum_spanning_tree(g_fc)
    g_simple.data = np.linalg.norm(t[g_simple.nonzero()[0], :] - t[g_simple.nonzero()[1], :], axis=1)

    return g_simple, g_mst, ridge_dims
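
A hypothetical end-to-end call, assuming `s` is a quasildr.dridge.Scms object fitted on `data` and `t` holds the ridge-projected positions obtained from it (both come from the quasildr package, which is not shown here):

g_simple, g_mst, ridge_dims = extract_structural_backbone(t, data, s, max_angle=90, relaxation=0)

# g_simple and g_mst are sparse adjacency matrices whose stored values are edge lengths;
# ridge_dims[i] is the estimated local ridge dimensionality of point i
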
Example #15
class KNNSearch:
    def __init__(self, features, kwargs):

        self.org_features = features
        if kwargs["normalize"]:
            self.features = preprocessing.normalize(features, norm='l2')
        else:
            self.features = features

        self.kwargs = kwargs
        self.predictor = None

    def fit(self):
        if self.kwargs['algorithm'] == 'datasketch':
            self.__datasketch_fit()
        elif self.kwargs['algorithm'] == 'annoy':
            self.__annoy_fit()
        elif self.kwargs['algorithm'] == 'exact':
            self.__exhaustive_fit()
        elif self.kwargs['algorithm'] == 'falconn':
            self.__falconn_fit()
        elif self.kwargs['algorithm'] == 'descent':
            self.__descent_fit()
        elif self.kwargs['algorithm'] == 'random':
            self.__random_fit()
        else:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                self.kwargs['algorithm']))

    def predict(self, input, k):
        if self.kwargs['algorithm'] == 'datasketch':
            return self.__datasketch_predict(input, k)
        elif self.kwargs['algorithm'] == 'annoy':
            return self.__annoy_predict(input, k)
        elif self.kwargs['algorithm'] == 'exact':
            return self.__exhaustive_predict(input, k)
        elif self.kwargs['algorithm'] == 'falconn':
            return self.__falconn_predict(input, k)
        elif self.kwargs['algorithm'] == 'descent':
            return self.__descent_predict(input, k)
        elif self.kwargs['algorithm'] == 'random':
            return self.__random_predict(input, k)
        else:
            raise Exception("Algorithm=[{}] not yet implemented".format(
                self.kwargs['algorithm']))

    def __datasketch_fit(self):
        if self.kwargs['create']:
            # Create a list of MinHash objects
            min_hash_obj_list = []
            forest = MinHashLSHForest(num_perm=self.kwargs['num_perm'])
            for i in range(len(self.features)):
                min_hash_obj_list.append(
                    MinHash(num_perm=self.kwargs['num_perm']))
                for d in self.features[i]:
                    min_hash_obj_list[i].update(d)
                forest.add(i, min_hash_obj_list[i])
            # IMPORTANT: must call index() otherwise the keys won't be searchable
            forest.index()
            with open(self.kwargs['file_path'], "wb") as f:
                pickle.dump(forest, f)
                pickle.dump(min_hash_obj_list, f)
            self.predictor = [forest, min_hash_obj_list]
        else:
            with open(self.kwargs['file_path'], "rb") as f:
                forest = pickle.load(f)
                min_hash_obj_list = pickle.load(f)
                self.predictor = [forest, min_hash_obj_list]

    def __datasketch_predict(self, input, k):
        forest, min_hash_obj_list = self.predictor
        if type(input) == int:
            return forest.query(min_hash_obj_list[input], k)
        else:
            min_hash_obj = MinHash(num_perm=self.kwargs['num_perm'])
            for d in input:
                min_hash_obj.update(d)
            return forest.query(min_hash_obj, k)

    def __annoy_fit(self):
        if self.kwargs['create']:
            indexer = AnnoyIndex(self.features.shape[1], self.kwargs['metric'])
            for i, f in enumerate(self.features):
                indexer.add_item(i, f)
            indexer.build(self.kwargs['num_trees'])
            indexer.save(self.kwargs['file_path'])
            self.predictor = indexer
        else:
            forest = AnnoyIndex(self.features.shape[1], self.kwargs['metric'])
            forest.load(self.kwargs['file_path'])
            self.predictor = forest

    def __annoy_predict(self, input, k):
        annoy_forest = self.predictor
        if type(input) == int:
            return annoy_forest.get_nns_by_item(input,
                                                k,
                                                search_k=-1,
                                                include_distances=False)
        else:
            return annoy_forest.get_nns_by_vector(input,
                                                  k,
                                                  search_k=-1,
                                                  include_distances=False)

    def __exhaustive_fit(self):
        self.predictor = NearestNeighbors(algorithm='ball_tree')
        self.predictor.fit(self.features)

    def __exhaustive_predict(self, input, k):
        if type(input) == int:
            return self.predictor.kneighbors(self.features[input].reshape(
                1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]
        else:
            return self.predictor.kneighbors(input.reshape(1, -1),
                                             n_neighbors=k,
                                             return_distance=False)[0]

    def __falconn_fit(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.
        """

        import falconn

        dimension = self.features.shape[1]
        nb_tables = self.kwargs['nb_tables']
        number_bits = self.kwargs['number_bits']

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # add features to falconn table
        self._falconn_table.setup(self.features)

    def __falconn_predict(self, input, k):

        # Normalize input if you care about the cosine similarity
        if type(input) == int:
            input = self.features[input]
        else:
            if self.kwargs['normalize']:
                input /= np.linalg.norm(input)
                # Center the input and the queries: this improves the performance of LSH quite a bit.
                input -= self.center

        # Late falconn query_object construction
        # Since I suppose there might be an error
        # if table.setup() will be called after
        if self._falconn_query_object is None:
            self._falconn_query_object = self._falconn_table.construct_query_object(
            )
            self._falconn_query_object.set_num_probes(self._FALCONN_NB_TABLES)

        query_res = self._falconn_query_object.find_k_nearest_neighbors(
            input, k)
        return query_res

    def __descent_fit(self):
        self.predictor = NNDescent(data=self.features,
                                   metric=self.kwargs['metric'])

    def __descent_predict(self, input, k):
        input = np.expand_dims(
            input, axis=0)  # input should be an array of search points
        index = self.predictor
        return index.query(input, k)[0][
            0]  # returns indices of NN, distances of the NN from the input

    def __random_fit(self):
        pass

    def __random_predict(self, input, k):
        rand_index_list = []
        for i in range(k):
            rand_index_list.append(random.randint(0, len(self.features) - 1))

        return rand_index_list
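
A usage sketch for the exact-search backend; the kwargs keys mirror the ones the class reads above (other backends additionally expect keys such as num_perm, metric, num_trees, file_path and create):

import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(1000, 64)).astype(np.float32)

searcher = KNNSearch(features, {"normalize": True, "algorithm": "exact"})
searcher.fit()
neighbors_of_item_0 = searcher.predict(0, k=10)            # query by row index into features
neighbors_of_vector = searcher.predict(features[1], k=10)  # query by raw feature vector
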
Example #16
def cluster_field(adata,
                  basis='pca',
                  embedding_basis=None,
                  normalize=True,
                  method='louvain',
                  cores=1,
                  **kwargs):
    """Cluster cells based on vector field features.

    We would like to see whether the vector field can be used to better define cell states/types. This can be assessed via
    characterizing critical points (attractors/saddles/repellers, etc.) and characteristic curves (nullclines, separatrices).
    However, calculating those is not easy; for example, a strict definition of an attractor is a state where the velocity
    is 0 and all eigenvalues of the Jacobian matrix at that point are negative. Under this strict definition, the attractors
    we find may sometimes be far away from our sampled cell states, which makes them less meaningful. This is not
    unexpected, as the learned vector field is defined via a set of basis functions based on Gaussian kernels, and it is
    therefore hard to satisfy that strict definition.

    Fortunately, we can handle this better with a different set of ideas. Instead of relying on the critical points of
    classical dynamical-systems methods, we can use machine learning approaches that extract geometric features of
    streamlines to "cluster the vector field space" and thereby define cell states/types. This requires calculating potential
    (ordered pseudotime), speed, curliness, divergence, acceleration, curvature, etc. Because the Jacobian matrix can be
    computed analytically, these quantities of the vector field function can be obtained conveniently and efficiently.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`.
        adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
        the object should include both unspliced and spliced gene expression of cells.
    basis: `str` or None (default: `pca`)
        The space that will be used for calculating vector field features. Valid names include, for example, `pca`, `umap`, etc.
    embedding_basis: `str` or None (default: `None`)
        The embedding basis that will be combined with the vector field feature space for clustering.
    normalize: `bool` (default: `True`)
        Whether to mean center and scale each feature across all cells so that each feature has zero mean and unit variance.
    method: `str` (default: `louvain`)
        The method that will be used for clustering, one of `{'kmeans', 'hdbscan', 'louvain', 'leiden'}`. If `louvain`
        or `leiden` is used, you need to have `scanpy` installed.
    cores: `int` (default: 1)
        The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.
    kwargs:
        Any additional arguments that will be passed to either kmeans, hdbscan, louvain or leiden clustering algorithms.

    Returns
    -------
        Nothing is returned; `adata` is updated in place with the cluster labels in `.obs` and, for louvain/leiden,
        the feature kNN graph in `.obsp['feature_knn']`.
    """

    if method in ['louvain', 'leiden']:
        try:
            import scanpy as sc
        except ImportError:
            raise ImportError(
                "You need to install the excellent package `scanpy` if you want to use louvain or leiden "
                "for clustering.")

    feature_key = [
        'speed_' + basis, basis + '_ddhodge_potential', 'divergence_' + basis,
        'acceleration_' + basis, 'curvature_' + basis
    ]

    if feature_key[0] not in adata.obs.keys():
        from .vector_calculus import speed
        speed(adata, basis=basis)
    if feature_key[1] not in adata.obs.keys():
        from ..ext import ddhodge
        ddhodge(adata, basis=basis)
    if feature_key[2] not in adata.obs.keys():
        from .vector_calculus import divergence
        divergence(adata, basis=basis)
    if feature_key[3] not in adata.obs.keys():
        from .vector_calculus import acceleration
        acceleration(adata, basis=basis)
    if feature_key[4] not in adata.obs.keys():
        from .vector_calculus import curvature
        curvature(adata, basis=basis)

    feature_data = adata.obs.loc[:, feature_key].values
    if embedding_basis is None: embedding_basis = basis
    X = np.hstack((feature_data, adata.obsm['X_' + embedding_basis]))

    if normalize:
        # X = (X - X.min(0)) / X.ptp(0)
        X = (X - X.mean(0)) / X.std(0)

    if method in ['hdbscan', 'kmeans']:
        if method == 'hdbscan':
            hdbscan(adata, X_data=X, **kwargs)
        elif method == 'kmeans':
            from sklearn.cluster import KMeans
            kmeans = KMeans(random_state=0, **kwargs).fit(X)
            adata.obs['kmeans'] = kmeans.labels_.astype('str')

    elif method in ['louvain', 'leiden']:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=31,
                             n_jobs=cores,
                             random_state=19491001)
            nbrs_idx, dist = nbrs.query(X, k=31)
        else:
            nbrs = NearestNeighbors(n_neighbors=31, n_jobs=cores).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        row = np.repeat(nbrs_idx[:, 0], 30)
        col = nbrs_idx[:, 1:].flatten()
        g = csr_matrix((np.repeat(1, len(col)), (row, col)),
                       shape=(adata.n_obs, adata.n_obs))
        adata.obsp['feature_knn'] = g

        if method == 'louvain':
            sc.tl.louvain(adata, obsp='feature_knn', **kwargs)
        elif method == 'leiden':
            sc.tl.leiden(adata, obsp='feature_knn', **kwargs)
Example #17
def diffusionMatrix(
    adata,
    X_data=None,
    V_data=None,
    genes=None,
    layer=None,
    basis="umap",
    dims=None,
    n=30,
    VecFld=None,
    residual="vector_field",
):
    """ "Calculate the diffusion matrix from the estimated velocity vector and the reconstructed vector field.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        X_data: `np.ndarray` (default: `None`)
            The user supplied expression (embedding) data that will be used for calculating diffusion matrix directly.
        V_data: `np.ndarray` (default: `None`)
            The user supplied velocity data that will be used for calculating diffusion matrix directly.
        genes: `list` or None (default: `None`)
            The list of genes that will be used to subset the data. If `None`, all genes will be used.
        layer: `str` or None (default: None)
            Which layer of the data will be used for diffusion matrix calculation.
        basis: `str` (default: `umap`)
            Which basis of the data will be used for diffusion matrix calculation.
        dims: `list` or None (default: `None`)
            The list of dimensions that will be selected for diffusion matrix calculation. If `None`, all dimensions will be used.
        n: `int` (default: `30`)
            Number of nearest neighbors when the nearest neighbor graph is not included.
        VecFld: `dictionary` or None (default: None)
            The reconstructed vector field function.
        residual: `str` or None (default: `vector_field`)
            Method to calculate residual velocity vectors for diffusion matrix calculation. If `average`, the average
            velocity of the nearest neighbor cells is subtracted from each cell's velocity; if `vector_field`, the
            velocity predicted by the reconstructed deterministic vector field is subtracted instead.

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            `AnnData` object that is updated with the `diffusion_matrix` key in the `uns` attribute which is a list of
            the diffusion matrix for each cell. A column `diffusion` corresponds to the square root of the sum of all
            elements for each cell's diffusion matrix will also be added.
    """

    if X_data is None or V_data is None:
        if genes is not None:
            genes = adata.var_names.intersection(genes).to_list()
            if len(genes) == 0:
                raise ValueError(f"no genes from your genes list appear in your adata object.")
        if layer is not None:
            if layer not in adata.layers.keys():
                raise ValueError(f"the layer {layer} you provided is not included in the adata object!")

            if basis is None:
                vkey = "velocity_" + layer[0].upper()
                if vkey not in adata.obsm.keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

        if VecFld is None:
            VecFld, func = vecfld_from_adata(adata, basis)
        else:
            func = lambda x: vector_field_function(x, VecFld)

        prefix = "X_" if layer is None else layer + "_"

        if basis is not None:
            if basis.split(prefix)[-1] not in [
                "pca",
                "umap",
                "trimap",
                "tsne",
                "diffmap",
            ]:
                raise ValueError(
                    f"basis (or the suffix of basis) can only be one of "
                    f"['pca', 'umap', 'trimap', 'tsne', 'diffmap']."
                )
            if basis.startswith(prefix):
                basis = basis
                vkey = "velocity_" + basis.split(prefix)[-1]
            else:
                vkey = "velocity_" + basis
                basis = prefix + basis

            if vkey not in adata.obsm_keys():
                raise ValueError(
                    f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                )

        if basis is None:
            if layer is None:
                vkey = "velocity_S"
                if vkey not in adata.uns_keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

                if genes is not None:
                    X_data, V_data = (
                        adata[:, genes].X,
                        adata[:, genes].uns[vkey],
                    )
                else:
                    if "use_for_dynamics" not in adata.var.keys():
                        X_data, V_data = adata.X, adata.uns[vkey]
                    else:
                        X_data, V_data = (
                            adata[:, adata.var.use_for_dynamics].X,
                            adata[:, adata.var.use_for_dynamics].uns[vkey],
                        )
            else:
                vkey = "velocity_" + layer[0].upper()
                if vkey not in adata.uns_keys():
                    raise ValueError(
                        f"the data corresponds to the velocity key {vkey} is not included in the adata object!"
                    )

                if genes is not None:
                    X_data, V_data = (
                        adata[:, genes].layers[layer],
                        adata[:, genes].uns[vkey],
                    )
                else:
                    if "use_for_dynamics" not in adata.var.keys():
                        X_data, V_data = adata.layers[layer], adata.uns[vkey]
                    else:
                        X_data, V_data = (
                            adata[:, adata.var.use_for_dynamics].layers[layer],
                            adata[:, adata.var.use_for_dynamics].uns[vkey],
                        )
                X_data = log1p_(adata, X_data)
        else:
            X_data, V_data = adata.obsm[basis], adata.obsm[vkey]

    if dims is not None:
        X_data, V_data = X_data[:, dims], V_data[:, dims]

    neighbor_result_prefix = "" if layer is None else layer
    conn_key, dist_key, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix)
    if neighbor_key not in adata.uns_keys() or (X_data is not None and V_data is not None):
        if X_data.shape[0] > 200000 and X_data.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X_data,
                metric="euclidean",
                n_neighbors=n,
                n_jobs=-1,
                random_state=19491001,
            )
            Idx, _ = nbrs.query(X_data, k=n)
        else:
            alg = "ball_tree" if X_data.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n, algorithm=alg, n_jobs=-1).fit(X_data)
            _, Idx = nbrs.kneighbors(X_data)
    else:
        check_and_recompute_neighbors(adata, result_prefix=layer)
        conn_key = "connectivities" if layer is None else layer + "_connectivities"
        neighbors = adata.obsp[conn_key]
        Idx = neighbors.tolil().rows

    if residual == "average":
        V_ave = np.zeros_like(V_data)
        for i in range(X_data.shape[0]):
            vv = V_data[Idx[i]]
            V_ave[i] = vv.mean(0)
    elif residual == "vector_field":
        V_ave = func(X_data)
    else:
        raise ValueError(
            f"The method for calculate residual {residual} is not supported. "
            f'Currently only {"average", "vector_field"} supported.'
        )

    V_diff = V_data - V_ave
    val = np.zeros((V_data.shape[0], 1))
    dmatrix = [None] * V_data.shape[0]
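
    # For each cell, the diffusion matrix is estimated as the covariance of the residual
    # velocities of its nearest neighbors; `val` records the square root of the sum of all
    # entries of that matrix, which is stored below as the `diffusion` column in `.obs`.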

    for i in tqdm(range(X_data.shape[0]), "calculating diffusion matrix for each cell."):
        vv = V_diff[Idx[i]]
        d = np.cov(vv.T)
        val[i] = np.sqrt(sum(sum(d)))
        dmatrix[i] = d

    adata.obs["diffusion"] = val
    adata.uns["diffusion_matrix"] = dmatrix
Example #18
0
def cell_velocities(adata,
                    ekey=None,
                    vkey=None,
                    X=None,
                    V_mat=None,
                    X_embedding=None,
                    use_mnn=False,
                    neighbors_from_basis=False,
                    n_pca_components=None,
                    min_r2=0.01,
                    min_alpha=0.01,
                    min_gamma=0.01,
                    min_delta=0.01,
                    basis="umap",
                    method="pearson",
                    neg_cells_trick=True,
                    calc_rnd_vel=False,
                    xy_grid_nums=(50, 50),
                    correct_density=True,
                    scale=True,
                    sample_fraction=None,
                    random_seed=19491001,
                    other_kernels_dict={},
                    enforce=False,
                    key=None,
                    preserve_len=False,
                    **kmc_kwargs):
    """Compute transition probability and project high dimension velocity vector to existing low dimension embedding.

    It is powered by the Itô kernel that not only considers the correlation between the vector from any cell to its
    nearest neighbors and its velocity vector but also the corresponding distances. We expect this new kernel will enable
    us to visualize more intricate vector flow or steady states in low dimension. We also expect it will improve the
    calculation of the stationary distribution or source states of sampled cells. The original "correlation/cosine"
    velocity projection method is also supported. Kernels based on the reconstructed velocity field are also possible.

    With the `key` argument, `cell_velocities` can be called by `cell_accelerations` to calculate RNA acceleration vector
    for each cell.

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            an AnnData object.
        ekey: `str` or None (optional, default `None`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, ekey and vkey
            will be automatically detected from the adata object.
        vkey: 'str' or None (optional, default `None`)
            The dictionary key that corresponds to the estimated velocity values in the layers attribute.
        X: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single cells)
        V_mat: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of single
            cells). Note that X and V_mat need to have the exact same dimensionalities.
        X_embedding: 'np.ndarray' or None (optional, default `None`)
            The low dimensional reduced space (pca, umap, tsne, etc.) of single cells that RNA velocity will be projected
            onto. Note X_embedding, X and V_mat have to have the same cell/sample dimension and X_embedding should have
            fewer feature dimensions than X or V_mat.
        use_mnn: `bool` (optional, default `False`)
            Whether to use mutual nearest neighbors for projecting the high dimensional velocity vectors. By default, we
            don't use the mutual nearest neighbors. Mutual nearest neighbors are calculated from nearest neighbors across
            different layers, which accounts for cases where, for example, the cells from spliced expression may be
            nearest neighbors but far from nearest neighbors on unspliced data. Using mnn assumes your data from different
            layers are reliable (otherwise it will destroy real signals).
        neighbors_from_basis: `bool` (optional, default `False`)
            Whether to construct nearest neighbors from low dimensional space as defined by the `basis`, instead of using
            that calculated during UMAP process.
        n_pca_components: `int` (optional, default `None`)
            The number of pca components to project the high dimensional X, V before calculating transition matrix for
            velocity visualization. By default it is None and if method is `kmc`, n_pca_components will be reset to 30;
            otherwise use all high dimensional data for velocity projection.
        min_r2: `float` (optional, default `0.01`)
            The minimal value of r-squared of the parameter fits for selecting velocity genes.
        min_alpha: `float` (optional, default `0.01`)
            The minimal value of alpha kinetic parameter for selecting velocity genes.
        min_gamma: `float` (optional, default `0.01`)
            The minimal value of gamma kinetic parameter for selecting velocity genes.
        min_delta: `float` (optional, default `0.01`)
            The minimal value of delta kinetic parameter for selecting velocity genes.
        basis: `str` (optional, default `umap`)
            The dictionary key that corresponds to the reduced dimension in `.obsm` attribute.
        method: `string` (optional, default `pearson`)
            The method to calculate the transition matrix and project high dimensional vector to low dimension, either `kmc`,
            `cosine`, `pearson`, or `transform`. "kmc" is our new approach to learn the transition matrix via diffusion
            approximation or an Itô kernel. "cosine" or "pearson" are the methods used in the original RNA velocity paper
            or the scvelo paper (Note that scVelo implementation actually centers both dX and V, so its cosine kernel is
            equivalent to pearson correlation kernel but we also provide the raw cosine kernel). The "kmc" option is arguably
            better than "correlation" or "cosine" as it not only considers the correlation but also the distance of the
            nearest neighbors to the high dimensional velocity vector. Finally, the "transform" method uses umap's transform
            method to transform new data points to the UMAP space. The "transform" method is NOT recommended. Kernels that
            are based on the reconstructed vector field in high dimension are also possible.
        neg_cells_trick: 'bool' (optional, default `True`)
            Whether we should handle cells having negative correlations in gene expression difference with high dimensional
            velocity vector separately. This option was borrowed from the scVelo package (https://github.com/theislab/scvelo)
            and is used in conjunction with the "pearson" and "cosine" kernels. Not required if method is set to be "kmc".
        calc_rnd_vel: `bool` (default: `False`)
            A logical flag to determine whether we will calculate the random velocity vectors which can be plotted
            downstream as a negative control and used to adjust the quiver scale of the velocity field.
        xy_grid_nums: `tuple` (default: `(50, 50)`).
            A tuple of number of grids on each dimension.
        correct_density: `bool` (default: `True`)
            Whether to correct density when calculating the markov transition matrix, applicable to the `kmc` kernel.
        scale: `bool` (default: `True`)
            Whether to scale velocity when calculating the markov transition matrix, applicable to the `kmc` kernel.
        sample_fraction: `None` or `float` (default: `None`)
            The downsampled fraction of kNN for the purpose of acceleration, applicable to the `kmc` kernel.
        random_seed: `int` (default: `19491001`)
            The random seed for numba to ensure consistency of the random velocity vectors. Default value 19491001 is a
            special day for those who care.
        key: `str` or None (default: `None`)
            The prefix key that will be prefixed to the keys for storing calculated transition matrix, projection vectors, etc.
        preserve_len: `bool` (default: `False`)
            Whether to preserve the length of the high dimensional vector. When set to be True, the length of the low
            dimension projected vector will be proportionally scaled to that of the high dimensional vector.
        other_kernels_dict: `dict` (default: `{}`)
            A dictionary of parameters that will be passed to the cosine/correlation kernel.
        enforce: `bool` (default: `False`)
            Whether to enforce 1) redefining use_for_velocity column in obs attribute;
                               2) recalculation of transition matrix.

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            Returns an updated `~anndata.AnnData` with transition_matrix and projected embedding of high dimension velocity
            vectors in the existing embeddings of current cell state, calculated using either the Itô kernel method
            (default) or the diffusion approximation or the method from (La Manno et al. 2018).
    """

    mapper_r = get_mapper_inverse()
    layer = mapper_r[ekey] if (ekey is not None
                               and ekey in mapper_r.keys()) else ekey
    ekey, vkey, layer = (get_ekey_vkey_from_adata(adata) if
                         (ekey is None or vkey is None) else
                         (ekey, vkey, layer))

    if calc_rnd_vel:
        numba_random_seed(random_seed)

    if (not neighbors_from_basis) and ("neighbors" in adata.uns.keys()):
        if use_mnn:
            neighbors = adata.uns["mnn"]
            indices, dist = extract_indices_dist_from_graph(
                neighbors, adata.uns["neighbors"]["indices"].shape[1])
            indices, dist = indices[:, 1:], dist[:, 1:]
        else:
            if adata.obsp["distances"].shape[0] == adata.obsp[
                    "distances"].shape[1]:
                knn_indices, knn_dists = extract_indices_dist_from_graph(
                    adata.obsp["distances"], 30
                    # np.min((adata.uns["neighbors"]["connectivities"] > 0).sum(1).A)
                )
                knn_dists = build_distance_graph(knn_indices, knn_dists)

                adata.uns["neighbors"]["indices"], adata.obsp[
                    "distances"] = knn_indices, knn_dists
            neighbors, dist, indices = (
                adata.obsp["connectivities"],
                adata.obsp["distances"],
                adata.uns["neighbors"]["indices"],
            )
            indices, dist = indices[:, 1:], dist[:, 1:]

    if 'use_for_velocity' not in adata.var.keys() or enforce:
        use_for_dynamics = True if "use_for_dynamics" in adata.var.keys(
        ) else False
        adata = set_velocity_genes(
            adata,
            vkey="velocity_S",
            min_r2=min_r2,
            use_for_dynamics=use_for_dynamics,
            min_alpha=min_alpha,
            min_gamma=min_gamma,
            min_delta=min_delta,
        )

    X = adata[:, adata.var.use_for_velocity.
              values].layers[ekey] if X is None else X
    V_mat = (adata[:, adata.var.use_for_velocity.values].layers[vkey] if vkey
             in adata.layers.keys() else None) if V_mat is None else V_mat

    if X.shape != V_mat.shape or X.shape[0] != adata.n_obs:
        raise Exception(
            f"X and V_mat don't have the same dimensionalities or X/V_mat don't have {adata.n_obs} rows!"
        )

    if X_embedding is None:
        if vkey == "velocity_S":
            X_embedding = adata.obsm["X_" + basis]
        else:
            adata = reduceDimension(adata, layer=layer, reduction_method=basis)
            X_embedding = adata.obsm[layer + "_" + basis]

    if X.shape[0] != X_embedding.shape[0] or X.shape[1] < X_embedding.shape[1]:
        raise Exception(
            f"X and X_embedding don't have the same sample dimension or "
            f"X doesn't have the higher feature dimension!")

    V_mat = V_mat.A if issparse(V_mat) else V_mat
    X = X.A if issparse(X) else X
    finite_inds = get_finite_inds(V_mat)
    X, V_mat = X[:, finite_inds], V_mat[:, finite_inds]

    if method == 'kmc' and n_pca_components is None: n_pca_components = 30
    if n_pca_components is not None:
        X = log1p_(adata, X)
        X_plus_V = log1p_(adata, X + V_mat)
        if ("velocity_pca_fit" not in adata.uns_keys()
                or type(adata.uns["velocity_pca_fit"]) == str):
            pca = PCA(
                n_components=min(n_pca_components, X.shape[1] - 1),
                svd_solver="arpack",
                random_state=0,
            )
            pca_fit = pca.fit(X)
            X_pca = pca_fit.transform(X)

            adata.uns["velocity_pca_fit"] = pca_fit
            adata.uns["velocity_PCs"] = pca_fit.components_.T
            adata.obsm["X_velocity_pca"] = X_pca

        X_pca, PCs, pca_fit = (
            adata.obsm["X_velocity_pca"],
            adata.uns["velocity_PCs"],
            adata.uns["velocity_pca_fit"],
        )

        Y_pca = pca_fit.transform(X_plus_V)
        V_pca = Y_pca - X_pca
        # V_pca = (V_mat - V_mat.mean(0)).dot(PCs)

        adata.obsm["velocity_pca_raw"] = V_pca
        X, V_mat = X_pca[:, :n_pca_components], V_pca[:, :n_pca_components]

    if neighbors_from_basis:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(X,
                             metric='euclidean',
                             n_neighbors=30,
                             n_jobs=-1,
                             random_state=19490110)
            indices, _ = nbrs.query(X, k=30)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else 'kd_tree'
            nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg,
                                    n_jobs=-1).fit(X)
            _, indices = nbrs.kneighbors(X)

    # add both source and sink distribution
    if method == "kmc":
        if method + '_transition_matrix' in adata.uns_keys() and not enforce:
            T = adata.uns[method + '_transition_matrix']
            kmc = KernelMarkovChain(P=T)
        else:
            kmc = KernelMarkovChain()
        kmc_args = {
            "n_recurse_neighbors": 2,
            "M_diff": 2,
            "epsilon": None,
            "adaptive_local_kernel": True,
            "tol": 1e-7,
        }
        kmc_args = update_dict(kmc_args, kmc_kwargs)

        if method + '_transition_matrix' not in adata.uns_keys(
        ) or enforce:
            kmc.fit(X,
                    V_mat,
                    neighbor_idx=indices,
                    sample_fraction=sample_fraction,
                    **kmc_args)  #

        T = kmc.P
        if correct_density:
            delta_X = kmc.compute_density_corrected_drift(
                X_embedding, kmc.Idx, normalize_vector=True,
                scale=scale)  # indices, k = 500
        else:
            delta_X = kmc.compute_drift(X_embedding, num_prop=1,
                                        scale=scale)  # indices, k = 500

        # P = kmc.compute_stationary_distribution()
        # adata.obs['stationary_distribution'] = P
        X_grid, V_grid, D = velocity_on_grid(X_embedding,
                                             delta_X,
                                             xy_grid_nums=xy_grid_nums)

        if calc_rnd_vel:
            kmc = KernelMarkovChain()
            permute_rows_nsign(V_mat)
            kmc.fit(X, V_mat, **kmc_args)  # neighbor_idx=indices,
            T_rnd = kmc.P
            if correct_density:
                delta_X_rnd = kmc.compute_density_corrected_drift(
                    X_embedding, kmc.Idx,
                    normalize_vector=True)  # indices, k = 500
            else:
                delta_X_rnd = kmc.compute_drift(X_embedding)
            # P_rnd = kmc.compute_stationary_distribution()
            # adata.obs['stationary_distribution_rnd'] = P_rnd
            X_grid_rnd, V_grid_rnd, D_rnd = velocity_on_grid(
                X_embedding, delta_X_rnd, xy_grid_nums=xy_grid_nums)

        adata.uns["kmc"] = kmc
    elif method in ["pearson", "cosine"]:
        vs_kwargs = {
            "n_recurse_neighbors": 2,
            "max_neighs": None,
            "transform": 'sqrt',
            "use_neg_vals": True,
        }
        vs_kwargs = update_dict(vs_kwargs, other_kernels_dict)

        if method + '_transition_matrix' in adata.uns_keys() and not enforce:
            T = adata.uns[method + '_transition_matrix']
            delta_X = projection_with_transition_matrix(
                X.shape[0], T, X_embedding)
            X_grid, V_grid, D = velocity_on_grid(
                X_embedding[:, :2], (X_embedding + delta_X)[:, :2],
                xy_grid_nums=xy_grid_nums)
        else:
            T, delta_X, X_grid, V_grid, D = kernels_from_velocyto_scvelo(
                X, X_embedding, V_mat, indices, neg_cells_trick, xy_grid_nums,
                neighbors, method, **vs_kwargs)

        if calc_rnd_vel:
            permute_rows_nsign(V_mat)
            T_rnd, delta_X_rnd, X_grid_rnd, V_grid_rnd, D_rnd = kernels_from_velocyto_scvelo(
                X, X_embedding, V_mat, indices, neg_cells_trick, xy_grid_nums,
                neighbors, method, **vs_kwargs)
    elif method == "transform":
        umap_trans, n_pca_components = (
            adata.uns["umap_fit"]["fit"],
            adata.uns["umap_fit"]["n_pca_components"],
        )

        if "pca_fit" not in adata.uns_keys() or type(
                adata.uns["pca_fit"]) == str:
            CM = adata.X[:, adata.var.use_for_dynamics.values]
            from ..preprocessing.utils import pca

            adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X")
            adata.uns["pca_fit"] = pca_fit

        X_pca, pca_fit = adata.obsm["X"], adata.uns["pca_fit"]
        V = (adata[:, adata.var.use_for_dynamics.values].layers[vkey]
             if vkey in adata.layers.keys() else None)
        CM, V = CM.A if issparse(CM) else CM, V.A if issparse(V) else V
        V[np.isnan(V)] = 0
        Y_pca = pca_fit.transform(CM + V)

        Y = umap_trans.transform(Y_pca)

        delta_X = Y - X_embedding

        X_grid, V_grid, D = velocity_on_grid(X_embedding,
                                             delta_X,
                                             xy_grid_nums=xy_grid_nums)

    if preserve_len:
        basis_len, high_len = np.linalg.norm(delta_X,
                                             axis=1), np.linalg.norm(V_mat,
                                                                     axis=1)
        scaler = np.nanmedian(basis_len) / np.nanmedian(high_len)
        for i in tqdm(range(adata.n_obs), desc=f"rescaling velocity norm..."):
            idx = T[i].indices
            high_len_ = high_len[idx]
            T_i = T[i].data
            delta_X[i] *= T_i.dot(high_len_) / basis_len[i] * scaler

    if key is None:
        adata.uns[method + "_transition_matrix"] = T
        adata.obsm["velocity_" + basis] = delta_X
        adata.uns["grid_velocity_" + basis] = {
            "X_grid": X_grid,
            "V_grid": V_grid,
            "D": D
        }
    else:
        adata.uns[key + '_' + method + "_transition_matrix"] = T
        adata.obsm[key + '_' + basis] = delta_X
        adata.uns["grid_" + key + '_' + basis] = {
            "X_grid": X_grid,
            "V_grid": V_grid,
            "D": D
        }

    if calc_rnd_vel:
        if key is None:
            adata.uns[method + "_transition_matrix_rnd"] = T_rnd
            adata.obsm["X_" + basis + "_rnd"] = X_embedding
            adata.obsm["velocity_" + basis + "_rnd"] = delta_X_rnd
            adata.uns["grid_velocity_" + basis + "_rnd"] = {
                "X_grid": X_grid_rnd,
                "V_grid": V_grid_rnd,
                "D": D_rnd,
            }
        else:
            adata.uns[key + '_' + method + "_transition_matrix_rnd"] = T_rnd
            adata.obsm["X_" + key + "_" + basis + "_rnd"] = X_embedding
            adata.obsm[key + "_" + basis + "_rnd"] = delta_X_rnd
            adata.uns["grid_" + key + '_' + basis + "_rnd"] = {
                "X_grid": X_grid_rnd,
                "V_grid": V_grid_rnd,
                "D": D_rnd,
            }

    return adata
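
# Usage sketch (illustrative, not part of the source above). A typical call, assuming the function
# is exposed as `dyn.tl.cell_velocities` (the module path is an assumption):
#
#   import dynamo as dyn
#   adata = dyn.tl.cell_velocities(adata, method="pearson", basis="umap")
#   adata.obsm["velocity_umap"]              # velocity vectors projected onto the umap embedding
#   adata.uns["pearson_transition_matrix"]   # cell-cell transition matrix used for the projection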
Example #19
0
def cell_wise_confidence(
    adata,
    X_data=None,
    V_data=None,
    ekey="M_s",
    vkey="velocity_S",
    neighbors_from_basis=False,
    method="jaccard",
):
    """Calculate the cell-wise velocity confidence metric.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an AnnData object.
        X_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The expression states of single cells (or expression states in reduced dimension, like pca, of single cells)
        V_data: 'np.ndarray' or `sp.csr_matrix` or None (optional, default `None`)
            The RNA velocity of single cells (or velocity estimates projected to reduced dimension, like pca, of single
            cells). Note that X_data and V_data need to have the exact same dimensionalities.
        ekey: `str` (optional, default `M_s`)
            The dictionary key that corresponds to the gene expression in the layer attribute. By default, it is the
            smoothed expression `M_s`.
        vkey: 'str' (optional, default `velocity_S`)
            The dictionary key that corresponds to the estimated velocity values in layers attribute.
        neighbors_from_basis: `bool` (optional, default `False`)
            Whether to construct nearest neighbors from low dimensional space as defined by the `basis`, instead of using
            that calculated during UMAP process.
        method: `str` (optional, default `jaccard`)
            Which method will be used for calculating the cell wise velocity confidence metric. By default it uses the
            `jaccard` index, which measures how well each velocity vector meets the geometric constraints defined by the
            local neighborhood structure. The Jaccard index is calculated as the fraction of the number of the intersected
            set of nearest neighbors from each cell at current expression state (X) and that from the future expression
            state (X + V) over the number of the union of these two sets. The `cosine` or `correlation` method is similar
            to that used by scVelo (https://github.com/theislab/scvelo).

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            Returns an updated `~anndata.AnnData` with a `.obs[method + '_velocity_confidence']` column storing the cell-wise velocity confidence.
    """

    if method in ["cosine", "consensus", "correlation"]:
        if "indices" not in adata.uns["neighbors"].keys():
            adata.uns["neighbors"]["indices"], _ = adj_to_knn(
                adata.obsp["connectivities"],
                n_neighbors=adata.uns["neighbors"]["params"]["n_neighbors"])

    if ekey == "X":
        X, V = (
            adata.X if X_data is None else X_data,
            adata.layers[vkey] if V_data is None else V_data,
        )
        norm_method = adata.uns["pp"]["norm_method"].copy()
        adata.uns["pp"]["norm_method"] = "log1p"
        X = inverse_norm(adata, X) if X_data is None else X_data
        adata.uns["pp"]["norm_method"] = norm_method
    else:
        X, V = (
            adata.layers[ekey] if X_data is None else X_data,
            adata.layers[vkey] if V_data is None else V_data,
        )
        X = inverse_norm(adata, X) if X_data is None else X_data

    if not neighbors_from_basis:
        check_and_recompute_neighbors(adata, result_prefix="")
        n_neigh, X_neighbors = (
            adata.uns["neighbors"]["params"]["n_neighbors"],
            adata.obsp["connectivities"],
        )
    else:
        n_neigh = 30

        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=n_neigh + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=n_neigh + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=n_neigh + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

        row = np.repeat(nbrs_idx[:, 0], n_neigh)
        col = nbrs_idx[:, 1:].flatten()
        X_neighbors = csr_matrix(
            (np.repeat(1, len(col)), (row, col)),
            shape=(adata.n_obs, adata.n_obs),
        )

    n_neigh = n_neigh[0] if type(n_neigh) == np.ndarray else n_neigh
    n_pca_components = adata.obsm["X"].shape[1]

    finite_inds = get_finite_inds(V, 0)
    X, V = X[:, finite_inds], V[:, finite_inds]
    if method == "jaccard":
        jac, _, _ = jaccard(X, V, n_pca_components, n_neigh, X_neighbors)
        confidence = jac

    elif method == "hybrid":
        # this is inspired from the locality preservation paper
        jac, intersect_, _ = jaccard(X, V, n_pca_components, n_neigh,
                                     X_neighbors)

        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc=
                "calculating hybrid method (jaccard + consensus) based cell wise confidence",
        ):
            neigh_ids = np.where(
                intersect_[i].A)[0] if issparse(intersect_) else np.where(
                    intersect_[i])[0]
            confidence[i] = (jac[i] * np.mean([
                consensus(V[i].A.flatten(), V[j].A.flatten())
                for j in neigh_ids
            ]) if issparse(V) else jac[i] * np.mean(
                [consensus(V[i].flatten(), V[j].flatten())
                 for j in neigh_ids]))

    elif method == "cosine":
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating cosine based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                einsum_correlation(V[i].A, V[j].A.flatten(), type="cosine")[0,
                                                                            0]
                for j in neigh_ids
            ]) if issparse(V) else np.mean([
                einsum_correlation(
                    V[i][None, :], V[j].flatten(), type="cosine")[0, 0]
                for j in neigh_ids
            ]))

    elif method == "consensus":
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating consensus based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                consensus(V[i].A.flatten(), V[j].A.flatten())
                for j in neigh_ids
            ]) if issparse(V) else np.mean(
                [consensus(V[i], V[j].flatten()) for j in neigh_ids]))

    elif method == "correlation":
        # this is equivalent to scVelo
        check_and_recompute_neighbors(adata, result_prefix="")
        indices = adata.uns["neighbors"]["indices"]
        confidence = np.zeros(adata.n_obs)
        for i in tqdm(
                range(adata.n_obs),
                desc="calculating correlation based cell wise confidence",
        ):
            neigh_ids = indices[i]
            confidence[i] = (np.mean([
                einsum_correlation(V[i].A, V[j].A.flatten(), type="pearson")[0,
                                                                             0]
                for j in neigh_ids
            ]) if issparse(V) else np.mean([
                einsum_correlation(
                    V[i][None, :], V[j].flatten(), type="pearson")[0, 0]
                for j in neigh_ids
            ]))

    elif method == "divergence":
        pass

    else:
        raise Exception(
            "The input {} method for cell-wise velocity confidence calculation is not implemented"
            " yet".format(method))

    adata.obs[method + "_velocity_confidence"] = confidence

    return adata
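
# Usage sketch (illustrative, not part of the source above): compute the jaccard-based confidence
# and read it back from `.obs`; how the function above is exported is an assumption.
#
#   adata = cell_wise_confidence(adata, ekey="M_s", vkey="velocity_S", method="jaccard")
#   adata.obs["jaccard_velocity_confidence"]   # per-cell velocity confidence scores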
Example #20
0
class NearestNeighbors:
    """Greedy algorithm to balance a K-nearest neighbour graph
    It has an API similar to scikit-learn
    Parameters
    ----------
    k : int  (default=50)
        the number of neighbours in the final graph
    sight_k : int  (default=100)
        the number of neighbours in the initialization graph.
        It corresponds to the farthest neighbour that a sample is allowed to connect to
        when its closest neighbours are not allowed. If sight_k is reached then the matrix is filled
        with the sample itself
    maxl : int  (default=200)
        max degree of connectivity allowed. It avoids the presence of hubs in the graph: it is the
        maximum number of neighbours that are allowed to contact a node before the node is blocked
    mode : str (default="distance")
        decides which kind of output: "distance" or "connectivity"
    n_jobs : int  (default=-1)
        parallelization of the standard KNN search performed at initialization
    """
    def __init__(self,
                 k: int = 50,
                 sight_k: int = 100,
                 maxl: int = 200,
                 mode: str = "distance",
                 metric: str = "euclidean",
                 minkowski_p: int = 20,
                 n_jobs: int = -1) -> None:
        # input parameters
        self.k = k
        self.sight_k = sight_k
        self.maxl = maxl
        self.mode = mode
        self.metric = metric
        self.minkowski_p = minkowski_p
        self.n_jobs = n_jobs

        # NN graphs
        self.data = None
        self._nn = None  # raw KNN
        self.bknn = None  # balanced KNN
        self.dist = None  # balanced KNN distances
        self.dsi = None  # balanced KNN neighbor index
        self.l = None  # balanced KNN degree of connectivity
        self.mknn = None  # mutual KNN based on bknn
        self.rnn = None  # radius NN based on mknn

    @property
    def n_samples(self) -> int:
        return self.data.shape[0]

    def fit(self, data: np.ndarray, sight_k: int = None) -> Any:
        """Fits the model
        data: np.ndarray (samples, features)
            the data matrix to build the balanced KNN graph on
        sight_k: int
            the farthest point that a node is allowed to connect to when its closest neighbours are not allowed
        """
        self.data = data
        if sight_k is not None:
            self.sight_k = sight_k
        logging.debug(
            f"First search the {self.sight_k} nearest neighbours for {self.n_samples}"
        )
        np.random.seed(13)
        if self.metric == "correlation":
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         algorithm="brute")
            self._nn.fit(self.data)
        elif self.metric == "js":
            self._nn = NNDescent(data=self.data,
                                 metric=jensen_shannon_distance)
        else:
            self._nn = _NearestNeighbors(n_neighbors=self.sight_k + 1,
                                         metric=self.metric,
                                         p=self.minkowski_p,
                                         n_jobs=self.n_jobs,
                                         leaf_size=30)
            self._nn.fit(self.data)

        # call this to calculate bknn
        self.kneighbors_graph(mode='distance')
        return self

    def kneighbors(self,
                   X: np.ndarray = None,
                   maxl: int = None,
                   mode: str = "distance"
                   ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Finds the K-neighbors of a point.
            Returns indices of and distances to the neighbors of each point.
            Parameters
            ----------
            X : array-like, shape (n_query, n_features),
                The query point or points.
                If not provided, neighbors of each indexed point are returned.
                In this case, the query point is not considered its own neighbor.
            maxl: int
                max degree of connectivity allowed
            mode : "distance" or "connectivity"
                Decides the kind of output
            Returns
            -------
            dist_new : np.ndarray (samples, k+1)
                distances to the NN
            dsi_new : np.ndarray (samples, k+1)
                indexes of the NN, first column is the sample itself
            l: np.ndarray (samples)
                l[i] is the number of connections from other samples to the sample i
            NOTE:
            First column (0) corresponds to the sample itself, the nearest neighbour is at the second column (1)
        """
        if self._nn is None:
            raise ValueError('must fit() before generating kneighbors graphs')
        if X is not None:
            self.data = X
        if maxl is not None:
            self.maxl = maxl
        if mode == "distance":
            if self.metric == "js":
                self.dsi, self.dist = self._nn.query(self.data,
                                                     k=self.sight_k + 1)
            else:
                self.dist, self.dsi = self._nn.kneighbors(self.data,
                                                          return_distance=True)
        else:
            if self.metric == "js":
                self.dsi, _ = self._nn.query(self.data, k=self.sight_k + 1)
            else:
                self.dsi = self._nn.kneighbors(self.data,
                                               return_distance=False)
            self.dist = np.ones_like(self.dsi, dtype='float64')
            self.dist[:, 0] = 0
        logging.debug(
            f"Using the initialization network to find a {self.k}-NN "
            f"graph with maximum connectivity of {self.maxl}")
        self.dist, self.dsi, self.l = knn_balance(self.dsi,
                                                  self.dist,
                                                  maxl=self.maxl,
                                                  k=self.k)
        return self.dist, self.dsi, self.l

    def kneighbors_graph(self,
                         X: np.ndarray = None,
                         maxl: int = None,
                         mode: str = "distance") -> sparse.csr_matrix:
        """Retrun the K-neighbors graph as a sparse csr matrix
            Parameters
            ----------
            X : array-like, shape (n_query, n_features),
                The query point or points.
                If not provided, neighbors of each indexed point are returned.
                In this case, the query point is not considered its own neighbor.
            maxl: int
                max degree of connectivity allowed
            mode : "distance" or "connectivity"
                Decides the kind of output
            Returns
            -------
            neighbor_graph : scipy.sparse.csr_matrix
                The values are either distances or connectivity depending on the mode parameter
            NOTE: The diagonal will be zero even though the value 0 is actually stored
        """
        dist_new, dsi_new, _ = self.kneighbors(X=X, maxl=maxl, mode=mode)
        logging.debug("Returning sparse matrix")
        self.bknn = sparse.csr_matrix(
            (np.ravel(dist_new), np.ravel(dsi_new),
             np.arange(0, dist_new.shape[0] * dist_new.shape[1] + 1,
                       dist_new.shape[1])), (self.n_samples, self.n_samples))
        self.bknn.eliminate_zeros()
        return self.bknn

    def mnn_graph(self):
        """get mutual nearest neighbor graph from bknn"""
        if self.mknn is None:
            if self.bknn is None:
                raise ValueError(
                    'must fit() before generating kneighbors graphs')
            # element-wise minimum between bknn and bknn.T, so non-mutual value will be 0
            self.mknn = self.bknn.minimum(self.bknn.transpose())
        return self.mknn

    def rnn_graph(self):
        """get rnn from mknn, return a sparse binary matrix"""
        # Convert distances to similarities
        if self.mknn is None:
            self.mnn_graph()
        mknn_sim = self.mknn.copy()
        bknn_sim = self.bknn.copy()
        max_d = self.bknn.data.max()
        bknn_sim.data = (max_d - bknn_sim.data) / max_d
        mknn_sim.data = (max_d - mknn_sim.data) / max_d
        mknn_sim = mknn_sim.tocoo()
        mknn_sim.setdiag(0)

        # Compute the effective resolution
        d = 1 - bknn_sim.data
        radius = np.percentile(d, 90)
        logging.info(f"  90th percentile radius: {radius:.02}")
        inside = mknn_sim.data > 1 - radius
        self.rnn = sparse.coo_matrix(
            (mknn_sim.data[inside],
             (mknn_sim.row[inside], mknn_sim.col[inside])),
            shape=mknn_sim.shape)
        return self.rnn
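
# Usage sketch (illustrative, not part of the source above): build a balanced KNN graph on a toy
# matrix, then derive the mutual- and radius-NN graphs; all names come from the class above.
#
#   import numpy as np
#   X = np.random.rand(500, 20)
#   nn = NearestNeighbors(k=15, sight_k=30, maxl=60)
#   nn.fit(X)               # also builds nn.bknn via kneighbors_graph(mode='distance')
#   bknn = nn.bknn          # balanced KNN graph (sparse csr of distances)
#   mknn = nn.mnn_graph()   # mutual nearest neighbour graph
#   rnn = nn.rnn_graph()    # radius NN graph derived from mknn similarities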
Example #21
0
def fate_bias(
    adata,
    group,
    basis="umap",
    inds=None,
    speed_percentile=5,
    dist_threshold=None,
    source_groups=None,
    metric="euclidean",
    metric_kwds=None,
    cores=1,
    seed=19491001,
    **kwargs,
):
    """Calculate the lineage (fate) bias of states whose trajectory are predicted.

    Fate bias is currently calculated as the percentage of points along the predicted cell fate trajectory whose distance
    to their 0-th nearest neighbors on the data is close enough (determined by the median distance to the 1-st nearest
    neighbor of all observed cells and the dist_threshold) to any cell from each group specified by the `group` key. The
    details are described as follows:

    Cell fates predicted by our vector field method sometimes end up in regions that are not sampled with cells. We thus
    developed a heuristic method to iteratively walk backward along the integration path to assign cell fate. We first
    identify the regions with small velocity in the tail of the integration path (determined by `speed_percentile`), then
    we check whether the distances from those points to their 0-th nearest points on the observed data are far away from
    the observed data (determined by `dist_threshold`). If they are not all close to the data, we walk backwards along the
    trajectory by one time step at a time until the distance of every currently visited integration point's 0-th nearest
    point to the observed cells is close enough. In order to calculate the cell fate probability, we diffuse one step
    further from the identified nearest neighbors of the integration points to identify more nearby observed cells,
    especially those from terminal cell types, in case the nearby cells first identified are all close to some random
    progenitor cells. Then we use the group information of those observed cells to define the fate probability.

    `fate_bias` calculates a confidence score for the calculated fate probability with a simple metric, defined as
        :math:`1 - (sum(distances > dist_threshold * median_dist) + walk_back_steps) / (len(indices) + walk_back_steps)`

    The `distances` are the distances from the currently visited integration points' 0-th nearest points to the observed
    cells. `median_dist` is the median distance to the 1-st nearest cell over all observed cells. `walk_back_steps` is the
    number of steps walked backward along the integration path until all currently visited integration points' 0-th nearest
    points to the observed cells satisfy the distance threshold. `indices` are the time indices of integration points that
    are regarded as the regions with `small velocity` (note that when walking backward, those corresponding points do not
    necessarily have small velocity anymore).

    Arguments
    ---------
        adata: :class:`~anndata.AnnData`
            AnnData object that contains the predicted fate trajectories in the `uns` attribute.
        group: `str`
            The column key that corresponds to the cell type or other group information for quantifying the bias of cell
            state.
        basis: `str` or None (default: `umap`)
            The embedding data space where cell fates were predicted and cell fate bias will be quantified.
        inds: `list` or `float` or None (default: `None`)
            The indices of the time steps that will be used for calculating fate bias. If inds is None, the last few
            steps of the fate prediction, determined by `speed_percentile`, will be used. If inds is a float (between
            0 and 1), it will be regarded as a percentage, and the last percentage of steps will be used for fate bias
            calculation. Otherwise inds needs to be a list of integers of the time steps.
        speed_percentile: `float` (default: `5`)
            The percentile of speed that will be used to determine the terminal cells (or sink region on the prediction
            path where speed is smaller than this speed percentile).
        dist_threshold: `float` or `None` (default: `None`)
            A multiplier of the median nearest cell distance on the embedding to determine cells that are outside the
            sampled domain of cells. If the mean distance of identified "terminal cells" is above this number, we will
            look backward along the trajectory (by minimize all indices by 1) until it finds cells satisfy this threshold.
            By default it is set to be 1 to ensure only considering points that are very close to observed data points.
        source_groups: `list` or `None` (default: `None`)
            The groups that correspond to progenitor groups. They have to have at least one intersection with the groups
            from the `group` column. If source_groups is not `None`, any identified "source_groups" cells that happen to
            be in those groups will be ignored, and the probability of cell fate of those cells will be reassigned to the
            group that has the highest fate probability among the other non source_groups cells.
        metric: `str` or callable, default='euclidean'
            The distance metric to use for the tree. The default metric is 'euclidean'. See the documentation of
            :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a
            distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero"
            elements may be considered neighbors.
        metric_kwds : dict, default=None
            Additional keyword arguments for the metric function.
        cores: `int` (default: 1)
            The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors.
        seed: `int` (default `19491001`)
            Random seed to ensure the reproducibility of each run.
        kwargs:
            Additional arguments that will be passed to each nearest neighbor search algorithm.

    Returns
    -------
        fate_bias: `pandas.DataFrame`
            A DataFrame that stores the fate bias for each cell state (row) to each cell group (column).
    """

    if dist_threshold is None:
        dist_threshold = 1

    if group not in adata.obs.keys():
        raise ValueError(
            f"The group {group} you provided is not a key of .obs attribute.")
    else:
        clusters = adata.obs[group]

    basis_key = "X_" + basis if basis is not None else "X"
    fate_key = "fate_" + basis if basis is not None else "fate"

    if basis_key not in adata.obsm.keys():
        raise ValueError(
            f"The basis {basis_key} you provided is not a key of .obsm attribute."
        )
    if fate_key not in adata.uns.keys():
        raise ValueError(
            f"The {fate_key} key is not existed in the .uns attribute of the adata object. You need to run"
            f"dyn.pd.fate(adata, basis='{basis}') before calculate fate bias.")

    if source_groups is not None:
        if type(source_groups) is str:
            source_groups = [source_groups]
        source_groups = list(set(source_groups).intersection(clusters))
        if len(source_groups) == 0:
            raise ValueError(
                f"the {source_groups} you provided doesn't intersect with any groups in the {group} column."
            )

    X = adata.obsm[basis_key] if basis_key != "X" else adata.X

    if X.shape[0] > 5000 and X.shape[1] > 2:
        from pynndescent import NNDescent

        nbrs = NNDescent(X,
                         metric=metric,
                         metric_kwds=metric_kwds,
                         n_neighbors=30,
                         n_jobs=cores,
                         random_state=seed,
                         **kwargs)
        knn, distances = nbrs.query(X, k=30)
    else:
        alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
        nbrs = NearestNeighbors(n_neighbors=30, algorithm=alg,
                                n_jobs=cores).fit(X)
        distances, knn = nbrs.kneighbors(X)

    median_dist = np.median(distances[:, 1])

    pred_dict = {}
    cell_predictions, cell_indx = adata.uns[fate_key]["prediction"], adata.uns[
        fate_key]["init_cells"]
    t = adata.uns[fate_key]["t"]
    confidence = np.zeros(len(t))

    for i, prediction in tqdm(enumerate(cell_predictions),
                              desc="calculating fate distributions"):
        cur_t, n_steps = t[i], len(t[i])

        # ensure to identify sink where the speed is very slow if inds is not provided.
        # if inds is the percentage, use the last percentage of steps to check for cell fate bias.
        # otherwise inds need to be a list.
        if inds is None:
            avg_speed = np.array(
                [np.linalg.norm(i)
                 for i in np.diff(prediction, 1).T]) / np.diff(cur_t)
            sink_checker = np.where(
                avg_speed[::-1] > np.percentile(avg_speed, speed_percentile)
            )[0]
            indices = np.arange(n_steps - max(min(sink_checker), 10), n_steps)
        elif isinstance(inds, float):
            indices = np.arange(int(n_steps - inds * n_steps), n_steps)
        else:
            indices = inds

        if hasattr(nbrs, "query"):
            knn, distances = nbrs.query(prediction[:, indices].T, k=30)
        else:
            distances, knn = nbrs.kneighbors(prediction[:, indices].T)

        # if final steps are too far away from observed cells, ignore them
        walk_back_steps = 0
        while True:
            is_dist_smaller_than_threshold = distances.flatten(
            ) < dist_threshold * median_dist
            if any(is_dist_smaller_than_threshold):

                # let us diffuse one step further to identify cells from terminal cell types in case
                # cells with indices are all close to some random progenitor cells.
                if hasattr(nbrs, "query"):
                    knn, _ = nbrs.query(X[knn.flatten(), :], k=30)
                else:
                    _, knn = nbrs.kneighbors(X[knn.flatten(), :])

                fate_prob = clusters[knn.flatten()].value_counts() / len(
                    knn.flatten())
                if source_groups is not None:
                    source_p = fate_prob[source_groups].sum()
                    if 1 > source_p > 0:
                        fate_prob[source_groups] = 0
                        fate_prob[fate_prob.idxmax()] += source_p

                pred_dict[i] = fate_prob

                confidence[i] = 1 - (
                    sum(~is_dist_smaller_than_threshold) + walk_back_steps) / (
                        len(is_dist_smaller_than_threshold) + walk_back_steps)

                break
            else:
                walk_back_steps += 1

                if any(indices - 1 < 0):
                    pred_dict[i] = clusters[
                        knn.flatten()].value_counts() * np.nan
                    break

                if hasattr(nbrs, "query"):
                    knn, distances = nbrs.query(prediction[:, indices - 1].T,
                                                k=30)
                else:
                    distances, knn = nbrs.kneighbors(prediction[:,
                                                                indices - 1].T)

                knn, distances = knn[:, 0], distances[:, 0]
                indices = indices - 1

    bias = pd.DataFrame(pred_dict).T
    conf = pd.DataFrame({"confidence": confidence}, index=bias.index)
    bias = pd.merge(conf, bias, left_index=True, right_index=True)

    if cell_indx is not None:
        bias.index = cell_indx

    return bias
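
# Usage sketch (illustrative, not part of the source above). After running fate prediction on the
# umap basis (e.g. dyn.pd.fate(adata, basis='umap'), as referenced in the error message above),
# fate bias per group can be tabulated; the `group` column name "cell_type" is an assumption:
#
#   bias = fate_bias(adata, group="cell_type", basis="umap")
#   bias.head()   # rows: predicted trajectories; columns: confidence plus per-group fate probabilities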
Example #22
0
def graphize_vecfld(
    func,
    X,
    nbrs_idx=None,
    dist=None,
    k=30,
    distance_free=True,
    n_int_steps=20,
    cores=1,
):
    n, d = X.shape

    nbrs = None
    if nbrs_idx is None:
        if X.shape[0] > 200000 and X.shape[1] > 2:
            from pynndescent import NNDescent

            nbrs = NNDescent(
                X,
                metric="euclidean",
                n_neighbors=k + 1,
                n_jobs=-1,
                random_state=19491001,
            )
            nbrs_idx, dist = nbrs.query(X, k=k + 1)
        else:
            alg = "ball_tree" if X.shape[1] > 10 else "kd_tree"
            nbrs = NearestNeighbors(n_neighbors=k + 1,
                                    algorithm=alg,
                                    n_jobs=-1).fit(X)
            dist, nbrs_idx = nbrs.kneighbors(X)

    if dist is None and not distance_free:
        D = pdist(X)
    else:
        D = None

    V = sp.csr_matrix((n, n))
    if cores == 1:
        for i, idx in enumerate(
                LoggerManager.progress_logger(
                    nbrs_idx, progress_name="graphize_vecfld")):
            V += construct_v(X, i, idx, n_int_steps, func, distance_free, dist,
                             D, n)

    else:
        pool = ThreadPool(cores)
        res = pool.starmap(
            construct_v,
            zip(
                itertools.repeat(X),
                np.arange(len(nbrs_idx)),
                nbrs_idx,
                itertools.repeat(n_int_steps),
                itertools.repeat(func),
                itertools.repeat(distance_free),
                itertools.repeat(dist),
                itertools.repeat(D),
                itertools.repeat(n),
            ),
        )
        pool.close()
        pool.join()
        V = functools.reduce((lambda a, b: a + b), res)
    return V, nbrs
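
# Usage sketch (illustrative, not part of the source above): build the diffusion graph of a simple
# analytic vector field on random points. The linear field `lambda x: -x` is only for demonstration
# and `construct_v` is assumed to be importable alongside this function.
#
#   import numpy as np
#   X = np.random.rand(300, 2)
#   V_graph, nbrs = graphize_vecfld(lambda x: -x, X, k=10, cores=1)
#   V_graph   # sparse (n x n) graph representation of the reconstructed vector field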