Example #1
0
def neighbors(adata,
              n_neighbors=30,
              n_pcs=None,
              use_rep=None,
              knn=True,
              random_state=0,
              method='umap',
              metric='euclidean',
              metric_kwds=None,
              num_threads=-1,
              copy=False):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and yield the same result as scanpy [Wolf18]_.
    Connectivities are computed with adaptive kernel width as proposed in Haghverdi et al. 2016 (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space is used of a pre-computed PCA,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen automatically:
        for .n_vars < 50 (or `n_pcs == 0`), .X is used, otherwise 'X_pca' is used.
        A key may be given without its 'X_' prefix if only the prefixed key exists in `.obsm`.
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
        Only forwarded to the default (non-'sklearn', non-'hnsw') code path.
    random_state
        A numpy random seed.
    method : {'umap', 'hnsw', 'sklearn'}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel width as proposed in Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric (`None` is treated as an empty dict).
    num_threads
        Number of parallel jobs; passed as `n_jobs` to sklearn's
        `NearestNeighbors` and as `num_threads` to `FastNeighbors`.
        Not used by the default code path.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:
    connectivities : sparse matrix (`.obsp['connectivities']`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : sparse matrix (`.obsp['distances']`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    If writing to `.obsp` fails, both matrices are stored in
    `.uns['neighbors']` instead. `.uns['neighbors']` additionally receives
    'params' and, when available, the knn 'indices'.
    """
    adata = adata.copy() if copy else adata
    # A fresh dict per call avoids sharing one mutable default between calls.
    metric_kwds = {} if metric_kwds is None else metric_kwds

    if use_rep is None:
        use_rep = 'X' if adata.n_vars < 50 or n_pcs == 0 else 'X_pca'
        n_pcs = None if use_rep == 'X' else n_pcs
    elif (use_rep not in adata.obsm.keys()
          and 'X_' + use_rep in adata.obsm.keys()):
        # allow e.g. use_rep='pca' as shorthand for 'X_pca'
        use_rep = 'X_' + use_rep

    if use_rep == 'X_pca':
        pca_missing = 'X_pca' not in adata.obsm.keys()
        if pca_missing or (n_pcs is not None
                           and n_pcs > adata.obsm['X_pca'].shape[1]):
            # (re)compute PCA when absent or when it holds too few components
            pca(adata,
                n_comps=min(30 if n_pcs is None else n_pcs, adata.n_vars - 1),
                svd_solver='arpack')
        elif n_pcs is None and adata.obsm['X_pca'].shape[1] < 10:
            logg.warn('Neighbors are computed on ',
                      adata.obsm['X_pca'].shape[1],
                      ' principal components only.')

        n_duplicate_cells = len(get_duplicate_cells(adata))
        if n_duplicate_cells > 0:
            logg.warn(
                'You seem to have {} duplicate cells in your data.'.format(
                    n_duplicate_cells),
                'Consider removing these via pp.remove_duplicate_cells.')

    logg.info('computing neighbors', r=True)

    if method == 'sklearn':
        from sklearn.neighbors import NearestNeighbors
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        # One neighbor fewer is queried here; set_diagonal post-processes the
        # result before connectivities are computed for `n_neighbors`.
        neighs = NearestNeighbors(n_neighbors=n_neighbors - 1,
                                  metric=metric,
                                  metric_params=metric_kwds,
                                  n_jobs=num_threads)
        neighs.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighs.knn_indices = neighs.kneighbors()
        knn_distances, neighs.knn_indices = set_diagonal(
            knn_distances, neighs.knn_indices)
        neighs.distances, neighs.connectivities = \
            compute_connectivities_umap(neighs.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors)

    elif method == 'hnsw':
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighs = FastNeighbors(n_neighbors=n_neighbors,
                               num_threads=num_threads)
        neighs.fit(X if n_pcs is None else X[:, :n_pcs],
                   metric=metric,
                   random_state=random_state,
                   **metric_kwds)

    else:
        logg.switch_verbosity('off', module='scanpy')
        try:
            # ignore numba warning (reported in umap/issues/252)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                neighs = Neighbors(adata)
                neighs.compute_neighbors(
                    n_neighbors=n_neighbors,
                    knn=knn,
                    n_pcs=n_pcs,
                    method=method,
                    use_rep=None if use_rep == 'X_pca' else use_rep,
                    random_state=random_state,
                    metric=metric,
                    metric_kwds=metric_kwds,
                    write_knn_indices=True)
        finally:
            # restore verbosity even if the neighbor computation raises
            logg.switch_verbosity('on', module='scanpy')

    adata.uns['neighbors'] = {}
    try:
        adata.obsp['distances'] = neighs.distances
        adata.obsp['connectivities'] = neighs.connectivities
        adata.uns['neighbors']['connectivities_key'] = 'connectivities'
        adata.uns['neighbors']['distances_key'] = 'distances'
    except Exception:
        # Fallback storage location; presumably for anndata versions
        # without `.obsp` -- TODO confirm.
        adata.uns['neighbors']['distances'] = neighs.distances
        adata.uns['neighbors']['connectivities'] = neighs.connectivities

    if hasattr(neighs, 'knn_indices'):
        adata.uns['neighbors']['indices'] = neighs.knn_indices
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method,
        'metric': metric,
        'n_pcs': n_pcs
    }

    logg.info('    finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added \n'
        '    \'distances\' and \'connectivities\', weighted adjacency matrices (adata.obsp)'
    )

    return adata if copy else None
Example #2
0
def neighbors(adata,
              n_neighbors=30,
              n_pcs=30,
              use_rep=None,
              knn=True,
              random_state=0,
              method='umap',
              metric='euclidean',
              metric_kwds=None,
              copy=False):
    """
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to [Coifman05]_, in the adaption of
    [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: 30)
        Use this many PCs. PCA is (re)computed when `.obsm['X_pca']` is
        missing or holds fewer than `n_pcs` components.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. With `method='sklearn'` and
        `use_rep=None`, `.obsm['X_pca']` is used; otherwise the choice is
        delegated to `Neighbors.compute_neighbors`.
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {'umap', 'gauss', 'sklearn', `None`}  (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
        With 'sklearn', sklearn's `NearestNeighbors` with default metric is
        used and `knn`, `metric`, `metric_kwds`, `random_state` are ignored.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric (`None` is treated as an empty dict).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:
    connectivities : sparse matrix (`.uns['neighbors']`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : sparse matrix (`.uns['neighbors']`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    """
    logg.info('computing neighbors', r=True)
    adata = adata.copy() if copy else adata
    if adata.isview:
        adata._init_as_actual(adata.copy())
    # A fresh dict per call avoids sharing one mutable default between calls.
    metric_kwds = {} if metric_kwds is None else metric_kwds

    # Strings are compared by value: identity (`is`) only works by accident
    # of interning. `n_pcs=None` no longer breaks the size comparison.
    wants_pca = use_rep is None or use_rep == 'X_pca'
    if wants_pca and (
            'X_pca' not in adata.obsm.keys()
            or (n_pcs is not None and n_pcs > adata.obsm['X_pca'].shape[1])):
        pca(adata, n_comps=n_pcs, svd_solver='arpack')

    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method
    }

    if method == 'sklearn':
        from sklearn.neighbors import NearestNeighbors
        neighs = NearestNeighbors(n_neighbors=n_neighbors)
        neighs.fit(
            adata.obsm['X_pca'] if use_rep is None else adata.obsm[use_rep])
        adata.uns['neighbors']['distances'] = neighs.kneighbors_graph(
            mode='distance')
        adata.uns['neighbors']['connectivities'] = neighs.kneighbors_graph(
            mode='connectivity')

    else:
        neighs = Neighbors(adata)
        neighs.compute_neighbors(n_neighbors=n_neighbors,
                                 knn=knn,
                                 n_pcs=n_pcs,
                                 use_rep=use_rep,
                                 method=method,
                                 metric=metric,
                                 metric_kwds=metric_kwds,
                                 random_state=random_state,
                                 write_knn_indices=True)
        adata.uns['neighbors']['distances'] = neighs.distances
        adata.uns['neighbors']['connectivities'] = neighs.connectivities
        adata.uns['neighbors']['indices'] = neighs.knn_indices

    logg.info('    finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('added to `.uns[\'neighbors\']`\n'
              '    \'distances\', weighted adjacency matrix\n'
              '    \'connectivities\', weighted adjacency matrix')
    return adata if copy else None
Example #3
0
def neighbors(adata,
              n_neighbors=30,
              n_pcs=None,
              use_rep=None,
              knn=True,
              random_state=0,
              method='umap',
              metric='euclidean',
              metric_kwds=None,
              num_threads=-1,
              copy=False):
    """
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to [Coifman05]_, in the adaption of
    [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space is used of a pre-computed PCA,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen automatically:
        for .n_vars < 50 (or `n_pcs == 0`), .X is used, otherwise 'X_pca' is used.
        A key may be given without its 'X_' prefix if only the prefixed key exists in `.obsm`.
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
        Only forwarded to the default (non-'sklearn', non-'hnsw') code path.
    random_state
        A numpy random seed.
    method : {'umap', 'gauss', 'hnsw', 'sklearn', `None`}  (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric (`None` is treated as an empty dict).
    num_threads
        Number of parallel jobs; passed as `n_jobs` to sklearn's
        `NearestNeighbors` and as `num_threads` to `FastNeighbors`.
        Not used by the default code path.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:
    connectivities : sparse matrix (`.uns['neighbors']`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : sparse matrix (`.uns['neighbors']`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.
    """
    adata = adata.copy() if copy else adata
    if adata.isview:
        adata._init_as_actual(adata.copy())
    # A fresh dict per call avoids sharing one mutable default between calls.
    metric_kwds = {} if metric_kwds is None else metric_kwds

    # NOTE: strings and ints are compared by value throughout; identity
    # checks (`is 'X'`, `is 0`) only work by accident of interning.
    if use_rep is None:
        use_rep = 'X' if adata.n_vars < 50 or n_pcs == 0 else 'X_pca'
        n_pcs = None if use_rep == 'X' else n_pcs
    elif (use_rep not in adata.obsm.keys()
          and 'X_' + use_rep in adata.obsm.keys()):
        # allow e.g. use_rep='pca' as shorthand for 'X_pca'
        use_rep = 'X_' + use_rep

    if use_rep == 'X_pca':
        pca_missing = 'X_pca' not in adata.obsm.keys()
        if pca_missing or (n_pcs is not None
                           and n_pcs > adata.obsm['X_pca'].shape[1]):
            # (re)compute PCA when absent or when it holds too few components
            pca(adata,
                n_comps=30 if n_pcs is None else n_pcs,
                svd_solver='arpack')
        elif n_pcs is None and adata.obsm['X_pca'].shape[1] < 10:
            logg.warn('Neighbors are computed on ',
                      adata.obsm['X_pca'].shape[1],
                      ' principal components only.')

    logg.info('computing neighbors', r=True)

    if method == 'sklearn':
        from sklearn.neighbors import NearestNeighbors
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighs = NearestNeighbors(n_neighbors=n_neighbors,
                                  metric=metric,
                                  metric_params=metric_kwds,
                                  n_jobs=num_threads)
        neighs.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighs.knn_indices = neighs.kneighbors()
        # Use the requested neighborhood size rather than a hard-coded 30.
        neighs.distances, neighs.connectivities = \
            compute_connectivities_umap(neighs.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors)

    elif method == 'hnsw':
        X = adata.X if use_rep == 'X' else adata.obsm[use_rep]
        neighs = FastNeighbors(n_neighbors=n_neighbors,
                               num_threads=num_threads)
        neighs.fit(X if n_pcs is None else X[:, :n_pcs],
                   metric=metric,
                   random_state=random_state,
                   **metric_kwds)

    else:
        logg.switch_verbosity('off', module='scanpy')
        try:
            # ignore numba warning (reported in umap/issues/252)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                neighs = Neighbors(adata)
                neighs.compute_neighbors(n_neighbors=n_neighbors,
                                         knn=knn,
                                         n_pcs=n_pcs,
                                         use_rep=use_rep,
                                         method=method,
                                         metric=metric,
                                         metric_kwds=metric_kwds,
                                         random_state=random_state,
                                         write_knn_indices=True)
        finally:
            # restore verbosity even if the neighbor computation raises
            logg.switch_verbosity('on', module='scanpy')

    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method
    }

    adata.uns['neighbors']['distances'] = neighs.distances
    adata.uns['neighbors']['connectivities'] = neighs.connectivities
    if hasattr(neighs, 'knn_indices'):
        adata.uns['neighbors']['indices'] = neighs.knn_indices

    logg.info('    finished',
              time=True,
              end=' ' if settings.verbosity > 2 else '\n')
    logg.hint('added to `.uns[\'neighbors\']`\n'
              '    \'distances\', weighted adjacency matrix\n'
              '    \'connectivities\', weighted adjacency matrix')
    return adata if copy else None
Example #4
0
    def __init__(
        self,
        adata,
        vkey="velocity",
        xkey="Ms",
        tkey=None,
        basis=None,
        n_neighbors=None,
        sqrt_transform=None,
        n_recurse_neighbors=None,
        random_neighbors_at_max=None,
        gene_subset=None,
        approx=None,
        report=False,
        compute_uncertainties=None,
        mode_neighbors="distances",
    ):
        """Collect expression, velocities and neighbor indices for the graph.

        Parameters
        ----------
        adata
            Annotated data object; layers, `.var`, `.obs`, `.obsm` and `.uns`
            are read, and `neighbors(adata)` is invoked if `.uns` holds no
            'neighbors' entry.
        vkey
            Layer holding the velocities; also the prefix of the
            `{vkey}_genes`, `{vkey}_params`, `{vkey}_graph` and
            `{vkey}_graph_neg` lookups.
        xkey
            Layer holding the expression values; 'spliced' is used when the
            key is not among the layers.
        tkey
            Optional `.obs` column; if present, stored (re-labelled) as
            `self.t0` and, shifted by one, as `self.t1`.
        basis
            `.obsm` key (with or without 'X_' prefix) used when neighbors
            have to be recomputed for a larger `n_neighbors`.
        n_neighbors
            Number of neighbors to use; `None` reuses the stored graph.
        sqrt_transform
            Whether to apply a signed square-root to the velocities. If
            `None`, it is enabled when `.uns[f'{vkey}_params']['mode']` is
            'stochastic'.
        n_recurse_neighbors
            If `None`, set to 1 when `n_neighbors` is given or
            `mode_neighbors == 'connectivities'`, otherwise to 2.
        random_neighbors_at_max
            Stored as `self.max_neighs`.
        gene_subset
            Optional selection of genes, intersected with all variables.
        approx
            `True` projects onto 30 PCs when more than 100 genes remain; a
            string naming an `.obsm` key selects that representation for an
            sklearn neighbor search.
        report, compute_uncertainties
            Stored on the instance unchanged.
        mode_neighbors
            Forwarded to `get_indices`.
        """
        # --- gene selection -------------------------------------------------
        gene_mask = np.ones(adata.n_vars, bool)
        velocity_genes_key = f"{vkey}_genes"
        if gene_subset is not None:
            in_subset = adata.var_names.isin(gene_subset)
            if len(in_subset) > 0:
                gene_mask &= in_subset
            else:
                gene_mask &= gene_subset
        elif velocity_genes_key in adata.var.keys():
            gene_mask &= np.array(adata.var[velocity_genes_key].values,
                                  dtype=bool)

        if xkey not in adata.layers.keys():
            xkey = "spliced"

        # --- dense expression / velocity matrices ---------------------------
        x_layer = adata.layers[xkey]
        if issparse(x_layer):
            X = np.array(x_layer.A[:, gene_mask])
        else:
            X = np.array(x_layer[:, gene_mask])

        v_layer = adata.layers[vkey]
        if issparse(v_layer):
            V = np.array(v_layer.A[:, gene_mask])
        else:
            V = np.array(v_layer[:, gene_mask])

        # drop genes whose velocity column contains NaN
        nan_columns = np.isnan(np.sum(V, axis=0))
        if np.any(nan_columns):
            valid = ~nan_columns
            X, V = X[:, valid], V[:, valid]

        if not (approx is True and X.shape[1] > 100):
            self.X = np.array(X, dtype=np.float32)
            self.V = np.array(V, dtype=np.float32)
        else:
            X_pca, PCs, _, _ = pca(X,
                                   n_comps=30,
                                   svd_solver="arpack",
                                   return_info=True)
            self.X = np.array(X_pca, dtype=np.float32)
            self.V = (V - V.mean(0)).dot(PCs.T)
            # rows whose velocities sum to zero stay at zero after projection
            self.V[V.sum(1) == 0] = 0
        self.V_raw = np.array(self.V)

        # --- optional variance-stabilizing transform ------------------------
        params_key = f"{vkey}_params"
        self.sqrt_transform = sqrt_transform
        if (self.sqrt_transform is None
                and params_key in adata.uns.keys()
                and "mode" in adata.uns[params_key]):
            mode = adata.uns[params_key]["mode"]
            self.sqrt_transform = mode == "stochastic"
        if self.sqrt_transform:
            self.V = np.sqrt(np.abs(self.V)) * np.sign(self.V)
        self.V -= np.nanmean(self.V, axis=1)[:, None]

        # --- recursion depth ------------------------------------------------
        self.n_recurse_neighbors = n_recurse_neighbors
        if self.n_recurse_neighbors is None:
            single_hop = (n_neighbors is not None
                          or mode_neighbors == "connectivities")
            self.n_recurse_neighbors = 1 if single_hop else 2

        # --- neighbor indices -----------------------------------------------
        if "neighbors" not in adata.uns.keys():
            neighbors(adata)
        neighbors_per_cell = (get_neighs(adata, "distances") > 0).sum(1).A1
        if np.min(neighbors_per_cell) == 0:
            raise ValueError("Your neighbor graph seems to be corrupted. "
                             "Consider recomputing via pp.neighbors.")

        if n_neighbors is None or n_neighbors <= get_n_neighs(adata):
            # the stored graph is large enough
            stored_distances = get_neighs(adata, "distances")
            self.indices = get_indices(
                dist=stored_distances,
                n_neighbors=n_neighbors,
                mode_neighbors=mode_neighbors,
            )[0]
        else:
            if basis is None:
                present = [
                    key for key in ("X_pca", "X_tsne", "X_umap")
                    if key in adata.obsm.keys()
                ]
                basis = present[-1]
            elif f"X_{basis}" in adata.obsm.keys():
                basis = f"X_{basis}"

            if isinstance(approx, str) and approx in adata.obsm.keys():
                from sklearn.neighbors import NearestNeighbors

                n_columns = n_neighbors + 1
                knn_search = NearestNeighbors(n_neighbors=n_columns)
                knn_search.fit(adata.obsm[approx])
                knn_graph = knn_search.kneighbors_graph(mode="connectivity")
                self.indices = knn_graph.indices.reshape((-1, n_columns))
            else:
                from scvelo import Neighbors

                recomputed = Neighbors(adata)
                recomputed.compute_neighbors(n_neighbors=n_neighbors,
                                             use_rep=basis,
                                             n_pcs=10)
                self.indices = get_indices(dist=recomputed.distances,
                                           mode_neighbors=mode_neighbors)[0]

        self.max_neighs = random_neighbors_at_max

        # --- previously computed graphs -------------------------------------
        graph_key = f"{vkey}_graph"
        graph_neg_key = f"{vkey}_graph_neg"
        if graph_key in adata.uns.keys():
            self.graph = adata.uns[graph_key]
        else:
            self.graph = []
        if graph_neg_key in adata.uns.keys():
            self.graph_neg = adata.uns[graph_neg_key]
        else:
            self.graph_neg = []

        # --- optional time labels -------------------------------------------
        if tkey not in adata.obs.keys():
            self.t0 = None
        else:
            self.t0 = adata.obs[tkey].copy()
            earliest = min(self.t0)
            init = earliest if isinstance(earliest, int) else 0
            self.t0.cat.categories = np.arange(init,
                                               len(self.t0.cat.categories))
            self.t1 = self.t0.copy()
            self.t1.cat.categories = self.t0.cat.categories + 1

        self.compute_uncertainties = compute_uncertainties
        self.uncertainties = None
        self.self_prob = None
        self.report = report
        self.adata = adata
Example #5
0
    def __init__(self, adata, vkey='velocity', xkey='Ms', tkey=None, basis=None, n_neighbors=None, sqrt_transform=None,
                 n_recurse_neighbors=None, random_neighbors_at_max=None, gene_subset=None, approx=None, report=False,
                 mode_neighbors='distances'):
        """Collect expression, velocities and neighbor indices for the graph.

        Parameters
        ----------
        adata
            Annotated data object; layers, `.var`, `.obs`, `.obsm` and `.uns`
            are read, and `neighbors(adata)` is invoked if `.uns` holds no
            'neighbors' entry.
        vkey
            Layer holding the velocities; also the prefix of the
            `vkey + '_genes'`, `'_settings'`, `'_graph'` and `'_graph_neg'`
            lookups.
        xkey
            Layer holding the expression values; 'spliced' is used when the
            key is not among the layers.
        tkey
            Optional `.obs` column; if present, stored (re-labelled) as
            `self.t0` and, shifted by one, as `self.t1`.
        basis
            `.obsm` key (with or without 'X_' prefix) used when neighbors
            have to be recomputed for a larger `n_neighbors`.
        n_neighbors
            Number of neighbors to use; `None` reuses the stored graph.
        sqrt_transform
            Whether to apply a signed square-root to the velocities. If
            `None`, it is enabled when `.uns[vkey + '_settings']['mode']` is
            'stochastic'.
        n_recurse_neighbors
            Only honored when `n_neighbors` is `None` (default then: 2);
            with an explicit `n_neighbors` the value is always 1.
        random_neighbors_at_max
            Stored as `self.max_neighs`.
        gene_subset
            Optional selection of genes, intersected with all variables.
        approx
            `True` projects onto 30 PCs when more than 100 genes remain; a
            string naming an `.obsm` key selects that representation for an
            sklearn neighbor search.
        report
            Stored as `self.report`.
        mode_neighbors
            Forwarded to `get_indices`.
        """
        subset = np.ones(adata.n_vars, bool)
        if gene_subset is not None:
            var_names_subset = adata.var_names.isin(gene_subset)
            subset &= var_names_subset if len(var_names_subset) > 0 else gene_subset
        elif vkey + '_genes' in adata.var.keys():
            subset &= np.array(adata.var[vkey + '_genes'].values, dtype=bool)

        xkey = xkey if xkey in adata.layers.keys() else 'spliced'

        X = np.array(adata.layers[xkey].A[:, subset] if issparse(adata.layers[xkey]) else adata.layers[xkey][:, subset])
        V = np.array(adata.layers[vkey].A[:, subset] if issparse(adata.layers[vkey]) else adata.layers[vkey][:, subset])

        # drop genes whose velocity column contains NaN
        nans = np.isnan(np.sum(V, axis=0))
        if np.any(nans):
            X = X[:, ~nans]
            V = V[:, ~nans]

        if approx is True and X.shape[1] > 100:
            X_pca, PCs, _, _ = pca(X, n_comps=30, svd_solver='arpack', return_info=True)
            self.X = np.array(X_pca, dtype=np.float32)
            self.V = (V - V.mean(0)).dot(PCs.T)
            # rows whose velocities sum to zero stay at zero after projection
            self.V[V.sum(1) == 0] = 0
        else:
            self.X = np.array(X, dtype=np.float32)
            self.V = np.array(V, dtype=np.float32)

        self.sqrt_transform = sqrt_transform
        settings_key = vkey + '_settings'
        if self.sqrt_transform is None and settings_key in adata.uns.keys():
            # Settings without a 'mode' entry leave the transform disabled
            # instead of raising KeyError.
            if 'mode' in adata.uns[settings_key]:
                self.sqrt_transform = adata.uns[settings_key]['mode'] == 'stochastic'
        if self.sqrt_transform:
            self.V = np.sqrt(np.abs(self.V)) * np.sign(self.V)
        self.V -= np.nanmean(self.V, axis=1)[:, None]

        # NOTE: an explicit n_recurse_neighbors is ignored when n_neighbors is given.
        if n_neighbors is not None:
            self.n_recurse_neighbors = 1
        elif n_recurse_neighbors is None:
            self.n_recurse_neighbors = 2
        else:
            self.n_recurse_neighbors = n_recurse_neighbors

        if 'neighbors' not in adata.uns.keys():
            neighbors(adata)
        if np.min((get_neighs(adata, 'distances') > 0).sum(1).A1) == 0:
            raise ValueError('Your neighbor graph seems to be corrupted. Consider recomputing via pp.neighbors.')
        if n_neighbors is None or n_neighbors <= adata.uns['neighbors']['params']['n_neighbors']:
            # the stored graph is large enough
            self.indices = get_indices(dist=get_neighs(adata, 'distances'), n_neighbors=n_neighbors,
                                       mode_neighbors=mode_neighbors)[0]
        else:
            if basis is None:
                # last available embedding among the candidates
                basis = [key for key in ['X_pca', 'X_tsne', 'X_umap'] if key in adata.obsm.keys()][-1]
            elif 'X_' + basis in adata.obsm.keys():
                basis = 'X_' + basis

            if isinstance(approx, str) and approx in adata.obsm.keys():
                from sklearn.neighbors import NearestNeighbors
                neighs = NearestNeighbors(n_neighbors=n_neighbors + 1)
                neighs.fit(adata.obsm[approx])
                self.indices = neighs.kneighbors_graph(mode='connectivity').indices.reshape((-1, n_neighbors + 1))
            else:
                from .. import Neighbors
                neighs = Neighbors(adata)
                neighs.compute_neighbors(n_neighbors=n_neighbors, use_rep=basis, n_pcs=10)
                self.indices = get_indices(dist=neighs.distances, mode_neighbors=mode_neighbors)[0]

        self.max_neighs = random_neighbors_at_max

        self.graph = adata.uns[vkey + '_graph'] if vkey + '_graph' in adata.uns.keys() else []
        self.graph_neg = adata.uns[vkey + '_graph_neg'] if vkey + '_graph_neg' in adata.uns.keys() else []

        if tkey in adata.obs.keys():
            self.t0 = adata.obs[tkey].copy()
            first = min(self.t0)
            init = first if isinstance(first, int) else 0
            self.t0.cat.categories = np.arange(init, len(self.t0.cat.categories))
            self.t1 = self.t0.copy()
            self.t1.cat.categories = self.t0.cat.categories + 1
        else:
            self.t0 = None

        self.report = report
        self.self_prob = None
Example #6
0
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and
    yield the same result as scanpy [Wolf18]_. Connectivities are computed with
    adaptive kernel width as proposed in Haghverdi et al. 2016 (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space is used of a pre-computed PCA,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for .n_vars < 50, .X is used, otherwise 'X_pca' is used.
        A key given without its `X_` prefix is resolved to `X_<key>` when only
        the prefixed key exists in `.obsm`.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {'umap', 'hnsw', 'sklearn'}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric. `None` is treated as an empty dict.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns the modified copy of `adata` if `copy=True`, otherwise `None`
    after updating `adata` in place with the following fields.

    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """

    adata = adata.copy() if copy else adata

    # Resolve which representation the graph is built on.
    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        # (Re)compute the PCA when it is missing or has fewer components
        # than explicitly requested.
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            # The number of components is capped by the data dimensions.
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

        n_duplicate_cells = len(get_duplicate_cells(adata))
        if n_duplicate_cells > 0:
            logg.warn(
                f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
                "Consider removing these via pp.remove_duplicate_cells.",
            )

    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        # One neighbor fewer is queried because `set_diagonal` is applied to
        # the query result afterwards.
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )

    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )

    else:
        logg.switch_verbosity("off", module="scanpy")
        try:
            with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
                warnings.simplefilter("ignore")
                neighbors = Neighbors(adata)
                neighbors.compute_neighbors(
                    n_neighbors=n_neighbors,
                    knn=knn,
                    n_pcs=n_pcs,
                    method=method,
                    use_rep=use_rep,
                    random_state=random_state,
                    metric=metric,
                    metric_kwds=metric_kwds,
                    write_knn_indices=True,
                )
        finally:
            # Always re-enable scanpy logging, even when the computation
            # fails, so an error here does not leave logging silenced.
            logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        # Best-effort fallback: if writing to `.obsp` fails, store the
        # matrices in `.uns['neighbors']` instead.
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None
Example #7
0
    def __init__(self,
                 adata,
                 vkey='velocity',
                 xkey='Ms',
                 tkey=None,
                 basis=None,
                 n_neighbors=None,
                 sqrt_transform=False,
                 n_recurse_neighbors=None,
                 random_neighbors_at_max=None,
                 approx=False,
                 report=False):
        """Collect the matrices, neighbor indices and settings used by this object.

        Parameters
        ----------
        adata
            Annotated data object; `.var`, `.layers`, `.uns`, `.obsm` and `.obs`
            are read here.
        vkey
            Name of the layer used for `V`. Also the prefix of the optional
            `<vkey>_genes` column in `.var` and of the `<vkey>_graph` /
            `<vkey>_graph_neg` entries in `.uns`.
        xkey
            Name of the layer used for `X`; replaced by `'spliced'` when the
            given key is not a layer.
        tkey
            Key in `.obs`; when present, relabelled copies are stored as
            `self.t0` and `self.t1`, otherwise `self.t0` is `None`.
        basis
            Representation passed as `use_rep` when neighbors are recomputed.
            If `None`, the last available of `X_pca`, `X_tsne`, `X_umap` is used.
        n_neighbors
            If `None` or smaller than the stored `n_neighbors` parameter, indices
            are derived from the stored distances; otherwise neighbors are
            recomputed.
        sqrt_transform
            If true, `V` is replaced by `sqrt(|V|) * sign(V)`.
        n_recurse_neighbors
            Stored as `self.n_recurse_neighbors`; forced to 1 when `n_neighbors`
            is given, and defaults to 2 when both are `None`.
        random_neighbors_at_max
            Stored as `self.max_neighs`.
        approx
            `True` projects `X` and `V` onto 30 principal components when there
            are more than 100 columns. A string that is a key of `.obsm` selects
            that embedding for the sklearn neighbor search.
        report
            Stored as `self.report`.
        """

        # Column mask: the `<vkey>_genes` column of .var if present, else all columns.
        subset = np.array(adata.var[vkey + '_genes'].values, dtype=bool) \
            if vkey + '_genes' in adata.var.keys() else np.ones(adata.n_vars, bool)
        xkey = xkey if xkey in adata.layers.keys() else 'spliced'

        # Densify sparse layers (`.A`) before applying the column mask.
        X = adata.layers[xkey].A[:, subset] if issparse(
            adata.layers[xkey]) else adata.layers[xkey][:, subset]
        V = adata.layers[vkey].A[:, subset] if issparse(
            adata.layers[vkey]) else adata.layers[vkey][:, subset]

        if approx is True and X.shape[1] > 100:
            # Work in a 30-component PCA space: X is replaced by its PCA
            # coordinates and the column-centered V is projected onto the
            # same components.
            X_pca, PCs, _, _ = pca(X,
                                   n_comps=30,
                                   svd_solver='arpack',
                                   return_info=True)
            self.X = np.array(X_pca, dtype=np.float32)
            self.V = (V - V.mean(0)).dot(PCs.T)
            # Rows whose original V sums to zero are zeroed in the projection too.
            self.V[V.sum(1) == 0] = 0
        else:
            self.X = np.array(X, dtype=np.float32)
            self.V = np.array(V, dtype=np.float32)

        self.sqrt_transform = sqrt_transform
        # Signed square root of V.
        if sqrt_transform: self.V = np.sqrt(np.abs(self.V)) * np.sign(self.V)
        # Subtract each row's mean from that row.
        self.V -= self.V.mean(1)[:, None]

        self.n_recurse_neighbors = 1 if n_neighbors is not None \
            else 2 if n_recurse_neighbors is None else n_recurse_neighbors

        # Compute a neighbor graph with default settings if none is stored yet.
        if 'neighbors' not in adata.uns.keys(): neighbors(adata)
        if n_neighbors is None or n_neighbors < adata.uns['neighbors'][
                'params']['n_neighbors']:
            # The stored graph is large enough: take indices from its distances.
            # NOTE(review): presumably get_indices returns the index array as its
            # first element - confirm against its definition.
            # NOTE(review): this reads distances from `.uns['neighbors']`; confirm
            # that is where the stored graph keeps them.
            self.indices = get_indices(
                dist=adata.uns['neighbors']['distances'],
                n_neighbors=n_neighbors)[0]
        else:
            # More neighbors requested than stored: recompute them.
            if basis is None:
                # Take the last available key in the order X_pca, X_tsne, X_umap.
                # An IndexError is raised here if none of them is in .obsm.
                basis = [
                    key for key in ['X_pca', 'X_tsne', 'X_umap']
                    if key in adata.obsm.keys()
                ][-1]
            elif 'X_' + basis in adata.obsm.keys():
                basis = 'X_' + basis

            if isinstance(approx, str) and approx in adata.obsm.keys():
                from sklearn.neighbors import NearestNeighbors
                neighs = NearestNeighbors(n_neighbors=n_neighbors + 1)
                # Only the first two columns of the chosen embedding are used.
                neighs.fit(adata.obsm[approx][:, :2])
                # Reshape the flat index array to n_neighbors + 1 columns per row.
                self.indices = neighs.kneighbors_graph(
                    mode='connectivity').indices.reshape((-1, n_neighbors + 1))
            else:
                from .. import Neighbors
                neighs = Neighbors(adata)
                # n_pcs is fixed to 10 here regardless of the chosen basis.
                neighs.compute_neighbors(n_neighbors=n_neighbors,
                                         use_rep=basis,
                                         n_pcs=10)
                self.indices = get_indices(dist=neighs.distances)[0]

        self.max_neighs = random_neighbors_at_max

        # Reuse previously stored graphs when available, else start from empty lists.
        self.graph = adata.uns[
            vkey + '_graph'] if vkey + '_graph' in adata.uns.keys() else []
        self.graph_neg = adata.uns[
            vkey +
            '_graph_neg'] if vkey + '_graph_neg' in adata.uns.keys() else []

        if tkey in adata.obs.keys():
            self.t0 = adata.obs[tkey].copy()
            # Start the new labels at the minimum value when it is a Python int,
            # otherwise at 0.
            # NOTE(review): the `.cat` access assumes a categorical column, and
            # NumPy integer scalars may not satisfy isinstance(..., int) - confirm.
            init = min(self.t0) if isinstance(min(self.t0), int) else 0
            self.t0.cat.categories = np.arange(init,
                                               len(self.t0.cat.categories))
            # t1 carries the same categories as t0 shifted by one.
            self.t1 = self.t0.copy()
            self.t1.cat.categories = self.t0.cat.categories + 1
        else:
            self.t0 = None

        self.report = report