def regress_out(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    n_jobs: Optional[int] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Regress out (mostly) unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]. Note that this function tends to overcorrect
    in certain circumstances as described in :issue:`526`.

    Parameters
    ----------
    adata
        The annotated data matrix. A sparse `.X` is converted to a dense
        array, and a view is turned into an actual object before fitting.
    keys
        Keys for observation annotation on which to regress on. A single
        string is treated as a one-element list. If the first key names a
        categorical column of `.obs`, it has to be the only key; the per-gene
        mean within each category is then used as the regressor. If `keys`
        is empty, all columns of `.obs` are used.
    n_jobs
        Number of jobs for parallel computation.
        `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
    copy
        Determines whether a copy of `adata` is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.

    Raises
    ------
    ValueError
        If a categorical key is passed together with further keys.
    """
    start = logg.info(f'regressing out {keys}')
    if issparse(adata.X):
        logg.info(
            ' sparse input is densified and may '
            'lead to high memory use'
        )
    adata = adata.copy() if copy else adata

    sanitize_anndata(adata)

    # TODO: This should throw an implicit modification warning
    if adata.is_view:
        adata._init_as_actual(adata.copy())

    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # Evaluated before touching `keys[0]` so that an empty `keys` reaches the
    # "use all of .obs" branch below instead of failing with an IndexError.
    has_keys = len(keys) > 0

    # regress on a single categorical variable
    variable_is_categorical = False
    if (
        has_keys
        and keys[0] in adata.obs_keys()
        and is_categorical_dtype(adata.obs[keys[0]])
    ):
        if len(keys) > 1:
            raise ValueError(
                'If providing categorical variable, '
                'only a single one is allowed. For this one '
                'we regress on the mean for each category.'
            )
        logg.debug('... regressing on per-gene means within categories')
        # One regressor value per (observation, gene): the mean of that gene
        # over all observations sharing the observation's category.
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if has_keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    # split the adata.X matrix by columns in chunks of size n_chunk
    # (the last chunk could be of smaller size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        # per-gene regressors have to be split the same way as the data
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)

    tasks = []
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append((data_chunk, regres, variable_is_categorical))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing

        # The context manager shuts the pool down on every exit path, so the
        # worker processes are not leaked when a chunk raises.
        with multiprocessing.Pool(n_jobs) as pool:
            res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info(' finished', time=start)
    return adata if copy else None
def neighbors(
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: Optional[int] = None,
    use_rep: Optional[str] = None,
    knn: bool = True,
    random_state: AnyRandom = 0,
    method: Optional[_Method] = 'umap',
    metric: Union[_Metric, _MetricFn] = 'euclidean',
    metric_kwds: Mapping[str, Any] = MappingProxyType({}),
    key_added: Optional[str] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search relies heavily on UMAP [McInnes18]_, which also
    offers a way of estimating the connectivities of data points, that is,
    the connectivity of the manifold (`method=='umap'`). With
    `method=='gauss'`, connectivities are computed according to
    [Coifman05]_, in the adaption of [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        Size of the local neighborhood (number of neighboring data points)
        used for manifold approximation. Larger values give a more global
        view of the manifold, smaller values preserve more local structure.
        Values should generally lie in the range 2 to 100.
        With `knn=True` this is the number of nearest neighbors searched;
        with `knn=False` the Gaussian kernel width is set to the distance of
        the `n_neighbors` neighbor.
    {n_pcs}
    {use_rep}
    knn
        If `True`, use a hard threshold to restrict the number of neighbors
        to `n_neighbors`, that is, consider a knn graph. Otherwise, use a
        Gaussian Kernel to assign low weights to neighbors more distant than
        the `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following
        [Coifman05]_ with adaptive width [Haghverdi16]_) for computing
        connectivities. Use 'rapids' for the RAPIDS implementation of UMAP
        (experimental, GPU only).
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    key_added
        If not specified, the neighbors data is stored in .uns['neighbors'],
        distances and connectivities are stored in .obsp['distances'] and
        .obsp['connectivities'] respectively.
        If specified, the neighbors data is added to .uns[key_added],
        distances are stored in .obsp[key_added+'_distances'] and
        connectivities in .obsp[key_added+'_connectivities'].
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:

    See `key_added` parameter description for the storage path of
    connectivities and distances.

    **connectivities** : sparse matrix of dtype `float32`.
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    **distances** : sparse matrix of dtype `float32`.
        Instead of decaying weights, this stores distances for each pair of
        neighbors.

    Notes
    -----
    If `method='umap'`, it's highly recommended to install pynndescent
    ``pip install pynndescent``. Installing `pynndescent` can significantly
    increase performance, and in later versions it will become a hard
    dependency.
    """
    start = logg.info('computing neighbors')
    if copy:
        adata = adata.copy()
    if adata.is_view:  # we shouldn't need this here...
        adata._init_as_actual(adata.copy())

    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(
        n_neighbors=n_neighbors,
        knn=knn,
        n_pcs=n_pcs,
        use_rep=use_rep,
        method=method,
        metric=metric,
        metric_kwds=metric_kwds,
        random_state=random_state,
    )

    # Resolve where the results are stored in `.uns` and `.obsp`.
    if key_added is not None:
        conns_key = key_added + '_connectivities'
        dists_key = key_added + '_distances'
    else:
        key_added = 'neighbors'
        conns_key = 'connectivities'
        dists_key = 'distances'

    # Start from a fresh entry so nothing from an earlier run survives.
    adata.uns[key_added] = {}
    graph_info = adata.uns[key_added]
    graph_info['connectivities_key'] = conns_key
    graph_info['distances_key'] = dists_key

    # Always-present parameters first, optional ones only when they were set.
    params = {
        'n_neighbors': neighbors.n_neighbors,
        'method': method,
        'random_state': random_state,
        'metric': metric,
    }
    graph_info['params'] = params
    if metric_kwds:
        params['metric_kwds'] = metric_kwds
    if use_rep is not None:
        params['use_rep'] = use_rep
    if n_pcs is not None:
        params['n_pcs'] = n_pcs

    adata.obsp[dists_key] = neighbors.distances
    adata.obsp[conns_key] = neighbors.connectivities

    forest = neighbors.rp_forest
    if forest is not None:
        graph_info['rp_forest'] = forest

    summary = (
        f'added to `.uns[{key_added!r}]`\n'
        f' `.obsp[{dists_key!r}]`, distances for each pair of neighbors\n'
        f' `.obsp[{conns_key!r}]`, weighted adjacency matrix'
    )
    logg.info(' finished', time=start, deep=summary)

    if copy:
        return adata
    return None
def neighbors(
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: Optional[int] = None,
    use_rep: Optional[str] = None,
    knn: bool = True,
    random_state: Optional[Union[int, RandomState]] = 0,
    method: Optional[_Method] = 'umap',
    metric: Union[_Metric, _MetricFn] = 'euclidean',
    metric_kwds: Mapping[str, Any] = {},
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to [Coifman05]_, in the adaption of
    [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    {n_pcs}
    {use_rep}
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
        Use 'rapids' for the RAPIDS implementation of UMAP (experimental, GPU
        only).
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric. Only recorded in the stored parameters when
        non-empty.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:

    **connectivities** : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    **distances** : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.

    The parameters of the run are stored in `.uns['neighbors']['params']`.
    """
    start = logg.info('computing neighbors')
    adata = adata.copy() if copy else adata
    # `is_view` is the current AnnData attribute; `isview` is its deprecated
    # alias and is no longer used by the rest of this file.
    if adata.is_view:  # we shouldn't need this here...
        adata._init_as_actual(adata.copy())
    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(
        n_neighbors=n_neighbors,
        knn=knn,
        n_pcs=n_pcs,
        use_rep=use_rep,
        method=method,
        metric=metric,
        metric_kwds=metric_kwds,
        random_state=random_state,
    )
    # Replace any previous result wholesale before filling in the new one.
    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method,
    }
    adata.uns['neighbors']['params']['metric'] = metric
    # Optional settings are only recorded when the caller supplied them.
    if metric_kwds:
        adata.uns['neighbors']['params']['metric_kwds'] = metric_kwds
    if use_rep is not None:
        adata.uns['neighbors']['params']['use_rep'] = use_rep
    if n_pcs is not None:
        adata.uns['neighbors']['params']['n_pcs'] = n_pcs
    adata.uns['neighbors']['distances'] = neighbors.distances
    adata.uns['neighbors']['connectivities'] = neighbors.connectivities
    if neighbors.rp_forest is not None:
        adata.uns['neighbors']['rp_forest'] = neighbors.rp_forest
    logg.info(
        ' finished',
        time=start,
        deep=(
            'added to `.uns[\'neighbors\']`\n'
            ' \'distances\', distances for each pair of neighbors\n'
            ' \'connectivities\', weighted adjacency matrix'
        ),
    )
    return adata if copy else None
def neighbors(
    adata: AnnData,
    n_neighbors: int = 15,
    n_pcs: Optional[int] = None,
    use_rep: Optional[str] = None,
    knn: bool = True,
    random_state: Optional[Union[int, RandomState]] = 0,
    method: str = 'umap',
    metric: Union[str, Callable[[np.ndarray, np.ndarray], float]] = 'euclidean',
    metric_kwds: Mapping[str, Any] = {},
    copy: bool = False,
    directed_groups: Optional[Iterable[int]] = None
) -> Optional[AnnData]:
    """\
    Compute a neighborhood graph of observations [McInnes18]_.

    The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_,
    which also provides a method for estimating connectivities of data points -
    the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`,
    connectivities are computed according to [Coifman05]_, in the adaption of
    [Haghverdi16]_.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    {n_pcs}
    {use_rep}
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'gauss', `None`}}  (default: `'umap'`)
        Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_
        with adaptive width [Haghverdi16]_) for computing connectivities.
        Ignored when `directed_groups` is given, see below.
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    copy
        Return a copy instead of writing to adata.
    directed_groups
        If not `None`, `method` is overridden with `'directed'` and the value
        is forwarded unchanged to `Neighbors.compute_neighbors`. The meaning
        of the individual entries is defined there and cannot be told from
        this function.

    Returns
    -------
    Depending on `copy`, updates or returns `adata` with the following:

    connectivities : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : sparse matrix (`.uns['neighbors']`, dtype `float32`)
        Instead of decaying weights, this stores distances for each pair of
        neighbors.

    The values of `n_neighbors` and `method` used for the run are stored in
    `.uns['neighbors']['params']`.
    """
    logg.info('computing neighbors', r=True)
    adata = adata.copy() if copy else adata
    # `is_view` is the current AnnData attribute; `isview` is its deprecated
    # alias and is no longer used by the rest of this file.
    if adata.is_view:  # we shouldn't need this here...
        adata._init_as_actual(adata.copy())
    if directed_groups is not None:
        # Passing groups takes precedence over whatever `method` was given.
        method = 'directed'
        print('Directed groups were defined, setting method to \'directed\'')
    neighbors = Neighbors(adata)
    neighbors.compute_neighbors(
        n_neighbors=n_neighbors,
        knn=knn,
        n_pcs=n_pcs,
        use_rep=use_rep,
        method=method,
        metric=metric,
        metric_kwds=metric_kwds,
        random_state=random_state,
        directed_groups=directed_groups,
    )
    # Replace any previous result wholesale before filling in the new one.
    # `method` is stored after the override above, i.e. as actually used.
    adata.uns['neighbors'] = {}
    adata.uns['neighbors']['params'] = {
        'n_neighbors': n_neighbors,
        'method': method,
    }
    adata.uns['neighbors']['distances'] = neighbors.distances
    adata.uns['neighbors']['connectivities'] = neighbors.connectivities
    # Keep the hint on the same output line as "finished" at high verbosity.
    logg.info(
        ' finished',
        time=True,
        end=' ' if settings.verbosity > 2 else '\n',
    )
    # The 'distances' entry holds pairwise neighbor distances; it was
    # previously mislabelled as a weighted adjacency matrix.
    logg.hint(
        'added to `.uns[\'neighbors\']`\n'
        ' \'distances\', distances for each pair of neighbors\n'
        ' \'connectivities\', weighted adjacency matrix')
    return adata if copy else None