def test_indices_dtypes():
    adata = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        dict(obs_names=['A', 'B']),
        dict(var_names=['a', 'b', 'c']),
    )
    adata.obs_names = ['ö', 'a']
    assert adata.obs_names.tolist() == ['ö', 'a']
def test_set_obs():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))

    adata.obs = pd.DataFrame({'a': [3, 4]})
    assert adata.obs_names.tolist() == [0, 1]

    from pytest import raises
    with raises(ValueError):
        adata.obs = pd.DataFrame({'a': [3, 4, 5]})
        adata.obs = {'a': [1, 2]}
def test_append_col():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))

    adata.obs['new'] = [1, 2]
    # this worked in the initial AnnData, but not with a dataframe
    # adata.obs[['new2', 'new3']] = [['A', 'B'], ['c', 'd']]

    from pytest import raises
    with raises(ValueError):
        adata.obs['new4'] = 'far too long'.split()
def test_transpose():
    adata = AnnData(
        np.array([[1, 2, 3], [4, 5, 6]]),
        dict(obs_names=['A', 'B']),
        dict(var_names=['a', 'b', 'c']),
    )

    adata1 = adata.T

    # make sure to not modify the original!
    assert adata.obs_names.tolist() == ['A', 'B']
    assert adata.var_names.tolist() == ['a', 'b', 'c']

    assert adata1.obs_names.tolist() == ['a', 'b', 'c']
    assert adata1.var_names.tolist() == ['A', 'B']
    assert adata1.X.shape == adata.X.T.shape

    adata2 = adata.transpose()
    assert np.array_equal(adata1.X, adata2.X)
    assert np.array_equal(adata1.obs, adata2.obs)
    assert np.array_equal(adata1.var, adata2.var)
def test_rename_categories():
    X = np.ones((6, 3))
    obs = pd.DataFrame(
        {'cat_anno': pd.Categorical(['a', 'a', 'a', 'a', 'b', 'a'])})

    adata = AnnData(X=X, obs=obs)
    adata.uns['tool'] = {}
    adata.uns['tool']['cat_array'] = np.rec.fromarrays(
        [np.ones(2) for cat in adata.obs['cat_anno'].cat.categories],
        dtype=[(cat, 'float32') for cat in adata.obs['cat_anno'].cat.categories])
    adata.uns['tool']['params'] = {'groupby': 'cat_anno'}

    new_categories = ['c', 'd']
    adata.rename_categories('cat_anno', new_categories)

    assert list(adata.obs['cat_anno'].cat.categories) == new_categories
    assert list(adata.uns['tool']['cat_array'].dtype.names) == new_categories
def annotate_cells(
    adata: AnnData,
    dataset: str,
    cluster: Optional[str] = "louvain",
    n_top_genes: Optional[int] = 1000,
    max_cell_types: Optional[int] = 50,
    min_annotated: Optional[int] = 50,
    select: Optional[bool] = True,
) -> None:
    """
    Assign a cell type to each cell based on H3K27ac reference profiles.
    """
    # Determine relevant reference cell types.
    # All other cell types will not be used for motif activity and
    # cell type annotation.
    data = ScepiaDataset(dataset)
    gene_df = data.load_reference_data(reftype="gene")

    if select:
        cell_types = relevant_cell_types(
            adata,
            gene_df,
            cluster=cluster,
            n_top_genes=n_top_genes,
            max_cell_types=max_cell_types,
        )
    else:
        logger.info("Selecting all reference cell types.")
        cell_types = gene_df.columns

    if "scepia" not in adata.uns:
        adata.uns["scepia"] = {"version": __version__}

    adata.uns["scepia"]["cell_types"] = list(cell_types)

    logger.info("Annotating cells.")
    annotation_result, df_coef = annotate_with_k27(
        adata,
        gene_df[cell_types],
        cluster=cluster,
        use_neighbors=True,
        model="BayesianRidge",
        subsample=False,
        use_raw=False,
    )
    adata.obsm["X_cell_types"] = df_coef.T[adata.uns["scepia"]["cell_types"]].values

    # Annotate by highest mean coefficient
    coefs = pd.DataFrame(
        adata.obsm["X_cell_types"], index=adata.obs_names, columns=cell_types
    )
    coefs["cluster"] = adata.obs[cluster]
    cluster_anno = (
        coefs.groupby("cluster").mean().idxmax(axis=1).to_frame("cluster_annotation")
    )

    if "cluster_annotation" in adata.obs:
        adata.obs = adata.obs.drop(columns=["cluster_annotation"])
    adata.obs = adata.obs.join(cluster_anno, on=cluster)

    # Second round of annotation, including "other"
    assign_cell_types(adata, min_annotated=min_annotated)
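# A minimal usage sketch for `annotate_cells` above (not part of the original
# module). The AnnData file name, the presence of a 'louvain' clustering, and the
# reference dataset name are illustrative assumptions.
def _example_annotate_cells():
    import scanpy as sc

    adata = sc.read_h5ad("preprocessed.h5ad")  # assumed to contain obs['louvain']
    annotate_cells(
        adata,
        dataset="ENCODE.H3K27ac.human",  # assumed ScepiaDataset name
        cluster="louvain",
        n_top_genes=1000,
        min_annotated=50,
    )
    # adds adata.obsm['X_cell_types'], adata.obs['cluster_annotation'] and
    # adata.uns['scepia']['cell_types'] as described above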
def _generate_cost_matrices( self, adata: AnnData, cost_matrices: Optional[ Union[str, Mapping[Tuple[float, float], np.ndarray]] ] = None, ) -> Tuple[Mapping[Tuple[float, float], Optional[np.ndarray]], str]: timepoints = self.experimental_time.cat.categories timepoints = list(zip(timepoints[:-1], timepoints[1:])) if cost_matrices is None: logg.info("Using default cost matrices") return {tpair: None for tpair in timepoints}, "default" if isinstance(cost_matrices, dict): logg.info("Using precomputed cost matrices") cmats = {} for tpair in timepoints: if tpair not in cost_matrices: logg.warning( f"Unable to find cost matrix for pair `{tpair}`. Using default" ) cmats[tpair] = cmat = cost_matrices.get(tpair, None) if cmat is not None: n_start = len(np.where(self.experimental_time == tpair[0])[0]) n_end = len(np.where(self.experimental_time == tpair[1])[0]) try: if cmat.shape != (n_start, n_end): raise ValueError( f"Expected cost matrix for time pair `{tpair}` to be " f"of shape `{(n_start, n_end)}`, found `{cmat.shape}`." ) except AttributeError: logg.warning( f"Unable to verify whether supplied cost matrix for time pair `{tpair}` " f"has the correct shape `{(n_start, n_end)}`" ) # prevent equality comparison when comparing with cache return cmats, nstr("precomputed") if isinstance(cost_matrices, str): logg.info(f"Computing cost matrices using `{cost_matrices!r}` key") if cost_matrices == "X": cost_matrices = None try: features = adata._get_X(layer=cost_matrices) modifier = "layer" except KeyError: try: features = adata.obsm[cost_matrices] modifier = "obsm" except KeyError: raise KeyError( f"Unable to find key `{cost_matrices!r}` in `adata.layers` or `adata.obsm`." ) from None cmats = {} for tpair in tqdm(timepoints, unit="cost matrix"): start_ixs = np.where(self.experimental_time == tpair[0])[0] end_ixs = np.where(self.experimental_time == tpair[1])[0] # being sparse is handled in WOT's function below cmats[tpair] = wot.ot.OTModel.compute_default_cost_matrix( features[start_ixs], features[end_ixs] ) return cmats, f"{modifier}:{cost_matrices}" raise NotImplementedError( f"Specifying cost matrices as " f"`{type(cost_matrices).__name__}` is not yet implemented." )
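# Illustrative sketch of the three accepted `cost_matrices` forms handled above.
# `estimator` stands in for an object exposing this method and an
# `experimental_time` categorical; the time points and shapes are assumptions.
def _example_cost_matrices(estimator, adata):
    import numpy as np

    # 1) None: one default (None) entry per consecutive time-point pair
    cmats, key = estimator._generate_cost_matrices(adata, cost_matrices=None)

    # 2) dict keyed by (start, end) time pairs; each matrix must have shape
    #    (n_cells_at_start, n_cells_at_end), otherwise a ValueError is raised
    n0 = int((estimator.experimental_time == 0).sum())
    n1 = int((estimator.experimental_time == 1).sum())
    cmats, key = estimator._generate_cost_matrices(
        adata, cost_matrices={(0, 1): np.ones((n0, n1))}
    )

    # 3) a string key, looked up first in `adata.layers`, then in `adata.obsm`;
    #    the matrices are then computed with WOT's default cost function
    cmats, key = estimator._generate_cost_matrices(adata, cost_matrices="X_pca")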
def trimap(
    adata: AnnData,
    n_components: int = 2,
    n_inliers: int = 10,
    n_outliers: int = 5,
    n_random: int = 5,
    metric: Literal['angular', 'euclidean', 'hamming', 'manhattan'] = 'euclidean',
    weight_adj: float = 500.0,
    lr: float = 1000.0,
    n_iters: int = 400,
    verbose: Union[bool, int, None] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    TriMap: Large-scale Dimensionality Reduction Using Triplets [Amid19]_.

    TriMap is a dimensionality reduction method that uses triplet constraints
    to form a low-dimensional embedding of a set of points. The triplet
    constraints are of the form "point i is closer to point j than point k".
    The triplets are sampled from the high-dimensional representation of the
    points and a weighting scheme is used to reflect the importance of each
    triplet.

    TriMap provides a significantly better global view of the data than other
    dimensionality reduction methods such as t-SNE, LargeVis, and UMAP.
    The global structure includes relative distances of the clusters, multiple
    scales in the data, and the existence of possible outliers. We define a
    global score to quantify the quality of an embedding in reflecting the
    global structure of the data.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Number of dimensions of the embedding.
    n_inliers
        Number of inlier points for triplet constraints.
    n_outliers
        Number of outlier points for triplet constraints.
    n_random
        Number of random triplet constraints per point.
    metric
        Distance measure: 'angular', 'euclidean', 'hamming', 'manhattan'.
    weight_adj
        Adjusting the weights using a non-linear transformation.
    lr
        Learning rate.
    n_iters
        Number of iterations.
    verbose
        If `True`, print the progress report.
        If `None`, `sc.settings.verbosity` is used.
    copy
        Return a copy instead of writing to `adata`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_trimap** : :class:`~numpy.ndarray`, (:attr:`~anndata.AnnData.obsm`, shape=(n_samples, n_components), dtype `float`)
        TriMap coordinates of data.

    Example
    -------

    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> pbmc = sc.datasets.pbmc68k_reduced()
    >>> pbmc = sce.tl.trimap(pbmc, copy=True)
    >>> sce.pl.trimap(pbmc, color=['bulk_labels'], s=10)
    """

    try:
        from trimap import TRIMAP
    except ImportError:
        raise ImportError('\nplease install trimap: \n\n\tsudo pip install trimap')

    adata = adata.copy() if copy else adata
    start = logg.info('computing TriMap')
    verbosity = settings.verbosity if verbose is None else verbose
    verbose = verbosity if isinstance(verbosity, bool) else verbosity > 0

    if 'X_pca' in adata.obsm:
        n_dim_pca = adata.obsm['X_pca'].shape[1]
        X = adata.obsm['X_pca'][:, :min(n_dim_pca, 100)]
    else:
        X = adata.X
        if scp.issparse(X):
            raise ValueError(
                'trimap currently does not support sparse matrices. Please '
                'use a dense matrix or apply pca first.')
        logg.warning('`X_pca` not found. Run `sc.pp.pca` first for speedup.')

    X_trimap = TRIMAP(
        n_dims=n_components,
        n_inliers=n_inliers,
        n_outliers=n_outliers,
        n_random=n_random,
        lr=lr,
        distance=metric,
        weight_adj=weight_adj,
        n_iters=n_iters,
        verbose=verbose,
    ).fit_transform(X)

    adata.obsm['X_trimap'] = X_trimap
    logg.info(
        '    finished',
        time=start,
        deep="added\n    'X_trimap', TriMap coordinates (adata.obsm)",
    )
    return adata if copy else None
def test_slicing_remove_unused_categories():
    adata = AnnData(
        np.array([[1, 2], [3, 4], [5, 6], [7, 8]]),
        dict(k=['a', 'a', 'b', 'b']))
    adata._sanitize()
    assert adata[3:5].obs['k'].cat.categories.tolist() == ['b']
def filter_cells(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_genes: Optional[int] = None,
    max_counts: Optional[int] = None,
    max_genes: Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
):
    """Filter cell outliers based on counts and numbers of genes expressed.

    For instance, only keep cells with at least `min_counts` counts or
    `min_genes` genes expressed. This is to filter measurement outliers,
    i.e. “unreliable” observations.

    Only provide one of the optional parameters ``min_counts``, ``min_genes``,
    ``max_counts``, ``max_genes`` per call.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    min_counts
        Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of genes expressed required for a cell to pass filtering.
    max_counts
        Maximum number of counts required for a cell to pass filtering.
    max_genes
        Maximum number of genes expressed required for a cell to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    `tuple`, `None`
        Depending on `inplace`, returns the following arrays or directly subsets
        and annotates the data matrix

        cells_subset : :class:`~numpy.ndarray`
            Boolean index mask that does filtering. `True` means that the
            cell is kept. `False` means the cell is removed.
        number_per_cell : :class:`~numpy.ndarray`
            Depending on what was thresholded (`counts` or `genes`),
            the array stores `n_counts` or `n_genes` per cell.

    Examples
    --------
    >>> adata = sc.datasets.krumsiek11()
    >>> adata.n_obs
    640
    >>> adata.var_names
    ['Gata2' 'Gata1' 'Fog1' 'EKLF' 'Fli1' 'SCL' 'Cebpa' 'Pu.1' 'cJun' 'EgrNab' 'Gfi1']
    >>> # add some true zeros
    >>> adata.X[adata.X < 0.3] = 0
    >>> # simply compute the number of genes per cell
    >>> sc.pp.filter_cells(adata, min_genes=0)
    >>> adata.n_obs
    640
    >>> adata.obs['n_genes'].min()
    1
    >>> # filter manually
    >>> adata_copy = adata[adata.obs['n_genes'] >= 3]
    >>> adata_copy.n_obs
    554
    >>> adata_copy.obs['n_genes'].min()
    3
    >>> # actually do some filtering
    >>> sc.pp.filter_cells(adata, min_genes=3)
    >>> adata.n_obs
    554
    >>> adata.obs['n_genes'].min()
    3
    """
    if copy:
        logg.warn('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None for option in
        [min_genes, min_counts, max_genes, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`, '
            '`min_genes`, `max_counts`, `max_genes` per call.')
    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        cell_subset, number = materialize_as_ndarray(
            filter_cells(adata.X, min_counts, min_genes, max_counts, max_genes))
        if not inplace:
            return cell_subset, number
        if min_genes is None and max_genes is None:
            adata.obs['n_counts'] = number
        else:
            adata.obs['n_genes'] = number
        adata._inplace_subset_obs(cell_subset)
        return adata if copy else None
    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_genes is None else min_genes
    max_number = max_counts if max_genes is None else max_genes
    number_per_cell = np.sum(
        X if min_genes is None and max_genes is None else X > 0, axis=1)
    if issparse(X):
        number_per_cell = number_per_cell.A1
    if min_number is not None:
        cell_subset = number_per_cell >= min_number
    if max_number is not None:
        cell_subset = number_per_cell <= max_number
    s = np.sum(~cell_subset)
    if s > 0:
        logg.info('filtered out {} cells that have'.format(s), end=' ')
        if min_genes is not None or min_counts is not None:
            logg.info('less than',
                      str(min_genes) + ' genes expressed'
                      if min_counts is None else str(min_counts) + ' counts',
                      no_indent=True)
        if max_genes is not None or max_counts is not None:
            logg.info('more than ',
                      str(max_genes) + ' genes expressed'
                      if max_counts is None else str(max_counts) + ' counts',
                      no_indent=True)
    return cell_subset, number_per_cell
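# A small sketch of calling `filter_cells` without modifying the object
# (the Poisson toy counts are invented for illustration).
def _example_filter_cells():
    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.random.poisson(1.0, size=(100, 20)).astype(np.float32))

    # inplace=False returns the boolean mask and the per-cell statistic
    # instead of subsetting `adata`
    cell_subset, n_genes_per_cell = filter_cells(adata, min_genes=3, inplace=False)
    adata_filtered = adata[cell_subset].copy()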
def combat(adata: AnnData, key: str = 'batch', inplace: bool = True):
    """ComBat function for batch effect correction [Johnson07]_ [Leek12]_ [Pedersen12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes. This uses the
    implementation of `ComBat <https://github.com/brentp/combat.py>`__ [Pedersen12]_.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    key: `str`, optional (default: `"batch"`)
        Key to a categorical annotation from adata.obs that will be used for batch effect removal
    inplace: bool, optional (default: `True`)
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of inplace, either returns an updated AnnData object
    or modifies the passed one.
    """

    # check the input
    if key not in adata.obs.keys():
        raise ValueError('Could not find the key {!r} in adata.obs'.format(key))

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(
        data=X,
        index=adata.var_names,
        columns=adata.obs_names,
    )

    # construct a pandas series of the batch annotation
    batch = pd.Series(adata.obs[key])
    model = pd.DataFrame({'batch': batch})
    batch_items = model.groupby("batch").groups.items()
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    sys.stderr.write("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = stand_data(model, data)

    # fitting the parameters on the standardized data
    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = np.dot(
        np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T),
        s_data.T)
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # temp stores our estimates for the batch effect parameters.
        # temp[0] is the additive batch effect
        # temp[1] is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data[batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )

        gamma_star.append(gamma)
        delta_star.append(delta)

    sys.stdout.write("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from above
    # loop over all batches in the data
    for j, batch_idxs in enumerate(batch_info):
        # we basically subtract the additive batch effect, rescale by the ratio
        # of multiplicative batch effect to pooled variance and add the overall gene
        # wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs] -
                         np.dot(batch_design.loc[batch_idxs], gamma_star).T)
        bayesdata[batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
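# Minimal usage sketch for the `combat` implementation above (toy data; the
# matrix values and batch labels are invented for illustration).
def _example_combat():
    import numpy as np
    import pandas as pd
    from anndata import AnnData

    X = np.random.normal(size=(8, 5))
    obs = pd.DataFrame({"batch": ["a"] * 4 + ["b"] * 4})
    adata = AnnData(X, obs=obs)

    combat(adata, key="batch", inplace=True)               # overwrites adata.X
    corrected = combat(adata, key="batch", inplace=False)  # returns the matrix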
from itertools import product

import numpy as np
from numpy import ma
import pandas as pd
import pytest
from scipy import sparse as sp
from scipy.sparse import csr_matrix, isspmatrix_csr, issparse

from anndata import AnnData, Raw
from helpers import assert_equal, gen_adata


# some test objects that we use below
adata_dense = AnnData(np.array([[1, 2], [3, 4]]))
adata_sparse = AnnData(
    csr_matrix([[0, 2, 3], [0, 5, 6]]),
    dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]),
    dict(var_names=["a", "b", "c"]),
)


def test_creation():
    AnnData(np.array([[1, 2], [3, 4]]))
    AnnData(np.array([[1, 2], [3, 4]]), {}, {})
    AnnData(ma.array([[1, 2], [3, 4]]), uns=dict(mask=[0, 1, 1, 0]))
    AnnData(sp.eye(2))
    X = np.array([[1, 2, 3], [4, 5, 6]])
    adata = AnnData(
        X=X,
        obs=dict(Obs=["A", "B"]),
        var=dict(Feat=["a", "b", "c"]),
def circular_projection( adata: AnnData, keys: Union[str, Sequence[str]], backward: bool = False, lineages: Optional[Union[str, Sequence[str]]] = None, early_cells: Optional[Union[Mapping[str, Sequence[str]], Sequence[str]]] = None, lineage_order: Optional[Literal["default", "optimal"]] = None, metric: Union[str, Callable, np.ndarray, pd.DataFrame] = "correlation", normalize_by_mean: bool = True, ncols: int = 4, space: float = 0.25, use_raw: bool = False, text_kwargs: Mapping[str, Any] = MappingProxyType({}), labeldistance: float = 1.25, labelrot: Union[Literal["default", "best"], float] = "best", show_edges: bool = True, key_added: Optional[str] = None, figsize: Optional[Tuple[float, float]] = None, dpi: Optional[int] = None, save: Optional[Union[str, Path]] = None, **kwargs: Any, ): r""" Plot absorption probabilities on a circular embedding as in :cite:`velten:17`. Parameters ---------- %(adata)s keys Keys in :attr:`anndata.AnnData.obs` or :attr:`anndata.AnnData.var_names`. Additional keys are: - `'kl_divergence'` - as in :cite:`velten:17`, computes KL-divergence between the fate probabilities of a cell and the average fate probabilities. See ``early_cells`` for more information. - `'entropy'` - as in :cite:`setty:19`, computes entropy over a cells fate probabilities. %(backward)s lineages Lineages to plot. If `None`, plot all lineages. early_cells Cell ids or a mask marking early cells used to define the average fate probabilities. If `None`, use all cells. Only used when `'kl_divergence'` is in ``keys``. If a :class:`dict`, key specifies a cluster key in :attr:`anndata.AnnData.obs` and the values specify cluster labels containing early cells. lineage_order Can be one of the following: - `None` - it will determined automatically, based on the number of lineages. - `'optimal'` - order lineages optimally by solving the Travelling salesman problem (TSP). Recommended for <= `20` lineages. - `'default'` - use the order as specified in ``lineages``. metric Metric to use when constructing pairwise distance matrix when ``lineage_order = 'optimal'``. For available options, see :func:`sklearn.metrics.pairwise_distances`. normalize_by_mean If `True`, normalize each lineage by its mean probability, as done in :cite:`velten:17`. ncols Number of columns when plotting multiple ``keys``. space Horizontal and vertical space between for :func:`matplotlib.pyplot.subplots_adjust`. use_raw Whether to access :attr:`anndata.AnnData.raw` when there are ``keys`` in :attr:`anndata.AnnData.var_names`. text_kwargs Keyword arguments for :func:`matplotlib.pyplot.text`. labeldistance Distance at which the lineage labels will be drawn. labelrot How to rotate the labels. Valid options are: - `'best'` - rotate labels so that they are easily readable. - `'default'` - use :mod:`matplotlib`'s default. - `None` - same as `'default'`. If a :class:`float`, all labels will be rotated by this many degrees. show_edges Whether to show the edges surrounding the simplex. key_added Key in :attr:`anndata.AnnData.obsm` where to add the circular embedding. If `None`, it will be set to `'X_fate_simplex_{fwd,bwd}'`, based on ``backward``. %(plotting)s kwargs Keyword arguments for :func:`scvelo.pl.scatter`. Returns ------- %(just_plots)s Also updates ``adata`` with the following fields: - :attr:`anndata.AnnData.obsm` ``['{key_added}']`` - the circular projection. - :attr:`anndata.AnnData.obs` ``['to_{initial,terminal}_states_{method}']`` - the priming degree, if a method is present in ``keys``. 
""" if labeldistance is not None and labeldistance < 0: raise ValueError( f"Expected `delta` to be positive, found `{labeldistance}`.") if labelrot is None: labelrot = LabelRot.DEFAULT if isinstance(labelrot, str): labelrot = LabelRot(labelrot) suffix = "bwd" if backward else "fwd" if key_added is None: key_added = "X_fate_simplex_" + suffix if isinstance(keys, str): keys = (keys, ) keys = _unique_order_preserving(keys) keys_ = _check_collection( adata, keys, "obs", key_name="Observation", raise_exc=False) + _check_collection(adata, keys, "var_names", key_name="Gene", raise_exc=False, use_raw=use_raw) haystack = {s.s for s in PrimingDegree} keys = keys_ + [k for k in keys if k in haystack] keys = _unique_order_preserving(keys) if not len(keys): raise ValueError("No valid keys have been selected.") lineage_key = str(AbsProbKey.BACKWARD if backward else AbsProbKey.FORWARD) if lineage_key not in adata.obsm: raise KeyError( f"Lineages key `{lineage_key!r}` not found in `adata.obsm`.") probs = adata.obsm[lineage_key] if isinstance(lineages, str): lineages = (lineages, ) elif lineages is None: lineages = probs.names probs: Lineage = adata.obsm[lineage_key][lineages] n_lin = probs.shape[1] if n_lin < 3: raise ValueError(f"Expected at least `3` lineages, found `{n_lin}`.") X = probs.X.copy() if normalize_by_mean: X /= np.mean(X, axis=0)[None, :] X /= X.sum(1)[:, None] # this happens when cells for sel. lineages sum to 1 (or when the lineage average is 0, which is unlikely) X = np.nan_to_num(X, nan=1.0 / n_lin, copy=False) if lineage_order is None: lineage_order = (LineageOrder.OPTIMAL if 3 < n_lin <= 20 else LineageOrder.DEFAULT) logg.debug(f"Set ordering to `{lineage_order}`") lineage_order = LineageOrder(lineage_order) if lineage_order == LineageOrder.OPTIMAL: logg.info(f"Solving TSP for `{n_lin}` states") _, order = _get_optimal_order(X, metric=metric) else: order = np.arange(n_lin) probs = probs[:, order] X = X[:, order] angle_vec = np.linspace(0, 2 * np.pi, n_lin, endpoint=False) angle_vec_sin = np.cos(angle_vec) angle_vec_cos = np.sin(angle_vec) x = np.sum(X * angle_vec_sin, axis=1) y = np.sum(X * angle_vec_cos, axis=1) adata.obsm[key_added] = np.c_[x, y] nrows = int(np.ceil(len(keys) / ncols)) fig, ax = plt.subplots( nrows=nrows, ncols=ncols, figsize=(ncols * 5, nrows * 5) if figsize is None else figsize, dpi=dpi, ) fig.subplots_adjust(wspace=space, hspace=space) axes = np.ravel([ax]) text_kwargs = dict(text_kwargs) text_kwargs["ha"] = "center" text_kwargs["va"] = "center" _i = 0 for _i, (k, ax) in enumerate(zip(keys, axes)): set_lognorm, colorbar = False, kwargs.pop("colorbar", True) try: _ = PrimingDegree(k) logg.debug(f"Calculating priming degree using `method={k}`") val = probs.priming_degree(method=k, early_cells=early_cells) k = f"{lineage_key}_{k}" adata.obs[k] = val except ValueError: pass scv.pl.scatter( adata, basis=key_added, color=k, show=False, ax=ax, use_raw=use_raw, norm=LogNorm() if set_lognorm else None, colorbar=colorbar, **kwargs, ) if colorbar and set_lognorm: cbar = ax.collections[0].colorbar cax = cbar.locator.axis ticks = cax.minor.locator.tick_values(cbar.vmin, cbar.vmax) ticks = [ticks[0], ticks[len(ticks) // 2 + 1], ticks[-1]] cbar.set_ticks(ticks) cbar.set_ticklabels([f"{t:.2f}" for t in ticks]) cbar.update_ticks() patches, texts = ax.pie( np.ones_like(angle_vec), labeldistance=labeldistance, rotatelabels=True, labels=probs.names[::-1], startangle=-360 / len(angle_vec) / 2, counterclock=False, textprops=text_kwargs, ) for patch in patches: patch.set_visible(False) 
# clockwise for color, text in zip(probs.colors[::-1], texts): if isinstance(labelrot, (int, float)): text.set_rotation(labelrot) elif labelrot == LabelRot.BEST: rot = text.get_rotation() text.set_rotation(rot + 90 + (1 - rot // 180) * 180) elif labelrot != LabelRot.DEFAULT: raise NotImplementedError( f"Label rotation `{labelrot}` is not yet implemented.") text.set_color(color) if not show_edges: continue for i, color in enumerate(probs.colors): next = (i + 1) % n_lin x = 1.04 * np.linspace(angle_vec_sin[i], angle_vec_sin[next], _N) y = 1.04 * np.linspace(angle_vec_cos[i], angle_vec_cos[next], _N) points = np.array([x, y]).T.reshape(-1, 1, 2) segments = np.concatenate([points[:-1], points[1:]], axis=1) cmap = LinearSegmentedColormap.from_list( "abs_prob_cmap", [color, probs.colors[next]], N=_N) lc = LineCollection(segments, cmap=cmap, zorder=-1) lc.set_array(np.linspace(0, 1, _N)) lc.set_linewidth(2) ax.add_collection(lc) for j in range(_i + 1, len(axes)): axes[j].remove() if save is not None: save_fig(fig, save)
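# Usage sketch for `circular_projection` (illustrative; assumes absorption
# probabilities for at least three lineages have already been computed and are
# stored under the forward lineage key in `adata.obsm`).
def _example_circular_projection(adata):
    circular_projection(
        adata,
        keys=["kl_divergence", "entropy"],  # priming-degree methods handled above
        lineage_order="optimal",            # solve the TSP for <= 20 lineages
        labelrot="best",
        labeldistance=1.25,
    )
    # the circular embedding is written to adata.obsm['X_fate_simplex_fwd']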
def SCALEX(data_list, batch_categories=None, profile='RNA', join='inner', batch_key='batch', batch_name='batch', min_features=600, min_cells=3, n_top_features=2000, batch_size=64, lr=2e-4, max_iteration=30000, seed=124, gpu=0, outdir='output/', projection=None, repeat=False, impute=None, chunk_size=20000, ignore_umap=False, verbose=False, assess=False, show=False, processed=False): """ Single-Cell integrative Analysis via Latent feature Extraction Parameters ---------- data_list A path list of AnnData matrices to concatenate with. Each matrix is referred to as a 'batch'. batch_categories Categories for the batch annotation. By default, use increasing numbers. profile Specify the single-cell profile, RNA or ATAC. Default: RNA. join Use intersection ('inner') or union ('outer') of variables of different batches. batch_key Add the batch annotation to obs using this key. By default, batch_key='batch'. batch_name Use this annotation in obs as batches for training model. Default: 'batch'. min_features Filtered out cells that are detected in less than min_features. Default: 600. min_cells Filtered out genes that are detected in less than min_cells. Default: 3. n_top_features Number of highly-variable genes to keep. Default: 2000. batch_size Number of samples per batch to load. Default: 64. lr Learning rate. Default: 2e-4. max_iteration Max iterations for training. Training one batch_size samples is one iteration. Default: 30000. seed Random seed for torch and numpy. Default: 124. gpu Index of GPU to use if GPU is available. Default: 0. outdir Output directory. Default: 'output/'. projection Use for new dataset projection. Input the folder containing the pre-trained model. If None, don't do projection. Default: None. repeat Use with projection. If False, concatenate the reference and projection datasets for downstream analysis. If True, only use projection datasets. Default: False. impute If True, calculate the imputed gene expression and store it at adata.layers['impute']. Default: False. chunk_size Number of samples from the same batch to transform. Default: 20000. ignore_umap If True, do not perform UMAP for visualization and leiden for clustering. Default: False. verbose Verbosity, True or False. Default: False. assess If True, calculate the entropy_batch_mixing score and silhouette score to evaluate integration results. Default: False. Returns ------- The output folder contains: adata.h5ad The AnnData matrice after batch effects removal. The low-dimensional representation of the data is stored at adata.obsm['latent']. checkpoint model.pt contains the variables of the model and config.pt contains the parameters of the model. log.txt Records raw data information, filter conditions, model parameters etc. umap.pdf UMAP plot for visualization. 
""" np.random.seed(seed) # seed torch.manual_seed(seed) if torch.cuda.is_available(): # cuda device device = 'cuda' torch.cuda.set_device(gpu) else: device = 'cpu' outdir = outdir + '/' os.makedirs(outdir + '/checkpoint', exist_ok=True) log = create_logger('', fh=outdir + 'log.txt') if not projection: adata, trainloader, testloader = load_data( data_list, batch_categories, join=join, profile=profile, n_top_features=n_top_features, batch_size=batch_size, chunk_size=chunk_size, min_features=min_features, min_cells=min_cells, batch_name=batch_name, batch_key=batch_key, log=log, processed=processed) early_stopping = EarlyStopping(patience=10, checkpoint_file=outdir + '/checkpoint/model.pt') x_dim, n_domain = adata.shape[1], len( adata.obs['batch'].cat.categories) # model config enc = [['fc', 1024, 1, 'relu'], ['fc', 10, '', '']] # TO DO # enc = [['fc', 32, 1, 'relu'],['fc', 10, '', '']] dec = [['fc', x_dim, n_domain, 'sigmoid']] model = VAE(enc, dec, n_domain=n_domain) log.info('model\n' + model.__repr__()) model.fit( trainloader, lr=lr, max_iteration=max_iteration, device=device, early_stopping=early_stopping, verbose=verbose, ) torch.save( { 'n_top_features': adata.var.index, 'enc': enc, 'dec': dec, 'n_domain': n_domain }, outdir + '/checkpoint/config.pt') else: state = torch.load(projection + '/checkpoint/config.pt') n_top_features, enc, dec, n_domain = state['n_top_features'], state[ 'enc'], state['dec'], state['n_domain'] model = VAE(enc, dec, n_domain=n_domain) model.load_model(projection + '/checkpoint/model.pt') model.to(device) adata, trainloader, testloader = load_data( data_list, batch_categories, join='outer', profile=profile, chunk_size=chunk_size, n_top_features=n_top_features, min_cells=0, min_features=min_features, batch_name=batch_name, batch_key=batch_key, log=log) # log.info('Processed dataset shape: {}'.format(adata.shape)) adata.obsm['latent'] = model.encodeBatch(testloader, device=device) # save latent rep if impute: adata.layers['impute'] = model.encodeBatch(testloader, out='impute', batch_id=impute, device=device) log.info('Output dir: {}'.format(outdir)) if projection and (not repeat): ref = sc.read_h5ad(projection + '/adata.h5ad') adata = AnnData.concatenate(ref, adata, batch_categories=['reference', 'query'], batch_key='projection', index_unique=None) adata.write(outdir + 'adata.h5ad', compression='gzip') if not ignore_umap: #and adata.shape[0]<1e6: log.info('Plot umap') sc.pp.neighbors(adata, n_neighbors=30, use_rep='latent') sc.tl.umap(adata, min_dist=0.1) sc.tl.leiden(adata) # UMAP visualization sc.settings.figdir = outdir sc.set_figure_params(dpi=80, figsize=(10, 10), fontsize=20) cols = ['batch', 'celltype', 'leiden'] color = [c for c in cols if c in adata.obs] if len(color) > 0: if projection and (not repeat): embedding(adata, groupby='projection', save='.pdf', show=show) else: sc.pl.umap(adata, color=color, save='.pdf', wspace=0.4, ncols=4, show=show) if assess: if len(adata.obs['batch'].cat.categories) > 1: entropy_score = batch_entropy_mixing_score( adata.obsm['X_umap'], adata.obs['batch']) log.info( 'batch_entropy_mixing_score: {:.3f}'.format(entropy_score)) if 'celltype' in adata.obs: sil_score = silhouette_score(adata.obsm['X_umap'], adata.obs['celltype'].cat.codes) log.info("silhouette_score: {:.3f}".format(sil_score)) adata.write(outdir + 'adata.h5ad', compression='gzip') return adata
def test_normalize_total_layers(typ, dtype):
    adata = AnnData(typ(X_total), dtype=dtype)
    adata.layers["layer"] = adata.X.copy()
    sc.pp.normalize_total(adata, layers=["layer"])
    assert np.allclose(adata.layers["layer"].sum(axis=1), [3.0, 3.0, 3.0])
def visualize_dictionary(ct, X_dimred, genes, cell_types, namespace, dag_method, verbose=True): from anndata import AnnData from scanorama import visualize import scanpy as sc import seaborn as sns # KNN and UMAP. if verbose: tprint('Constructing KNN graph...') adata = AnnData(X=X_dimred) sc.pp.neighbors(adata, use_rep='X') if verbose: tprint('Visualizing with UMAP...') sc.tl.umap(adata, min_dist=0.5) embedding = np.array(adata.obsm['X_umap']) embedding[embedding < -20] = -20 embedding[embedding > 20] = 20 # Visualize cell types. le = LabelEncoder().fit(cell_types) cell_types_int = le.transform(cell_types) visualize(None, cell_types_int, '{}_pan_umap_{}_type'.format(namespace, dag_method), np.array(sorted(set(cell_types))), embedding=embedding, image_suffix='.png') #max_intensity = ct.labels_.max() for c_idx in range(ct.labels_.shape[1]): intensity = ct.labels_[:, c_idx] intensity /= intensity.max() print('\nCluster {}'.format(c_idx)) print_cell_types(cell_types, intensity) # Visualize cluster in UMAP coordinates. plt.figure() plt.title('Cluster {}'.format(c_idx)) plt.scatter(embedding[:, 0], embedding[:, 1], c=intensity, cmap=cm.get_cmap('Blues'), s=1) plt.savefig('{}_pan_umap_{}_cluster{}.png'.format( namespace, dag_method, c_idx), dpi=500) plt.figure() plt.title('Cluster {}'.format(c_idx)) plt.hist(intensity.flatten(), bins=100) plt.savefig('{}_pan_umap_{}_intensehist{}.png'.format( namespace, dag_method, c_idx), dpi=500) intensity = (intensity > 0.8) * 1 plt.figure() plt.title('Cluster {}'.format(c_idx)) plt.scatter(embedding[:, 0], embedding[:, 1], c=intensity, cmap=cm.get_cmap('Blues'), s=1) plt.savefig('{}_pan_umap_{}_member{}.png'.format( namespace, dag_method, c_idx), dpi=500) for c_idx in range(ct.labels_.shape[1]): # Visualize covariance matrix. corr = ct.dictionary_[:, :, c_idx] corr[np.isnan(corr)] = 0 #print('\nCluster {}'.format(c_idx)) #print_gene_modules(corr, genes) gene_idx = np.sum(np.abs(corr), axis=1) > 0 if np.sum(gene_idx) == 0: continue corr = corr[gene_idx] corr = corr[:, gene_idx] plt.figure() plt.title('Cluster {}'.format(c_idx)) plt.rcParams.update({'font.size': 5}) cmap = sns.diverging_palette(220, 10, as_cmap=True) corr_max = max(corr.max(), abs(corr.min())) sns.clustermap(corr, xticklabels=genes[gene_idx], yticklabels=genes[gene_idx], cmap=cmap, vmin=-corr_max, vmax=corr_max) plt.xticks(rotation=90) plt.yticks(rotation=90) plt.savefig('{}_pan_cov_{}_cluster{}.png'.format( namespace, dag_method, c_idx), dpi=500)
def correlate_tf_motifs( adata: AnnData, n_sketch: Optional[int] = 2500, n_permutations: Optional[int] = 100000, indirect: Optional[bool] = True, ) -> None: """Correlate inferred motif activity with TF expression. Parameters ---------- adata : :class:`~anndata.AnnData` Annotated data matrix. n_sketch : `int`, optional (default: 2500) If the number of cells is higher than `n_sketch`, use geometric sketching (Hie et al. 2019) to select a subset of `n_sketch` cells. This subset will be used to calculate the correlation beteen motif activity and transcription factor expression. n_permutations : `int`, optional (default: 100000) Number of permutations that is used to calculate the p-value. Can be decreased for quicker run-time, but should probably not be below 10000. indirect : `bool`, optional (default: True) Include indirect TF to motif assignments. """ logger.info("correlating motif activity with factors") if indirect: logger.info("including indirect and/or predicted factors") # Get all TFs from motif database m2f = motif_mapping(indirect=True) batch_size = m2f.shape[0] f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(), index=m2f.index).stack() f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]] f2m2.columns = ["motif", "factor"] unique_factors = f2m2["factor"].unique() if n_sketch is None or n_sketch > adata.shape[0]: logger.info(f"using all cells") my_adata = adata else: logger.info(f"creating sketch of {n_sketch} cells") idx = geosketch.gs(adata.obsm["X_pca"], n_sketch) my_adata = adata.copy() my_adata = my_adata[idx] detected = (my_adata.raw.var_names.str.upper().isin(unique_factors)) & ( (my_adata.raw.X > 0).sum(0) > 3) detected = np.squeeze(np.asarray(detected)) unique_factors = my_adata.raw.var_names[detected].str.upper() # Get the expression for all TFs expression = (np.squeeze(np.asarray(my_adata.raw.X.todense())) if issparse(my_adata.raw.X) else my_adata.raw.X) expression = expression.T[detected] logger.info( f"calculating correlation of motif activity with {len(unique_factors)} factors" ) real = fast_corr( expression, (my_adata.obsm["X_cell_types"] @ my_adata.uns["scepia"]["motif_activity"].T).T.values, ) real = pd.DataFrame( real, index=unique_factors, columns=my_adata.uns["scepia"]["motif_activity"].index, ) tmp = (real.reset_index().melt( id_vars="index", var_name="motif", value_name="correlation").rename(columns={ "index": "factor" }).set_index(["motif", "factor"])) f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna() f2m2["abs_correlation"] = f2m2["correlation"].abs() logger.info(f"calculating {n_permutations} permutations") permute_result = pd.DataFrame(index=unique_factors) shape = my_adata.uns["scepia"]["motif_activity"].shape for i in tqdm(range(0, n_permutations, batch_size)): random_activities = None while random_activities is None or random_activities.shape[ 0] < batch_size: x = my_adata.uns["scepia"]["motif_activity"].values.flatten() motif_activity = shuffle(x).reshape(shape[1], shape[0]) cell_motif_activity = ( my_adata.obsm["X_cell_types"] @ motif_activity).T if random_activities is None: random_activities = cell_motif_activity else: random_activities = np.vstack( (random_activities, cell_motif_activity)) random_activities = random_activities[:batch_size] batch_result = fast_corr(expression, random_activities) batch_result = pd.DataFrame(batch_result, index=unique_factors, columns=range(i, i + batch_size)) permute_result = permute_result.join(batch_result) logger.info("calculating permutation-based p-values (all)") # Calculate p-value of 
correlation relative to all permuted correlations permuted_corrs = permute_result.values.flatten() pvals = [(100 - percentileofscore(permuted_corrs, corr)) / 100 for corr in f2m2["correlation"]] f2m2["pval"] = pvals f2m2.loc[f2m2["correlation"] < 0, "pval"] = (1 - f2m2.loc[f2m2["correlation"] < 0, "pval"]) logger.info("calculating permutation-based p-values (factor-specific)") # Calculate p-value of correlation relative to permutated value of this factor for motif, factor in tqdm(f2m2.index): pval = (100 - percentileofscore(permute_result.loc[factor], real.loc[factor, motif])) / 100 pval = 1 - pval if real.loc[factor, motif] < 0 else pval pval = 1 / permute_result.shape[1] if pval == 0 else pval f2m2.loc[(motif, factor), "permutation_pval"] = pval f2m2.loc[(motif, factor), "combined"] = combine_pvalues( f2m2.loc[(motif, factor), ["pval", "permutation_pval"]])[1] f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1] f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"]) cluster_cell_types = adata.obs["cluster_annotation"].unique() f2m2 = f2m2.join( (adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) - adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1) ).to_frame("motif_stddev").rename_axis("motif")) f2m2 = f2m2.reset_index().set_index("factor") adata.uns["scepia"]["correlation"] = f2m2
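# Sketch of calling `correlate_tf_motifs` directly (it is normally invoked from
# `infer_motifs` below); assumes adata.raw, adata.obsm['X_cell_types'] and
# adata.uns['scepia']['motif_activity'] are already populated.
def _example_correlate_tf_motifs(adata):
    correlate_tf_motifs(adata, n_sketch=2500, n_permutations=100000, indirect=True)
    # the factor/motif correlation table is stored in
    # adata.uns['scepia']['correlation']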
def infer_motifs( adata: AnnData, dataset: str, cluster: Optional[str] = "louvain", n_top_genes: Optional[int] = 1000, max_cell_types: Optional[int] = 50, pfm: Optional[str] = None, min_annotated: Optional[int] = 50, num_enhancers: Optional[int] = 10000, maelstrom: Optional[bool] = False, indirect: Optional[bool] = True, n_sketch: Optional[int] = 2500, n_permutations: Optional[int] = 100000, ) -> None: """Infer motif ativity for single cell RNA-seq data. The adata object is modified with the following fields. **X_cell_types** : `adata.obsm` field Cell type coefficients. Parameters ---------- adata : :class:`~anndata.AnnData` Annotated data matrix. dataset : `str` Name of reference data set or directory with reference data. cluster : `str`, optional (default: "louvain") Name of the clustering, can be either louvain or leiden. n_top_genes : `int`, optional (default: 1000) Number of variable genes that is used. If `n_top_genes` is greater than the number of hypervariable genes in `adata` then all variable genes are used. max_cell_types : `int`, optional (default: 50) Maximum number of cell types to select. pfm : `str`, optional (default: None) Name of motif file in PFM format. The GimmeMotifs default is used if this parameter is not specified. This can be a filename, or a pfm name support by GimmeMotifs such as `JASPAR2018_vertebrates`. If a custom PFM file is specified, there should also be an associated `.motif2factors.txt` file. min_annotated : `int`, optional (default: 50) Cells that are annotated with cell types less than this number will be annotated as "other". num_enhancers : `int`, optional (default: 10000) Number of enhancers to use for motif activity analysis. maelstrom : `boolean`, optional (default: False) Use maelstrom instead of ridge regression for motif activity analysis. 
""" use_name = True validate_adata(adata) data = ScepiaDataset(dataset) if "scepia" not in adata.uns: adata.uns["scepia"] = {"version": __version__} # Annotate each cell with H3K27ac reference if "cell_annotation" not in adata.obs or "cluster_annotation" not in adata.obs: annotate_cells( adata, dataset=dataset, cluster=cluster, n_top_genes=n_top_genes, min_annotated=min_annotated, max_cell_types=max_cell_types, ) logger.info("Linking variable genes to differential enhancers.") gene_map_file = data.gene_mapping link_file = data.link_file link = pd.read_feather(link_file) if use_name: ens2name = pd.read_csv(gene_map_file, sep="\t", index_col=0, names=["identifier", "name"]) link = link.join(ens2name, on="gene").dropna() link = link.set_index("name") link.index = link.index.str.upper() enh_genes = adata.var_names[adata.var_names.str.upper().isin( link.index)].str.upper() var_enhancers = change_region_size(link.loc[enh_genes, "loc"]).unique() enhancer_df = data.load_reference_data(reftype="enhancer") enhancer_df.index = change_region_size(enhancer_df.index) enhancer_df = enhancer_df.loc[var_enhancers, adata.uns["scepia"]["cell_types"]] enhancer_df = enhancer_df.groupby(enhancer_df.columns, axis=1).mean() enhancer_df.loc[:, :] = scale(enhancer_df) main_cell_types = pd.concat(( adata.obs["cluster_annotation"].astype(str), adata.obs["cell_annotation"].astype(str), )) main_cell_types = [x for x in main_cell_types.unique() if x != "other"] # Select top most variable enhancers of the most important annotated cell types enhancer_df = enhancer_df.loc[enhancer_df[main_cell_types].var( 1).sort_values().tail(num_enhancers).index] # Center by mean of the most import cell types # Here we chose the majority cell type per cluster mean_value = enhancer_df[main_cell_types].mean(1) enhancer_df = enhancer_df.sub(mean_value, axis=0) fname = NamedTemporaryFile(delete=False).name enhancer_df.to_csv(fname, sep="\t") logger.info("inferring motif activity") pfm = pfmfile_location(pfm) if maelstrom: with TemporaryDirectory() as tmpdir: run_maelstrom( fname, data.genome, tmpdir, center=False, filter_redundant=True, ) motif_act = pd.read_csv( os.path.join(tmpdir, "final.out.txt"), sep="\t", comment="#", index_col=0, ) motif_act.columns = motif_act.columns.str.replace( r"z-score\s+", "") pfm = pfmfile_location( os.path.join(tmpdir, "nonredundant.motifs.pfm")) else: logger.info(f"Activity based on genome {data.genome}") motif_act = moap( fname, scoring="score", genome=data.genome, method="bayesianridge", pfmfile=pfm, ncpus=12, ) adata.uns["scepia"]["pfm"] = pfm adata.uns["scepia"]["motif_activity"] = motif_act[adata.uns["scepia"] ["cell_types"]] logger.info("calculating cell-specific motif activity") cell_motif_activity = ( adata.uns["scepia"]["motif_activity"] @ adata.obsm["X_cell_types"].T).T cell_motif_activity.index = adata.obs_names adata.obs = adata.obs.drop( columns=cell_motif_activity.columns.intersection(adata.obs.columns)) adata.obs = adata.obs.join(cell_motif_activity) correlate_tf_motifs(adata, indirect=indirect, n_sketch=n_sketch, n_permutations=n_permutations) add_activity(adata) logger.info("Done with motif inference.")
def test_from_df_and_dict():
    df = pd.DataFrame(dict(a=[0.1, 0.2, 0.3], b=[1.1, 1.2, 1.3]))
    adata = AnnData(df, dict(species=pd.Categorical(["a", "b", "a"])))
    assert adata.obs["species"].values.tolist() == ["a", "b", "a"]
def test_strings_to_categoricals():
    adata = AnnData(
        np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"])
    )
    adata.strings_to_categoricals()
    assert adata.obs["k"].cat.categories.tolist() == ["a", "b"]
def flat_model( adata: AnnData, max_iterations: int = 1000000, epsilon: float = 0, equilibrate: bool = False, wait: int = 1000, nbreaks: int = 2, collect_marginals: bool = False, niter_collect: int = 10000, deg_corr: bool = True, multiflip: bool = True, fast_model: bool = False, n_init: int = 1, beta_range: Tuple[float] = (1., 100.), steps_anneal: int = 5, resume: bool = False, *, restrict_to: Optional[Tuple[str, Sequence[str]]] = None, random_seed: Optional[int] = None, key_added: str = 'sbm', adjacency: Optional[sparse.spmatrix] = None, neighbors_key: Optional[str] = 'neighbors', directed: bool = False, use_weights: bool = False, copy: bool = False, minimize_args: Optional[Dict] = {}, equilibrate_args: Optional[Dict] = {}, ) -> Optional[AnnData]: """\ Cluster cells into subgroups [Peixoto14]_. Cluster cells using the Stochastic Block Model [Peixoto14]_, performing Bayesian inference on node groups. This requires having ran :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first. Parameters ---------- adata The annotated data matrix. max_iterations Maximal number of iterations to be performed by the equilibrate step. epsilon Relative changes in entropy smaller than epsilon will not be considered as record-breaking. equilibrate Whether or not perform the mcmc_equilibrate step. Equilibration should always be performed. Note, also, that without equilibration it won't be possible to collect marginals. collect_marginals Whether or not collect node probability of belonging to a specific partition. niter_collect Number of iterations to force when collecting marginals. This will increase the precision when calculating probabilites wait Number of iterations to wait for a record-breaking event. Higher values result in longer computations. Set it to small values when performing quick tests. nbreaks Number of iteration intervals (of size `wait`) without record-breaking events necessary to stop the algorithm. deg_corr Whether to use degree correction in the minimization step. In many real world networks this is the case, although this doesn't seem the case for KNN graphs used in scanpy. multiflip Whether to perform MCMC sweep with multiple simultaneous moves to sample network partitions. It may result in slightly longer runtimes, but under the hood it allows for a more efficient space exploration. fast_model Whether to skip initial minization step and let the MCMC find a solution. This approach tend to be faster and consume less memory, but less accurate. n_init Number of initial minimizations to be performed. The one with smaller entropy is chosen beta_range Inverse temperature at the beginning and the end of the equilibration steps_anneal Number of steps in which the simulated annealing is performed resume Start from a previously created model, if any, without initializing a novel model key_added `adata.obs` key under which to add the cluster labels. adjacency Sparse adjacency matrix of the graph, defaults to `adata.uns['neighbors']['connectivities']` in case of scanpy<=1.4.6 or `adata.obsp[neighbors_key][connectivity_key]` for scanpy>1.4.6 neighbors_key The key passed to `sc.pp.neighbors` directed Whether to treat the graph as directed or undirected. use_weights If `True`, edge weights from the graph are used in the computation (placing more emphasis on stronger edges). Note that this increases computation times copy Whether to copy `adata` or modify it inplace. 
random_seed Random number to be used as seed for graph-tool Returns ------- `adata.obs[key_added]` Array of dim (number of samples) that stores the subgroup id (`'0'`, `'1'`, ...) for each cell. `adata.uns['sbm']['params']` A dict with the values for the parameters `resolution`, `random_state`, and `n_iterations`. `adata.uns['sbm']['stats']` A dict with the values returned by mcmc_sweep `adata.uns['sbm']['cell_affinity']` A `np.ndarray` with cell probability of belonging to a specific group `adata.uns['sbm']['state']` The BlockModel state object """ raise DeprecationWarning("""This function has been deprecated since version 0.5.0, please consider usage of planted_model instead. """) if fast_model or resume: # if the fast_model is chosen perform equilibration anyway equilibrate=True if resume and ('sbm' not in adata.uns or 'state' not in adata.uns['sbm']): # let the model proceed as default logg.warning('Resuming has been specified but a state was not found\n' 'Will continue with default minimization step') resume=False fast_model=False if random_seed: np.random.seed(random_seed) gt.seed_rng(random_seed) if collect_marginals: logg.warning('Collecting marginals has a large impact on running time') if not equilibrate: raise ValueError( "You can't collect marginals without MCMC equilibrate " "step. Either set `equlibrate` to `True` or " "`collect_marginals` to `False`" ) start = logg.info('minimizing the Stochastic Block Model') adata = adata.copy() if copy else adata # are we clustering a user-provided graph or the default AnnData one? if adjacency is None: if neighbors_key not in adata.uns: raise ValueError( 'You need to run `pp.neighbors` first ' 'to compute a neighborhood graph.' ) elif 'connectivities_key' in adata.uns[neighbors_key]: # scanpy>1.4.6 has matrix in another slot conn_key = adata.uns[neighbors_key]['connectivities_key'] adjacency = adata.obsp[conn_key] else: # scanpy<=1.4.6 has sparse matrix here adjacency = adata.uns[neighbors_key]['connectivities'] if restrict_to is not None: restrict_key, restrict_categories = restrict_to adjacency, restrict_indices = restrict_adjacency( adata, restrict_key, restrict_categories, adjacency, ) # convert it to igraph g = get_graph_tool_from_adjacency(adjacency, directed=directed) recs=[] rec_types=[] if use_weights: # this is not ideal to me, possibly we may need to transform # weights. More tests needed. 
recs=[g.ep.weight] rec_types=['real-normal'] if fast_model: # do not minimize, start with a dummy state and perform only equilibrate state = gt.BlockState(g=g, B=1, sampling=True, state_args=dict(deg_corr=deg_corr, recs=recs, rec_types=rec_types )) elif resume: # create the state and make sure sampling is performed state = adata.uns['sbm']['state'].copy(sampling=True) g = state.g else: if n_init < 1: n_init = 1 states = [gt.minimize_nested_blockmodel_dl(g, deg_corr=deg_corr, state_args=dict(recs=recs, rec_types=rec_types), **minimize_args) for n in range(n_init)] state = states[np.argmin([s.entropy() for s in states])] logg.info(' done', time=start) state = state.copy(B=g.num_vertices()) # equilibrate the Markov chain if equilibrate: logg.info('running MCMC equilibration step') equilibrate_args['wait'] = wait equilibrate_args['nbreaks'] = nbreaks equilibrate_args['max_niter'] = max_iterations equilibrate_args['multiflip'] = multiflip equilibrate_args['mcmc_args'] = {'niter':10} dS, nattempts, nmoves = gt.mcmc_anneal(state, mcmc_equilibrate_args=equilibrate_args, niter=steps_anneal, beta_range=beta_range) if collect_marginals and equilibrate: # we here only retain level_0 counts, until I can't figure out # how to propagate correctly counts to higher levels # I wonder if this should be placed after group definition or not logg.info(' collecting marginals') group_marginals = np.zeros(g.num_vertices() + 1) def _collect_marginals(s): group_marginals[s.get_nonempty_B()] += 1 gt.mcmc_equilibrate(state, wait=wait, nbreaks=nbreaks, epsilon=epsilon, max_niter=max_iterations, multiflip=False, force_niter=niter_collect, mcmc_args=dict(niter=10), callback=_collect_marginals) logg.info(' done', time=start) # everything is in place, we need to fill all slots # first build an array with groups = pd.Series(state.get_blocks().get_array()).astype('category') new_cat_names = dict([(cx, u'%s' % cn) for cn, cx in enumerate(groups.cat.categories)]) groups.cat.rename_categories(new_cat_names, inplace=True) if restrict_to is not None: groups.index = adata.obs[restrict_key].index else: groups.index = adata.obs_names # add column names adata.obs.loc[:, key_added] = groups # add some unstructured info adata.uns['sbm'] = {} adata.uns['sbm']['stats'] = dict( dS=dS, nattempts=nattempts, nmoves=nmoves, modularity=gt.modularity(g, state.get_blocks()) ) adata.uns['sbm']['state'] = state # now add marginal probabilities. if collect_marginals: # cell marginals will be a list of arrays with probabilities # of belonging to a specific group adata.uns['sbm']['group_marginals'] = group_marginals # calculate log-likelihood of cell moves over the remaining levels adata.uns['sbm']['cell_affinity'] = {'1':get_cell_loglikelihood(state, as_prob=True)} # last step is recording some parameters used in this analysis adata.uns['sbm']['params'] = dict( epsilon=epsilon, wait=wait, nbreaks=nbreaks, equilibrate=equilibrate, fast_model=fast_model, collect_marginals=collect_marginals, random_seed=random_seed ) logg.info( ' finished', time=start, deep=( f'found {state.get_nonempty_B()} clusters and added\n' f' {key_added!r}, the cluster labels (adata.obs, categorical)' ), ) return adata if copy else None
def test_slicing_remove_unused_categories():
    adata = AnnData(
        np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), dict(k=["a", "a", "b", "b"])
    )
    adata._sanitize()
    assert adata[2:4].obs["k"].cat.categories.tolist() == ["b"]
def filter_genes( data: AnnData, min_counts: Optional[int] = None, min_cells: Optional[int] = None, max_counts: Optional[int] = None, max_cells: Optional[int] = None, inplace: bool = True, copy: bool = False, ): """Filter genes based on number of cells or counts. Keep genes that have at least ``min_counts`` counts or are expressed in at least ``min_cells`` cells or have at most ``max_counts`` counts or are expressed in at most ``max_cells`` cells. Only provide one of the optional parameters ``min_counts``, ``min_cells``, ``max_counts``, ``max_cells`` per call. Parameters ---------- data An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. min_counts Minimum number of counts required for a gene to pass filtering. min_cells Minimum number of cells expressed required for a gene to pass filtering. max_counts Maximum number of counts required for a gene to pass filtering. max_cells Maximum number of cells expressed required for a gene to pass filtering. inplace Perform computation inplace or return result. Returns ------- `tuple`, `None` Depending on `inplace`, returns the following arrays or directly subsets and annotates the data matrix gene_subset : :class:`~numpy.ndarray` Boolean index mask that does filtering. `True` means that the gene is kept. `False` means the gene is removed. number_per_gene : :class:`~numpy.ndarray` Depending on what was tresholded (`counts` or `cells`), the array stores `n_counts` or `n_cells` per gene. """ if copy: logg.warn('`copy` is deprecated, use `inplace` instead.') n_given_options = sum( option is not None for option in [min_cells, min_counts, max_cells, max_counts]) if n_given_options != 1: raise ValueError( 'Only provide one of the optional parameters `min_counts`,' '`min_cells`, `max_counts`, `max_cells` per call.') if isinstance(data, AnnData): adata = data.copy() if copy else data gene_subset, number = materialize_as_ndarray( filter_genes(adata.X, min_cells=min_cells, min_counts=min_counts, max_cells=max_cells, max_counts=max_counts)) if not inplace: return gene_subset, number if min_cells is None and max_cells is None: adata.var['n_counts'] = number else: adata.var['n_cells'] = number adata._inplace_subset_var(gene_subset) return adata if copy else None X = data # proceed with processing the data matrix min_number = min_counts if min_cells is None else min_cells max_number = max_counts if max_cells is None else max_cells number_per_gene = np.sum(X if min_cells is None and max_cells is None else X > 0, axis=0) if issparse(X): number_per_gene = number_per_gene.A1 if min_number is not None: gene_subset = number_per_gene >= min_number if max_number is not None: gene_subset = number_per_gene <= max_number s = np.sum(~gene_subset) if s > 0: logg.info('filtered out {} genes that are detected'.format(s), end=' ') if min_cells is not None or min_counts is not None: logg.info('in less than', str(min_cells) + ' cells' if min_counts is None else str(min_counts) + ' counts', no_indent=True) if max_cells is not None or max_counts is not None: logg.info('in more than ', str(max_cells) + ' cells' if max_counts is None else str(max_counts) + ' counts', no_indent=True) return gene_subset, number_per_gene
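# Usage sketch for `filter_genes`, mirroring `filter_cells` above (toy counts).
def _example_filter_genes():
    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.random.poisson(1.0, size=(50, 30)).astype(np.float32))

    # keep genes detected in at least 3 cells; with inplace=True (the default)
    # the object is subset directly and adata.var['n_cells'] is added
    filter_genes(adata, min_cells=3)

    # or get the mask and per-gene counts without modifying the object
    gene_subset, n_cells_per_gene = filter_genes(adata, min_cells=3, inplace=False)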
def test_multicol():
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]))
    # 'c' keeps the columns as should be
    adata.obsm["c"] = np.array([[0.0, 1.0], [2, 3]])
    assert adata.obsm_keys() == ["c"]
    assert adata.obsm["c"].tolist() == [[0.0, 1.0], [2, 3]]
def pca( data: Union[AnnData, np.ndarray, spmatrix], n_comps: int = N_PCS, zero_center: Optional[bool] = True, svd_solver: str = 'auto', random_state: int = 0, return_info: bool = False, use_highly_variable: Optional[bool] = None, dtype: str = 'float32', copy: bool = False, chunked: bool = False, chunk_size: Optional[int] = None, ) -> Union[AnnData, np.ndarray, spmatrix]: """Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. Parameters ---------- data The (annotated) data matrix of shape ``n_obs`` × ``n_vars``. Rows correspond to cells and columns to genes. n_comps Number of principal components to compute. zero_center If `True`, compute standard PCA from covariance matrix. If ``False``, omit zero-centering variables (uses :class:`~sklearn.decomposition.TruncatedSVD`), which allows to handle sparse input efficiently. Passing ``None`` decides automatically based on sparseness of the data. svd_solver SVD solver to use: ``'arpack'`` for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) ``'randomized'`` for the randomized algorithm due to Halko (2009). ``'auto'`` (the default) chooses automatically depending on the size of the problem. random_state Change to use different initial states for the optimization. return_info Only relevant when not passing an :class:`~anndata.AnnData`: see “**Returns**”. use_highly_variable Whether to use highly variable genes only, stored in ``.var['highly_variable']``. By default uses them if they have been determined beforehand. dtype Numpy data type string to which to convert the result. copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked If ``True``, perform an incremental PCA on segments of ``chunk_size``. The incremental PCA automatically zero centers and ignores settings of ``random_seed`` and ``svd_solver``. If ``False``, perform a full PCA. chunk_size Number of observations to include in each chunk. Required if ``chunked=True`` was passed. Returns ------- X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray` If `data` is array-like and ``return_info=False`` was passed, this function only returns `X_pca`… adata : :class:`~anndata.AnnData` …otherwise if ``copy=True`` it returns or else adds fields to ``adata``: ``.obsm['X_pca']`` PCA representation of data. ``.varm['PCs']`` The principal components containing the loadings. ``.uns['pca']['variance_ratio']``) Ratio of explained variance. ``.uns['pca']['variance']`` Explained variance, equivalent to the eigenvalues of the covariance matrix. """ # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.` This will likely ' 'become the Scanpy default in the future.') data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: adata = data.copy() if copy else data else: adata = AnnData(data) logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4) if adata.n_vars < n_comps: n_comps = adata.n_vars - 1 logg.msg('reducing number of computed PCs to', n_comps, 'as dim of data is only', adata.n_vars, v=4) if use_highly_variable is True and 'highly_variable' not in adata.var.keys(): raise ValueError('Did not find adata.var[\'highly_variable\']. 
' 'Either your data already only consists of highly-variable genes ' 'or consider running `pp.filter_genes_dispersion` first.') if use_highly_variable is None: use_highly_variable = True if 'highly_variable' in adata.var.keys() else False adata_comp = adata[:, adata.var['highly_variable']] if use_highly_variable else adata if chunked: if not zero_center or random_state or svd_solver != 'auto': logg.msg('Ignoring zero_center, random_state, svd_solver', v=4) from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) else: if zero_center is None: zero_center = not issparse(adata_comp.X) if zero_center: from sklearn.decomposition import PCA if issparse(adata_comp.X): logg.msg(' as `zero_center=True`, ' 'sparse input is densified and may ' 'lead to huge memory consumption', v=4) X = adata_comp.X.toarray() # Copying the whole adata_comp.X here, could cause memory problems else: X = adata_comp.X pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) else: from sklearn.decomposition import TruncatedSVD logg.msg(' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely', v=4) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state) X = adata_comp.X X_pca = pca_.fit_transform(X) if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca if use_highly_variable: adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T else: adata.varm['PCs'] = pca_.components_.T adata.uns['pca'] = {} adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.msg(' finished', t=True, end=' ', v=4) logg.msg('and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4) return adata if copy else None else: if return_info: return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_ else: return X_pca
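# Hedged usage sketch for pca above, using the standard scanpy call signature.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.pca(adata, n_comps=30, svd_solver='arpack')   # exact, reproducible solver
print(adata.obsm['X_pca'].shape)                    # (n_obs, 30) PCA coordinates
print(adata.varm['PCs'].shape)                      # (n_vars, 30) loadings
print(adata.uns['pca']['variance_ratio'][:5])       # explained variance ratios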
def test_n_obs():
    adata = AnnData(np.array([[1, 2], [3, 4], [5, 6]]))
    assert adata.n_obs == 3
    adata1 = adata[:2]
    assert adata1.n_obs == 2
def leiden( adata: AnnData, resolution: float = 1, *, restrict_to: Optional[Tuple[str, Sequence[str]]] = None, random_state: Optional[Union[int, RandomState]] = 0, key_added: str = 'leiden', adjacency: Optional[sparse.spmatrix] = None, directed: bool = True, use_weights: bool = True, n_iterations: int = -1, partition_type: Optional[Type[MutableVertexPartition]] = None, copy: bool = False, **partition_kwargs, ) -> Optional[AnnData]: """\ Cluster cells into subgroups [Traag18]_. Cluster cells using the Leiden algorithm [Traag18]_, an improved version of the Louvain algorithm [Blondel08]_. It has been proposed for single-cell analysis by [Levine15]_. This requires having ran :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first. Parameters ---------- adata The annotated data matrix. resolution A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters. Set to `None` if overriding `partition_type` to one that doesn’t accept a `resolution_parameter`. random_state Change the initialization of the optimization. restrict_to Restrict the clustering to the categories within the key for sample annotation, tuple needs to contain `(obs_key, list_of_categories)`. key_added `adata.obs` key under which to add the cluster labels. adjacency Sparse adjacency matrix of the graph, defaults to `adata.uns['neighbors']['connectivities']`. directed Whether to treat the graph as directed or undirected. use_weights If `True`, edge weights from the graph are used in the computation (placing more emphasis on stronger edges). n_iterations How many iterations of the Leiden clustering algorithm to perform. Positive values above 2 define the total number of iterations to perform, -1 has the algorithm run until it reaches its optimal clustering. partition_type Type of partition to use. Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. For the available options, consult the documentation for :func:`~leidenalg.find_partition`. copy Whether to copy `adata` or modify it inplace. **partition_kwargs Any further arguments to pass to `~leidenalg.find_partition` (which in turn passes arguments to the `partition_type`). Returns ------- `adata.obs[key_added]` Array of dim (number of samples) that stores the subgroup id (`'0'`, `'1'`, ...) for each cell. `adata.uns['leiden']['params']` A dict with the values for the parameters `resolution`, `random_state`, and `n_iterations`. """ try: import leidenalg except ImportError: raise ImportError( 'Please install the leiden algorithm: `conda install -c conda-forge leidenalg` or `pip3 install leidenalg`.' ) partition_kwargs = dict(partition_kwargs) start = logg.info('running Leiden clustering') adata = adata.copy() if copy else adata # are we clustering a user-provided graph or the default AnnData one? if adjacency is None: if 'neighbors' not in adata.uns: raise ValueError( 'You need to run `pp.neighbors` first ' 'to compute a neighborhood graph.' ) adjacency = adata.uns['neighbors']['connectivities'] if restrict_to is not None: restrict_key, restrict_categories = restrict_to adjacency, restrict_indices = restrict_adjacency( adata, restrict_key, restrict_categories, adjacency, ) # convert it to igraph g = _utils.get_igraph_from_adjacency(adjacency, directed=directed) # flip to the default partition type if not overriden by the user if partition_type is None: partition_type = leidenalg.RBConfigurationVertexPartition # Prepare find_partition arguments as a dictionary, # appending to whatever the user provided. 
It needs to be this way # as this allows for the accounting of a None resolution # (in the case of a partition variant that doesn't take it on input) if use_weights: partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64) partition_kwargs['n_iterations'] = n_iterations partition_kwargs['seed'] = random_state if resolution is not None: partition_kwargs['resolution_parameter'] = resolution # clustering proper part = leidenalg.find_partition(g, partition_type, **partition_kwargs) # store output into adata.obs groups = np.array(part.membership) if restrict_to is not None: if key_added == 'leiden': key_added += '_R' groups = rename_groups( adata, key_added, restrict_key, restrict_categories, restrict_indices, groups, ) adata.obs[key_added] = pd.Categorical( values=groups.astype('U'), categories=natsorted(np.unique(groups).astype('U')), ) # store information on the clustering parameters adata.uns['leiden'] = {} adata.uns['leiden']['params'] = dict( resolution=resolution, random_state=random_state, n_iterations=n_iterations, ) logg.info( ' finished', time=start, deep=( f'found {len(np.unique(groups))} clusters and added\n' f' {key_added!r}, the cluster labels (adata.obs, categorical)' ), ) return adata if copy else None
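# Hedged usage sketch for leiden above; it requires the leidenalg package and a
# precomputed neighbors graph.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.leiden(adata, resolution=1.0, key_added='leiden')
print(adata.obs['leiden'].value_counts())
print(adata.uns['leiden']['params'])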
def test_extract_data_raw_None(self, adata: AnnData):
    adata = AnnData(adata.X, raw=None)
    with pytest.raises(ValueError):
        _extract_data(adata, use_raw=True)
def rank_genes_groups(adata: AnnData, groupby: str, use_raw: bool = True, groups: Union[str, Iterable[str]] = 'all', reference: str = 'rest', n_genes: int = 100, rankby_abs: bool = False, key_added: Optional[str] = None, copy: bool = False, method: str = 't-test_overestim_var', corr_method: str = 'benjamini-hochberg', **kwds): """Rank genes for characterizing groups. Parameters ---------- adata Annotated data matrix. groupby The key of the observations grouping to consider. use_raw : `bool`, optional (default: `True`) Use `raw` attribute of `adata` if present. groups Subset of groups, e.g. `['g1', 'g2', 'g3']`, to which comparison shall be restricted, or `'all'` (default), for all groups. reference If `'rest'`, compare each group to the union of the rest of the group. If a group identifier, compare with respect to this group. n_genes The number of genes that appear in the returned tables. method : `{'logreg', 't-test', 'wilcoxon', 't-test_overestim_var'}`, optional (default: 't-test_overestim_var') If 't-test', uses t-test, if 'wilcoxon', uses Wilcoxon-Rank-Sum. If 't-test_overestim_var', overestimates variance of each group. If 'logreg' uses logistic regression, see [Ntranos18]_, `here <https://github.com/theislab/scanpy/issues/95>`__ and `here <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__, for why this is meaningful. corr_method : `{'benjamini-hochberg', 'bonferroni'}`, optional (default: 'benjamini-hochberg') p-value correction method. Used only for 't-test', 't-test_overestim_var', and 'wilcoxon' methods. rankby_abs Rank genes by the absolute value of the score, not by the score. The returned scores are never the absolute values. key_added The key in `adata.uns` information is saved to. **kwds : keyword parameters Are passed to test methods. Currently this affects only parameters that are passed to `sklearn.linear_model.LogisticRegression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__. For instance, you can pass `penalty='l1'` to try to come up with a minimal set of genes that are good predictors (sparse solution meaning few non-zero fitted coefficients). Returns ------- **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the gene names. Ordered according to scores. **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the z-score underlying the computation of a p-value for each gene for each group. Ordered according to scores. **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the log2 fold change for each gene for each group. Ordered according to scores. Only provided if method is 't-test' like. Note: this is an approximation calculated from mean-log values. **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`) p-values. **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Corrected p-values. Notes ----- There are slight inconsistencies depending on whether sparse or dense data are passed. See `here <https://github.com/theislab/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__. 
Examples -------- >>> adata = sc.datasets.pbmc68k_reduced() >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon') # to visualize the results >>> sc.pl.rank_genes_groups(adata) """ if 'only_positive' in kwds: rankby_abs = not kwds.pop('only_positive') # backwards compat start = logg.info('ranking genes') avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'} if method not in avail_methods: raise ValueError('Method must be one of {}.'.format(avail_methods)) avail_corr = {'benjamini-hochberg', 'bonferroni'} if corr_method not in avail_corr: raise ValueError( 'Correction method must be one of {}.'.format(avail_corr)) adata = adata.copy() if copy else adata utils.sanitize_anndata(adata) # for clarity, rename variable if groups == 'all': groups_order = 'all' elif isinstance(groups, (str, int)): raise ValueError('Specify a sequence of groups') else: groups_order = list(groups) if isinstance(groups_order[0], int): groups_order = [str(n) for n in groups_order] if reference != 'rest' and reference not in set(groups_order): groups_order += [reference] if (reference != 'rest' and reference not in set(adata.obs[groupby].cat.categories)): cats = adata.obs[groupby].cat.categories.tolist() raise ValueError( f'reference = {reference} needs to be one of groupby = {cats}.') groups_order, groups_masks = utils.select_groups(adata, groups_order, groupby) if key_added is None: key_added = 'rank_genes_groups' adata.uns[key_added] = {} adata.uns[key_added]['params'] = { 'groupby': groupby, 'reference': reference, 'method': method, 'use_raw': use_raw, 'corr_method': corr_method, } # adata_comp mocks an AnnData object if use_raw is True # otherwise it's just the AnnData object adata_comp = adata if adata.raw is not None and use_raw: adata_comp = adata.raw X = adata_comp.X # for clarity, rename variable n_genes_user = n_genes # make sure indices are not OoB in case there are less genes than n_genes if n_genes_user > X.shape[1]: n_genes_user = X.shape[1] # in the following, n_genes is simply another name for the total number of genes n_genes = X.shape[1] n_groups = groups_masks.shape[0] ns = np.zeros(n_groups, dtype=int) for imask, mask in enumerate(groups_masks): ns[imask] = np.where(mask)[0].size logg.debug(f'consider {groupby!r} groups:') logg.debug(f'with sizes: {ns}') if reference != 'rest': ireference = np.where(groups_order == reference)[0][0] reference_indices = np.arange(adata_comp.n_vars, dtype=int) rankings_gene_scores = [] rankings_gene_names = [] rankings_gene_logfoldchanges = [] rankings_gene_pvals = [] rankings_gene_pvals_adj = [] if method in {'t-test', 't-test_overestim_var'}: from scipy import stats from statsmodels.stats.multitest import multipletests # loop over all masks and compute means, variances and sample numbers means = np.zeros((n_groups, n_genes)) vars = np.zeros((n_groups, n_genes)) for imask, mask in enumerate(groups_masks): means[imask], vars[imask] = _get_mean_var(X[mask]) # test each either against the union of all other groups or against a # specific group for igroup in range(n_groups): if reference == 'rest': mask_rest = ~groups_masks[igroup] else: if igroup == ireference: continue else: mask_rest = groups_masks[ireference] mean_group, var_group = means[igroup], vars[igroup] mean_rest, var_rest = _get_mean_var(X[mask_rest]) ns_group = ns[igroup] # number of observations in group if method == 't-test': ns_rest = np.where(mask_rest)[0].size elif method == 't-test_overestim_var': ns_rest = ns[ igroup] # hack for overestimating the variance for small 
groups else: raise ValueError('Method does not exist.') # TODO: Come up with better solution. Mask unexpressed genes? # See https://github.com/scipy/scipy/issues/10269 with np.errstate(invalid="ignore"): scores, pvals = stats.ttest_ind_from_stats( mean1=mean_group, std1=np.sqrt(var_group), nobs1=ns_group, mean2=mean_rest, std2=np.sqrt(var_rest), nobs2=ns_rest, equal_var=False # Welch's ) # Fold change foldchanges = (np.expm1(mean_group) + 1e-9) / ( np.expm1(mean_rest) + 1e-9) # add small value to remove 0's scores[np.isnan( scores )] = 0 # I think it's only nan when means are the same and vars are 0 pvals[np.isnan( pvals)] = 1 # This also has to happen for Benjamini Hochberg if corr_method == 'benjamini-hochberg': _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_names.append(adata_comp.var_names[global_indices]) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) elif method == 'logreg': # if reference is not set, then the groups listed will be compared to the rest # if reference is set, then the groups listed will be compared only to the other groups listed from sklearn.linear_model import LogisticRegression reference = groups_order[0] if len(groups) == 1: raise Exception( 'Cannot perform logistic regression on a single cluster.') adata_copy = adata[adata.obs[groupby].isin(groups_order)] adata_comp = adata_copy if adata.raw is not None and use_raw: adata_comp = adata_copy.raw X = adata_comp.X clf = LogisticRegression(**kwds) clf.fit(X, adata_copy.obs[groupby].cat.codes) scores_all = clf.coef_ for igroup, group in enumerate(groups_order): if len(groups_order) <= 2: # binary logistic regression scores = scores_all[0] else: scores = scores_all[igroup] partition = np.argpartition(scores, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_names.append(adata_comp.var_names[global_indices]) if len(groups_order) <= 2: break elif method == 'wilcoxon': from scipy import stats from statsmodels.stats.multitest import multipletests CONST_MAX_SIZE = 10000000 means = np.zeros((n_groups, n_genes)) vars = np.zeros((n_groups, n_genes)) # initialize space for z-scores scores = np.zeros(n_genes) # First loop: Loop over all genes if reference != 'rest': for imask, mask in enumerate(groups_masks): means[imask], vars[imask] = _get_mean_var( X[mask]) # for fold-change only if imask == ireference: continue else: mask_rest = groups_masks[ireference] ns_rest = np.where(mask_rest)[0].size mean_rest, var_rest = _get_mean_var( X[mask_rest]) # for fold-change only if ns_rest <= 25 or ns[imask] <= 25: logg.hint( 'Few observations in a group for ' 'normal approximation (<=25). 
Lower test accuracy.') n_active = ns[imask] m_active = ns_rest # Now calculate gene expression ranking in chunkes: chunk = [] # Calculate chunk frames n_genes_max_chunk = floor(CONST_MAX_SIZE / (n_active + m_active)) if n_genes_max_chunk < n_genes: chunk_index = n_genes_max_chunk while chunk_index < n_genes: chunk.append(chunk_index) chunk_index = chunk_index + n_genes_max_chunk chunk.append(n_genes) else: chunk.append(n_genes) left = 0 # Calculate rank sums for each chunk for the current mask for chunk_index, right in enumerate(chunk): # Check if issparse is true: AnnData objects are currently sparse.csr or ndarray. if issparse(X): df1 = pd.DataFrame(data=X[mask, left:right].todense()) df2 = pd.DataFrame( data=X[mask_rest, left:right].todense(), index=np.arange(start=n_active, stop=n_active + m_active)) else: df1 = pd.DataFrame(data=X[mask, left:right]) df2 = pd.DataFrame(data=X[mask_rest, left:right], index=np.arange(start=n_active, stop=n_active + m_active)) df1 = df1.append(df2) ranks = df1.rank() # sum up adjusted_ranks to calculate W_m,n scores[left:right] = np.sum(ranks.loc[0:n_active, :]) left = right scores = (scores - (n_active * (n_active + m_active + 1) / 2)) / sqrt( (n_active * m_active * (n_active + m_active + 1) / 12)) scores[np.isnan(scores)] = 0 pvals = 2 * stats.distributions.norm.sf(np.abs(scores)) if corr_method == 'benjamini-hochberg': pvals[np.isnan( pvals )] = 1 # set Nan values to 1 to properly convert using Benhjamini Hochberg _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) # Fold change foldchanges = (np.expm1(means[imask]) + 1e-9) / ( np.expm1(mean_rest) + 1e-9 ) # add small value to remove 0's scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_names.append( adata_comp.var_names[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) # If no reference group exists, ranking needs only to be done once (full mask) else: scores = np.zeros((n_groups, n_genes)) chunk = [] n_cells = X.shape[0] n_genes_max_chunk = floor(CONST_MAX_SIZE / n_cells) if n_genes_max_chunk < n_genes: chunk_index = n_genes_max_chunk while chunk_index < n_genes: chunk.append(chunk_index) chunk_index = chunk_index + n_genes_max_chunk chunk.append(n_genes) else: chunk.append(n_genes) left = 0 for chunk_index, right in enumerate(chunk): # Check if issparse is true if issparse(X): df1 = pd.DataFrame(data=X[:, left:right].todense()) else: df1 = pd.DataFrame(data=X[:, left:right]) ranks = df1.rank() # sum up adjusted_ranks to calculate W_m,n for imask, mask in enumerate(groups_masks): scores[imask, left:right] = np.sum(ranks.loc[mask, :]) left = right for imask, mask in enumerate(groups_masks): mask_rest = ~groups_masks[imask] means[imask], vars[imask] = _get_mean_var( X[mask]) #for fold-change mean_rest, var_rest = _get_mean_var( X[mask_rest]) # for fold-change scores[imask, :] = (scores[imask, :] - (ns[imask] * (n_cells + 1) / 2)) / sqrt( (ns[imask] * (n_cells - ns[imask]) * (n_cells + 1) / 12)) scores[np.isnan(scores)] = 0 pvals = 2 * stats.distributions.norm.sf( np.abs(scores[imask, :])) if 
corr_method == 'benjamini-hochberg': pvals[np.isnan( pvals )] = 1 # set Nan values to 1 to properly convert using Benhjamini Hochberg _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) # Fold change foldchanges = (np.expm1(means[imask]) + 1e-9) / ( np.expm1(mean_rest) + 1e-9 ) # add small value to remove 0's scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort[imask, :], -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[imask, partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[imask, global_indices]) rankings_gene_names.append( adata_comp.var_names[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) groups_order_save = [str(g) for g in groups_order] if (reference != 'rest' and method != 'logreg') or (method == 'logreg' and len(groups) == 2): groups_order_save = [g for g in groups_order if g != reference] adata.uns[key_added]['scores'] = np.rec.fromarrays( [n for n in rankings_gene_scores], dtype=[(rn, 'float32') for rn in groups_order_save]) adata.uns[key_added]['names'] = np.rec.fromarrays( [n for n in rankings_gene_names], dtype=[(rn, 'U50') for rn in groups_order_save]) if method in {'t-test', 't-test_overestim_var', 'wilcoxon'}: adata.uns[key_added]['logfoldchanges'] = np.rec.fromarrays( [n for n in rankings_gene_logfoldchanges], dtype=[(rn, 'float32') for rn in groups_order_save]) adata.uns[key_added]['pvals'] = np.rec.fromarrays( [n for n in rankings_gene_pvals], dtype=[(rn, 'float64') for rn in groups_order_save]) adata.uns[key_added]['pvals_adj'] = np.rec.fromarrays( [n for n in rankings_gene_pvals_adj], dtype=[(rn, 'float64') for rn in groups_order_save]) logg.info( ' finished', time=start, deep= (f'added to `.uns[{key_added!r}]`\n' " 'names', sorted np.recarray to be indexed by group ids\n" " 'scores', sorted np.recarray to be indexed by group ids\n" + (" 'logfoldchanges', sorted np.recarray to be indexed by group ids\n" " 'pvals', sorted np.recarray to be indexed by group ids\n" " 'pvals_adj', sorted np.recarray to be indexed by group ids" if method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')), ) return adata if copy else None
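# Hedged usage sketch for rank_genes_groups above; mirrors the docstring example
# and shows how the structured result arrays can be viewed as DataFrames.
import pandas as pd
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon', n_genes=50)
res = adata.uns['rank_genes_groups']
top_names = pd.DataFrame(res['names']).head(10)      # top 10 gene names per group
top_padj = pd.DataFrame(res['pvals_adj']).head(10)   # matching adjusted p-values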
def gen_adata( shape: Tuple[int, int], X_type=sparse.csr_matrix, X_dtype=np.float32, # obs_dtypes, # var_dtypes, obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. Note: For `obsm_types`, `varm_types`, and `layers_types` these currently just filter already created objects. In future, these should choose which objects are created. Params ------ shape What shape you want the anndata to be. X_type What kind of container should `X` be? This will be called on a randomly generated 2d array. X_dtype What should the dtype of the `.X` container be? obsm_types What kinds of containers should be in `.obsm`? varm_types What kinds of containers should be in `.varm`? layers_types What kinds of containers should be in `.layers`? """ M, N = shape obs_names = pd.Index(f"cell{i}" for i in range(shape[0])) var_names = pd.Index(f"gene{i}" for i in range(shape[1])) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) # For #147 obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) if X_type is None: X = None else: X = X_type(np.random.binomial(100, 0.005, (M, N)).astype(X_dtype)) obsm = dict( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} layers = dict(array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr")) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict(array=np.random.random((M, M)), sparse=sparse.random(M, M, format="csr")) varp = dict(array=np.random.random((N, N)), sparse=sparse.random(N, N, format="csr")) uns = dict( O_recarray=gen_vstr_recarray(N, 5), nested=dict( scalar_str="str", scalar_int=42, scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) adata = AnnData( X=X, obs=obs, var=var, obsm=obsm, varm=varm, layers=layers, obsp=obsp, varp=varp, dtype=X_dtype, uns=uns, ) return adata
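# Hedged usage sketch for the gen_adata helper above. It assumes gen_typed_df and
# gen_vstr_recarray (not shown here) are importable alongside it, and that
# gen_typed_df produces the 'cat' column implied by the rename above.
from scipy import sparse
import numpy as np

adata = gen_adata((10, 20), X_type=sparse.csr_matrix, X_dtype=np.float32)
assert adata.shape == (10, 20)
assert 'obs_cat' in adata.obs.columns and 'var_cat' in adata.var.columns
# with the default *_types, all three container kinds are kept
assert set(adata.obsm.keys()) == {'array', 'sparse', 'df'}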
def marker_gene_overlap( adata: AnnData, reference_markers: Union[Dict[str, set], Dict[str, list]], *, key: str = 'rank_genes_groups', method: _Method = 'overlap_count', normalize: Optional[Literal['reference', 'data']] = None, top_n_markers: Optional[int] = None, adj_pval_threshold: Optional[float] = None, key_added: str = 'marker_gene_overlap', inplace: bool = False, ): """\ Calculate an overlap score between data-deriven marker genes and provided markers Marker gene overlap scores can be quoted as overlap counts, overlap coefficients, or jaccard indices. The method returns a pandas dataframe which can be used to annotate clusters based on marker gene overlaps. This function was written by Malte Luecken. Parameters ---------- adata The annotated data matrix. reference_markers A marker gene dictionary object. Keys should be strings with the cell identity name and values are sets or lists of strings which match format of `adata.var_name`. key The key in `adata.uns` where the rank_genes_groups output is stored. By default this is `'rank_genes_groups'`. method (default: `overlap_count`) Method to calculate marker gene overlap. `'overlap_count'` uses the intersection of the gene set, `'overlap_coef'` uses the overlap coefficient, and `'jaccard'` uses the Jaccard index. normalize Normalization option for the marker gene overlap output. This parameter can only be set when `method` is set to `'overlap_count'`. `'reference'` normalizes the data by the total number of marker genes given in the reference annotation per group. `'data'` normalizes the data by the total number of marker genes used for each cluster. top_n_markers The number of top data-derived marker genes to use. By default the top 100 marker genes are used. If `adj_pval_threshold` is set along with `top_n_markers`, then `adj_pval_threshold` is ignored. adj_pval_threshold A significance threshold on the adjusted p-values to select marker genes. This can only be used when adjusted p-values are calculated by `sc.tl.rank_genes_groups()`. If `adj_pval_threshold` is set along with `top_n_markers`, then `adj_pval_threshold` is ignored. key_added Name of the `.uns` field that will contain the marker overlap scores. inplace Return a marker gene dataframe or store it inplace in `adata.uns`. Returns ------- A pandas dataframe with the marker gene overlap scores if `inplace=False`. For `inplace=True` `adata.uns` is updated with an additional field specified by the `key_added` parameter (default = 'marker_gene_overlap'). Examples -------- >>> import scanpy as sc >>> adata = sc.datasets.pbmc68k_reduced() >>> sc.pp.pca(adata, svd_solver='arpack') >>> sc.pp.neighbors(adata) >>> sc.tl.louvain(adata) >>> sc.tl.rank_genes_groups(adata, groupby='louvain') >>> marker_genes = { ... 'CD4 T cells': {'IL7R'}, ... 'CD14+ Monocytes': {'CD14', 'LYZ'}, ... 'B cells': {'MS4A1'}, ... 'CD8 T cells': {'CD8A'}, ... 'NK cells': {'GNLY', 'NKG7'}, ... 'FCGR3A+ Monocytes': {'FCGR3A', 'MS4A7'}, ... 'Dendritic Cells': {'FCER1A', 'CST3'}, ... 'Megakaryocytes': {'PPBP'} ... } >>> marker_matches = sc.tl.marker_gene_overlap(adata, marker_genes) """ # Test user inputs if inplace: raise NotImplementedError( 'Writing Pandas dataframes to h5ad is currently under development.' '\nPlease use `inplace=False`.') if key not in adata.uns: raise ValueError('Could not find marker gene data. 
' 'Please run `sc.tl.rank_genes_groups()` first.') avail_methods = {'overlap_count', 'overlap_coef', 'jaccard', 'enrich'} if method not in avail_methods: raise ValueError(f'Method must be one of {avail_methods}.') if normalize == 'None': normalize = None avail_norm = {'reference', 'data', None} if normalize not in avail_norm: raise ValueError(f'Normalize must be one of {avail_norm}.') if normalize is not None and method != 'overlap_count': raise ValueError('Can only normalize with method=`overlap_count`.') if not all( isinstance(val, cabc.Set) for val in reference_markers.values()): try: reference_markers = { key: set(val) for key, val in reference_markers.items() } except Exception: raise ValueError('Please ensure that `reference_markers` contains ' 'sets or lists of markers as values.') if adj_pval_threshold is not None: if 'pvals_adj' not in adata.uns[key]: raise ValueError('Could not find adjusted p-value data. ' 'Please run `sc.tl.rank_genes_groups()` with a ' 'method that outputs adjusted p-values.') if adj_pval_threshold < 0: logg.warning( '`adj_pval_threshold` was set below 0. Threshold will be set to 0.' ) adj_pval_threshold = 0 elif adj_pval_threshold > 1: logg.warning( '`adj_pval_threshold` was set above 1. Threshold will be set to 1.' ) adj_pval_threshold = 1 if top_n_markers is not None: logg.warning( 'Both `adj_pval_threshold` and `top_n_markers` is set. ' '`adj_pval_threshold` will be ignored.') if top_n_markers is not None and top_n_markers < 1: logg.warning( '`top_n_markers` was set below 1. `top_n_markers` will be set to 1.' ) top_n_markers = 1 # Get data-derived marker genes in a dictionary of sets data_markers = dict() cluster_ids = adata.uns[key]['names'].dtype.names for group in cluster_ids: if top_n_markers is not None: n_genes = min(top_n_markers, adata.uns[key]['names'].shape[0]) data_markers[group] = set(adata.uns[key]['names'][group][:n_genes]) elif adj_pval_threshold is not None: n_genes = (adata.uns[key]['pvals_adj'][group] < adj_pval_threshold).sum() data_markers[group] = set(adata.uns[key]['names'][group][:n_genes]) if n_genes == 0: logg.warning( 'No marker genes passed the significance threshold of ' f'{adj_pval_threshold} for cluster {group!r}.') # Use top 100 markers as default if top_n_markers = None else: data_markers[group] = set(adata.uns[key]['names'][group][:100]) # Find overlaps if method == 'overlap_count': marker_match = _calc_overlap_count(reference_markers, data_markers) if normalize == 'reference': # Ensure rows sum to 1 ref_lengths = np.array([ len(reference_markers[m_group]) for m_group in reference_markers ]) marker_match = marker_match / ref_lengths[:, np.newaxis] marker_match = np.nan_to_num(marker_match) elif normalize == 'data': # Ensure columns sum to 1 data_lengths = np.array( [len(data_markers[dat_group]) for dat_group in data_markers]) marker_match = marker_match / data_lengths marker_match = np.nan_to_num(marker_match) elif method == 'overlap_coef': marker_match = _calc_overlap_coef(reference_markers, data_markers) elif method == 'jaccard': marker_match = _calc_jaccard(reference_markers, data_markers) # Note: # Could add an 'enrich' option here # (fisher's exact test or hypergeometric test), # but that would require knowledge of the size of the space from which # the reference marker gene set was taken. # This is at best approximately known. 
# Create a pandas dataframe with the results marker_groups = list(reference_markers.keys()) clusters = list(cluster_ids) marker_matching_df = pd.DataFrame(marker_match, index=marker_groups, columns=clusters) # Store the results if inplace: adata.uns[key_added] = marker_matching_df logg.hint( f'added\n {key_added!r}, marker overlap scores (adata.uns)') else: return marker_matching_df
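# Minimal sketch of the kind of computation a helper like _calc_jaccard above could
# perform; the real helpers are not shown, so their exact behaviour (reference groups
# as rows, data-derived clusters as columns) is an assumption here.
import numpy as np

def jaccard_matrix(reference_markers, data_markers):
    """Jaccard index between every reference marker set and every data-derived set."""
    rows = []
    for ref_set in reference_markers.values():
        row = [
            len(ref_set & data_set) / len(ref_set | data_set) if ref_set | data_set else 0.0
            for data_set in data_markers.values()
        ]
        rows.append(row)
    return np.array(rows)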
def test_concatenate_dense(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2, X_3=X3), layers={"Xs": X2}, ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), obsm=dict(X_1=X1, X_2=X2), layers=dict(Xs=X3), ) # inner join adata = adata1.concatenate(adata2, adata3) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]] assert adata.X.astype(int).tolist() == X_combined assert adata.layers["Xs"].astype(int).tolist() == X_combined assert adata.obs_keys() == ["anno1", "anno2", "batch"] assert adata.var_keys() == ["annoA-0", "annoA-1", "annoB-2"] assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]] assert adata.obsm_keys() == ["X_1", "X_2"] assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist() # with batch_key and batch_categories adata = adata1.concatenate(adata2, adata3, batch_key="batch1") assert adata.obs_keys() == ["anno1", "anno2", "batch1"] adata = adata1.concatenate(adata2, adata3, batch_categories=["a1", "a2", "a3"]) assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"] assert adata.var_names.tolist() == ["b", "c"] # outer join adata = adata1.concatenate(adata2, adata3, join="outer") from numpy import ma Xma = ma.masked_invalid(adata.X) Xma_ref = ma.masked_invalid( np.array([ [1.0, 2.0, 3.0, np.nan], [4.0, 5.0, 6.0, np.nan], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], ])) assert np.array_equal(Xma.mask, Xma_ref.mask) assert np.allclose(Xma.compressed(), Xma_ref.compressed()) var_ma = ma.masked_invalid(adata.var.values.tolist()) var_ma_ref = ma.masked_invalid( np.array([ [0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0], ])) assert np.array_equal(var_ma.mask, var_ma_ref.mask) assert np.allclose(var_ma.compressed(), var_ma_ref.compressed())
def from_scvi_model( cls, scvi_model: SCVI, adata: Optional[AnnData] = None, restrict_to_batch: Optional[str] = None, ): """ Instantiate a SOLO model from an scvi model. Parameters ---------- scvi_model Pre-trained model of :class:`~scvi.model.SCVI`. The adata object used to initialize this model should have only been setup with count data, and optionally a `batch_key`; i.e., no extra covariates or labels, etc. adata Optional anndata to use that is compatible with scvi_model. restrict_to_batch Batch category in `batch_key` used to setup adata for scvi_model to restrict Solo model to. This allows to train a Solo model on one batch of a scvi_model that was trained on multiple batches. Returns ------- SOLO model """ _validate_scvi_model(scvi_model, restrict_to_batch=restrict_to_batch) orig_adata = scvi_model.adata orig_batch_key = scvi_model.scvi_setup_dict_["categorical_mappings"][ "_scvi_batch" ]["original_key"] if restrict_to_batch is not None: batch_mask = orig_adata.obs[orig_batch_key] == restrict_to_batch if np.sum(batch_mask) == 0: raise ValueError( "Batch category given to restrict_to_batch not found.\n" + "Available categories: {}".format( orig_adata.obs[orig_batch_key].astype("category").cat.categories ) ) # indices in adata with restrict_to_batch category batch_indices = np.where(batch_mask)[0] else: # use all indices batch_indices = None # anndata with only generated doublets doublet_adata = cls.create_doublets(orig_adata, indices=batch_indices) # if scvi wasn't trained with batch correction having the # zeros here does nothing. doublet_adata.obs[orig_batch_key] = ( restrict_to_batch if restrict_to_batch is not None else 0 ) # if model is using observed lib size, needs to get lib sample # which is just observed lib size on log scale give_mean_lib = not scvi_model.module.use_observed_lib_size # get latent representations and make input anndata latent_rep = scvi_model.get_latent_representation( orig_adata, indices=batch_indices ) lib_size = scvi_model.get_latent_library_size( orig_adata, indices=batch_indices, give_mean=give_mean_lib ) latent_adata = AnnData(np.concatenate([latent_rep, lib_size], axis=1)) latent_adata.obs[LABELS_KEY] = "singlet" orig_obs_names = orig_adata.obs_names latent_adata.obs_names = ( orig_obs_names[batch_indices] if batch_indices is not None else orig_obs_names ) logger.info("Creating doublets, preparing SOLO model.") f = io.StringIO() with redirect_stdout(f): setup_anndata(doublet_adata, batch_key=orig_batch_key) doublet_latent_rep = scvi_model.get_latent_representation(doublet_adata) doublet_lib_size = scvi_model.get_latent_library_size( doublet_adata, give_mean=give_mean_lib ) doublet_adata = AnnData( np.concatenate([doublet_latent_rep, doublet_lib_size], axis=1) ) doublet_adata.obs[LABELS_KEY] = "doublet" full_adata = latent_adata.concatenate(doublet_adata) setup_anndata(full_adata, labels_key=LABELS_KEY) return cls(full_adata)
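# Hedged usage sketch for from_scvi_model above. The code reads
# `scvi_setup_dict_`, which suggests an older scvi-tools API, so the exact setup
# entry point below is an assumption; 'batch' and 'batch_1' are hypothetical keys.
import scvi
from scvi.external import SOLO

scvi.data.setup_anndata(adata, batch_key='batch')   # count data only, optional batch_key
vae = scvi.model.SCVI(adata)
vae.train()
solo = SOLO.from_scvi_model(vae, restrict_to_batch='batch_1')
solo.train()
doublet_calls = solo.predict()                       # per-cell singlet/doublet scores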
def test_concatenate(): # dense data adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']}, {'var_names': ['a', 'b', 'c'], 'annoA': [0, 1, 2]}) adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']}, {'var_names': ['d', 'c', 'b'], 'annoA': [0, 1, 2]}) adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), {'obs_names': ['s1', 's2'], 'anno2': ['d3', 'd4']}, {'var_names': ['d', 'c', 'b'], 'annoB': [0, 1, 2]}) # inner join adata = adata1.concatenate(adata2, adata3) assert adata.X.astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]] assert adata.obs_keys() == ['anno1', 'anno2', 'batch'] assert adata.var_keys() == ['annoA-0', 'annoA-1', 'annoB-2'] assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]] adata = adata1.concatenate(adata2, adata3, batch_key='batch1') assert adata.obs_keys() == ['anno1', 'anno2', 'batch1'] adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3']) assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3'] assert adata.var_names.tolist() == ['b', 'c'] # outer join adata = adata1.concatenate(adata2, adata3, join='outer') from numpy import ma Xma = ma.masked_invalid(adata.X) Xma_ref = ma.masked_invalid(np.array([ [1.0, 2.0, 3.0, np.nan], [4.0, 5.0, 6.0, np.nan], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0], [np.nan, 3.0, 2.0, 1.0], [np.nan, 6.0, 5.0, 4.0]])) assert np.array_equal(Xma.mask, Xma_ref.mask) assert np.allclose(Xma.compressed(), Xma_ref.compressed()) var_ma = ma.masked_invalid(adata.var.values.tolist()) var_ma_ref = ma.masked_invalid(np.array( [[0.0, np.nan, np.nan], [1.0, 2.0, 2.0], [2.0, 1.0, 1.0], [np.nan, 0.0, 0.0]])) assert np.array_equal(var_ma.mask, var_ma_ref.mask) assert np.allclose(var_ma.compressed(), var_ma_ref.compressed()) # sparse data from scipy.sparse import csr_matrix adata1 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]), {'obs_names': ['s1', 's2'], 'anno1': ['c1', 'c2']}, {'var_names': ['a', 'b', 'c']}) adata2 = AnnData(csr_matrix([[0, 2, 3], [0, 5, 6]]), {'obs_names': ['s3', 's4'], 'anno1': ['c3', 'c4']}, {'var_names': ['d', 'c', 'b']}) adata3 = AnnData(csr_matrix([[1, 2, 0], [0, 5, 6]]), {'obs_names': ['s5', 's6'], 'anno2': ['d3', 'd4']}, {'var_names': ['d', 'c', 'b']}) # inner join adata = adata1.concatenate(adata2, adata3) assert adata.X.toarray().astype(int).tolist() == [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]] # outer join adata = adata1.concatenate(adata2, adata3, join='outer') assert adata.X.toarray().tolist() == [ [0.0, 2.0, 3.0, 0.0], [0.0, 5.0, 6.0, 0.0], [0.0, 3.0, 2.0, 0.0], [0.0, 6.0, 5.0, 0.0], [0.0, 0.0, 2.0, 1.0], [0.0, 6.0, 5.0, 0.0]]
def test_concatenate_with_raw(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) X4 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) adata1 = AnnData( X1, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c"], annoA=[0, 1, 2]), layers=dict(Xs=X1), ) adata2 = AnnData( X2, dict(obs_names=["s3", "s4"], anno1=["c3", "c4"]), dict(var_names=["d", "c", "b"], annoA=[0, 1, 2]), layers=dict(Xs=X2), ) adata3 = AnnData( X3, dict(obs_names=["s1", "s2"], anno2=["d3", "d4"]), dict(var_names=["d", "c", "b"], annoB=[0, 1, 2]), layers=dict(Xs=X3), ) adata4 = AnnData( X4, dict(obs_names=["s1", "s2"], anno1=["c1", "c2"]), dict(var_names=["a", "b", "c", "z"], annoA=[0, 1, 2, 3]), layers=dict(Xs=X4), ) adata1.raw = adata1 adata2.raw = adata2 adata3.raw = adata3 adata_all = AnnData.concatenate(adata1, adata2, adata3) assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == {"b", "c"} assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(adata_all.raw.X, adata_all.X) adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcd") assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) adata3.raw = adata4 adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcdz") assert set(adata_all.var_names) == set("abcd") assert not np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) del adata3.raw with pytest.warns( UserWarning, match=("Only some adata objects have `.raw` attribute, " "not concatenating `.raw` attributes."), ): adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None del adata1.raw del adata2.raw assert all(_adata.raw is None for _adata in (adata1, adata2, adata3)) adata_all = AnnData.concatenate(adata1, adata2, adata3) assert adata_all.raw is None
def test_pickle():
    import pickle

    adata = AnnData()
    adata2 = pickle.loads(pickle.dumps(adata))
    assert adata2.obsm.parent is adata2
if expr_type == 'harmony': X = correct_harmony(all_dimreds) if expr_type == 'scanorama': X = correct_scanorama(Xs, genes) if expr_type == 'scvi': nonzero_idx = np.array(X.sum(1) > 0).flatten() X = np.zeros((X.shape[0], 30)) X_scvi = correct_scvi(Xs, genes) X[nonzero_idx, :] = X_scvi X[np.isnan(X)] = 0 X[np.isinf(X)] = 0 C = np.vstack([X[sample_idx].mean(0) for sample_idx in sample_idxs]) adata = AnnData(X=C) adata.obs['study'] = studies for knn in [15, 20, 30, 40]: sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X') draw_graph(adata, layout='fa') sc.pl.draw_graph(adata, color='study', edges=True, edges_color='#CCCCCC', save='_{}_expr_gmean_k{}.png'.format( NAMESPACE + '_' + expr_type, knn)) sys.stdout.flush() adata = AnnData(X=X) adata.obs['study'] = ['_'.join(ct.split('_')[:3]) for ct in cell_types]
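# Hedged sketch of the sample-averaging step above: each row of C is the mean
# embedding of one sample's cells. The names mirror the script (sample_idxs,
# studies) and the random matrix stands in for the corrected embedding.
import numpy as np
from anndata import AnnData

X = np.random.rand(100, 30)
sample_idxs = [np.arange(0, 50), np.arange(50, 100)]
studies = ['study_a', 'study_b']
C = np.vstack([X[idx].mean(0) for idx in sample_idxs])   # shape (n_samples, n_dims)
adata = AnnData(X=C)
adata.obs['study'] = studies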