def rank_genes_groups( adata: AnnData, groupby: str, use_raw: bool = True, groups: Union[str, Iterable[str]] = 'all', reference: str = 'rest', n_genes: int = 100, rankby_abs: bool = False, key_added: Optional[str] = None, copy: bool = False, method: str = 't-test_overestim_var', corr_method: str = 'benjamini-hochberg', layer: Optional[str] = None, **kwds, ): """Rank genes for characterizing groups. Parameters ---------- adata Annotated data matrix. groupby The key of the observations grouping to consider. use_raw Use `raw` attribute of `adata` if present. layer Key from `adata.layers` whose value will be used to perform tests on. groups Subset of groups, e.g. [`'g1'`, `'g2'`, `'g3'`], to which comparison shall be restricted, or `'all'` (default), for all groups. reference If `'rest'`, compare each group to the union of the rest of the group. If a group identifier, compare with respect to this group. n_genes The number of genes that appear in the returned tables. method: {`'logreg'`, `'t-test'`, `'wilcoxon'`, `'t-test_overestim_var'`}` The default 't-test_overestim_var' overestimates variance of each group, `'t-test'` uses t-test, `'wilcoxon'` uses Wilcoxon rank-sum, `'logreg'` uses logistic regression. See [Ntranos18]_, `here <https://github.com/theislab/scanpy/issues/95>`__ and `here <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__, for why this is meaningful. corr_method: {`'benjamini-hochberg'`, `'bonferroni'`} p-value correction method. Used only for `'t-test'`, `'t-test_overestim_var'`, and `'wilcoxon'`. rankby_abs Rank genes by the absolute value of the score, not by the score. The returned scores are never the absolute values. key_added The key in `adata.uns` information is saved to. **kwds Are passed to test methods. Currently this affects only parameters that are passed to `sklearn.linear_model.LogisticRegression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__. For instance, you can pass `penalty='l1'` to try to come up with a minimal set of genes that are good predictors (sparse solution meaning few non-zero fitted coefficients). Returns ------- **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the gene names. Ordered according to scores. **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the z-score underlying the computation of a p-value for each gene for each group. Ordered according to scores. **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the log2 fold change for each gene for each group. Ordered according to scores. Only provided if method is 't-test' like. Note: this is an approximation calculated from mean-log values. **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`) p-values. **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Corrected p-values. Notes ----- There are slight inconsistencies depending on whether sparse or dense data are passed. See `here <https://github.com/theislab/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__. 
Examples -------- >>> adata = sc.datasets.pbmc68k_reduced() >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon') # to visualize the results >>> sc.pl.rank_genes_groups(adata) """ if 'only_positive' in kwds: rankby_abs = not kwds.pop('only_positive') # backwards compat start = logg.info('ranking genes') avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'} if method not in avail_methods: raise ValueError('Method must be one of {}.'.format(avail_methods)) avail_corr = {'benjamini-hochberg', 'bonferroni'} if corr_method not in avail_corr: raise ValueError( 'Correction method must be one of {}.'.format(avail_corr)) adata = adata.copy() if copy else adata utils.sanitize_anndata(adata) # for clarity, rename variable if groups == 'all': groups_order = 'all' elif isinstance(groups, (str, int)): raise ValueError('Specify a sequence of groups') else: groups_order = list(groups) if isinstance(groups_order[0], int): groups_order = [str(n) for n in groups_order] if reference != 'rest' and reference not in set(groups_order): groups_order += [reference] if (reference != 'rest' and reference not in set(adata.obs[groupby].cat.categories)): cats = adata.obs[groupby].cat.categories.tolist() raise ValueError( f'reference = {reference} needs to be one of groupby = {cats}.') groups_order, groups_masks = utils.select_groups(adata, groups_order, groupby) if key_added is None: key_added = 'rank_genes_groups' adata.uns[key_added] = {} adata.uns[key_added]['params'] = { 'groupby': groupby, 'reference': reference, 'method': method, 'use_raw': use_raw, 'layer': layer, 'corr_method': corr_method, } # adata_comp mocks an AnnData object if use_raw is True # otherwise it's just the AnnData object adata_comp = adata if layer is not None: if use_raw: raise ValueError("Cannot specify `layer` and have `use_raw=True`.") X = adata_comp.layers[layer] else: if use_raw and adata.raw is not None: adata_comp = adata.raw X = adata_comp.X # for clarity, rename variable n_genes_user = n_genes # make sure indices are not OoB in case there are less genes than n_genes if n_genes_user > X.shape[1]: n_genes_user = X.shape[1] # in the following, n_genes is simply another name for the total number of genes n_genes = X.shape[1] n_groups = groups_masks.shape[0] ns = np.zeros(n_groups, dtype=int) for imask, mask in enumerate(groups_masks): ns[imask] = np.where(mask)[0].size logg.debug(f'consider {groupby!r} groups:') logg.debug(f'with sizes: {ns}') if reference != 'rest': ireference = np.where(groups_order == reference)[0][0] reference_indices = np.arange(adata_comp.n_vars, dtype=int) rankings_gene_scores = [] rankings_gene_names = [] rankings_gene_logfoldchanges = [] rankings_gene_pvals = [] rankings_gene_pvals_adj = [] if method in {'t-test', 't-test_overestim_var'}: from scipy import stats from statsmodels.stats.multitest import multipletests # loop over all masks and compute means, variances and sample numbers means = np.zeros((n_groups, n_genes)) vars = np.zeros((n_groups, n_genes)) for imask, mask in enumerate(groups_masks): means[imask], vars[imask] = _get_mean_var(X[mask]) # test each either against the union of all other groups or against a # specific group for igroup in range(n_groups): if reference == 'rest': mask_rest = ~groups_masks[igroup] else: if igroup == ireference: continue else: mask_rest = groups_masks[ireference] mean_group, var_group = means[igroup], vars[igroup] mean_rest, var_rest = _get_mean_var(X[mask_rest]) ns_group = ns[igroup] # number of observations in group if method == 
't-test': ns_rest = np.where(mask_rest)[0].size elif method == 't-test_overestim_var': ns_rest = ns[ igroup] # hack for overestimating the variance for small groups else: raise ValueError('Method does not exist.') # TODO: Come up with better solution. Mask unexpressed genes? # See https://github.com/scipy/scipy/issues/10269 with np.errstate(invalid="ignore"): scores, pvals = stats.ttest_ind_from_stats( mean1=mean_group, std1=np.sqrt(var_group), nobs1=ns_group, mean2=mean_rest, std2=np.sqrt(var_rest), nobs2=ns_rest, equal_var=False # Welch's ) # Fold change foldchanges = (np.expm1(mean_group) + 1e-9) / ( np.expm1(mean_rest) + 1e-9) # add small value to remove 0's scores[np.isnan( scores )] = 0 # I think it's only nan when means are the same and vars are 0 pvals[np.isnan( pvals)] = 1 # This also has to happen for Benjamini Hochberg if corr_method == 'benjamini-hochberg': _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_names.append(adata_comp.var_names[global_indices]) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) elif method == 'logreg': # if reference is not set, then the groups listed will be compared to the rest # if reference is set, then the groups listed will be compared only to the other groups listed from sklearn.linear_model import LogisticRegression reference = groups_order[0] if len(groups) == 1: raise Exception( 'Cannot perform logistic regression on a single cluster.') grouping_mask = adata.obs[groupby].isin(groups_order) grouping = adata.obs.loc[grouping_mask, groupby] X = X[ grouping_mask. values, :] # Indexing with a series causes issues, possibly segfault clf = LogisticRegression(**kwds) clf.fit(X, grouping.cat.codes) scores_all = clf.coef_ for igroup, group in enumerate(groups_order): if len(groups_order) <= 2: # binary logistic regression scores = scores_all[0] else: scores = scores_all[igroup] partition = np.argpartition(scores, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_names.append(adata_comp.var_names[global_indices]) if len(groups_order) <= 2: break elif method == 'wilcoxon': from scipy import stats from statsmodels.stats.multitest import multipletests CONST_MAX_SIZE = 10000000 means = np.zeros((n_groups, n_genes)) vars = np.zeros((n_groups, n_genes)) # initialize space for z-scores scores = np.zeros(n_genes) # First loop: Loop over all genes if reference != 'rest': for imask, mask in enumerate(groups_masks): means[imask], vars[imask] = _get_mean_var( X[mask]) # for fold-change only if imask == ireference: continue else: mask_rest = groups_masks[ireference] ns_rest = np.where(mask_rest)[0].size mean_rest, var_rest = _get_mean_var( X[mask_rest]) # for fold-change only if ns_rest <= 25 or ns[imask] <= 25: logg.hint( 'Few observations in a group for ' 'normal approximation (<=25). 
Lower test accuracy.') n_active = ns[imask] m_active = ns_rest # Now calculate gene expression ranking in chunkes: chunk = [] # Calculate chunk frames n_genes_max_chunk = floor(CONST_MAX_SIZE / (n_active + m_active)) if n_genes_max_chunk < n_genes: chunk_index = n_genes_max_chunk while chunk_index < n_genes: chunk.append(chunk_index) chunk_index = chunk_index + n_genes_max_chunk chunk.append(n_genes) else: chunk.append(n_genes) left = 0 # Calculate rank sums for each chunk for the current mask for chunk_index, right in enumerate(chunk): # Check if issparse is true: AnnData objects are currently sparse.csr or ndarray. if issparse(X): df1 = pd.DataFrame(data=X[mask, left:right].todense()) df2 = pd.DataFrame( data=X[mask_rest, left:right].todense(), index=np.arange(start=n_active, stop=n_active + m_active)) else: df1 = pd.DataFrame(data=X[mask, left:right]) df2 = pd.DataFrame(data=X[mask_rest, left:right], index=np.arange(start=n_active, stop=n_active + m_active)) df1 = df1.append(df2) ranks = df1.rank() # sum up adjusted_ranks to calculate W_m,n scores[left:right] = np.sum(ranks.loc[0:n_active, :]) left = right scores = (scores - (n_active * (n_active + m_active + 1) / 2)) / sqrt( (n_active * m_active * (n_active + m_active + 1) / 12)) scores[np.isnan(scores)] = 0 pvals = 2 * stats.distributions.norm.sf(np.abs(scores)) if corr_method == 'benjamini-hochberg': pvals[np.isnan( pvals )] = 1 # set Nan values to 1 to properly convert using Benhjamini Hochberg _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) # Fold change foldchanges = (np.expm1(means[imask]) + 1e-9) / ( np.expm1(mean_rest) + 1e-9 ) # add small value to remove 0's scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort, -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[global_indices]) rankings_gene_names.append( adata_comp.var_names[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) # If no reference group exists, ranking needs only to be done once (full mask) else: scores = np.zeros((n_groups, n_genes)) chunk = [] n_cells = X.shape[0] n_genes_max_chunk = floor(CONST_MAX_SIZE / n_cells) if n_genes_max_chunk < n_genes: chunk_index = n_genes_max_chunk while chunk_index < n_genes: chunk.append(chunk_index) chunk_index = chunk_index + n_genes_max_chunk chunk.append(n_genes) else: chunk.append(n_genes) left = 0 for chunk_index, right in enumerate(chunk): # Check if issparse is true if issparse(X): df1 = pd.DataFrame(data=X[:, left:right].todense()) else: df1 = pd.DataFrame(data=X[:, left:right]) ranks = df1.rank() # sum up adjusted_ranks to calculate W_m,n for imask, mask in enumerate(groups_masks): scores[imask, left:right] = np.sum(ranks.loc[mask, :]) left = right for imask, mask in enumerate(groups_masks): mask_rest = ~groups_masks[imask] means[imask], vars[imask] = _get_mean_var( X[mask]) #for fold-change mean_rest, var_rest = _get_mean_var( X[mask_rest]) # for fold-change scores[imask, :] = (scores[imask, :] - (ns[imask] * (n_cells + 1) / 2)) / sqrt( (ns[imask] * (n_cells - ns[imask]) * (n_cells + 1) / 12)) scores[np.isnan(scores)] = 0 pvals = 2 * stats.distributions.norm.sf( np.abs(scores[imask, :])) if 
corr_method == 'benjamini-hochberg': pvals[np.isnan( pvals )] = 1 # set Nan values to 1 to properly convert using Benhjamini Hochberg _, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh') elif corr_method == 'bonferroni': pvals_adj = np.minimum(pvals * n_genes, 1.0) # Fold change foldchanges = (np.expm1(means[imask]) + 1e-9) / ( np.expm1(mean_rest) + 1e-9 ) # add small value to remove 0's scores_sort = np.abs(scores) if rankby_abs else scores partition = np.argpartition(scores_sort[imask, :], -n_genes_user)[-n_genes_user:] partial_indices = np.argsort(scores_sort[imask, partition])[::-1] global_indices = reference_indices[partition][partial_indices] rankings_gene_scores.append(scores[imask, global_indices]) rankings_gene_names.append( adata_comp.var_names[global_indices]) rankings_gene_logfoldchanges.append( np.log2(foldchanges[global_indices])) rankings_gene_pvals.append(pvals[global_indices]) rankings_gene_pvals_adj.append(pvals_adj[global_indices]) groups_order_save = [str(g) for g in groups_order] if (reference != 'rest' and method != 'logreg') or (method == 'logreg' and len(groups) == 2): groups_order_save = [g for g in groups_order if g != reference] adata.uns[key_added]['scores'] = np.rec.fromarrays( [n for n in rankings_gene_scores], dtype=[(rn, 'float32') for rn in groups_order_save]) adata.uns[key_added]['names'] = np.rec.fromarrays( [n for n in rankings_gene_names], dtype=[(rn, 'U50') for rn in groups_order_save]) if method in {'t-test', 't-test_overestim_var', 'wilcoxon'}: adata.uns[key_added]['logfoldchanges'] = np.rec.fromarrays( [n for n in rankings_gene_logfoldchanges], dtype=[(rn, 'float32') for rn in groups_order_save]) adata.uns[key_added]['pvals'] = np.rec.fromarrays( [n for n in rankings_gene_pvals], dtype=[(rn, 'float64') for rn in groups_order_save]) adata.uns[key_added]['pvals_adj'] = np.rec.fromarrays( [n for n in rankings_gene_pvals_adj], dtype=[(rn, 'float64') for rn in groups_order_save]) logg.info( ' finished', time=start, deep= (f'added to `.uns[{key_added!r}]`\n' " 'names', sorted np.recarray to be indexed by group ids\n" " 'scores', sorted np.recarray to be indexed by group ids\n" + (" 'logfoldchanges', sorted np.recarray to be indexed by group ids\n" " 'pvals', sorted np.recarray to be indexed by group ids\n" " 'pvals_adj', sorted np.recarray to be indexed by group ids" if method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')), ) return adata if copy else None
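# Hedged usage sketch (added for illustration, not part of the original module).
# It mirrors the Examples section of the docstring above and assumes the usual
# `import scanpy as sc`; the helper name `_demo_rank_genes_groups` is hypothetical.
def _demo_rank_genes_groups():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    # Wilcoxon rank-sum test of every 'bulk_labels' group against the rest
    sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon')
    # results are stored as structured arrays indexed by group name
    print(adata.uns['rank_genes_groups']['names'][:5])
    # visualize the ranking
    sc.pl.rank_genes_groups(adata)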
def recipe_zheng17( adata: AnnData, n_top_genes: int = 1000, log: bool = True, plot: bool = False, copy: bool = False, ) -> Optional[AnnData]: """\ Normalization and filtering as of [Zheng17]_. Reproduces the preprocessing of [Zheng17]_ – the Cell Ranger R Kit of 10x Genomics. Expects non-logarithmized data. If using logarithmized data, pass `log=False`. The recipe runs the following steps .. code:: python sc.pp.filter_genes(adata, min_counts=1) # only consider genes with more than 1 count sc.pp.normalize_per_cell( # normalize with total UMI count per cell adata, key_n_counts='n_counts_all' ) filter_result = sc.pp.filter_genes_dispersion( # select highly-variable genes adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False ) adata = adata[:, filter_result.gene_subset] # subset the genes sc.pp.normalize_per_cell(adata) # renormalize after filtering if log: sc.pp.log1p(adata) # log transform: adata.X = log(adata.X + 1) sc.pp.scale(adata) # scale to unit variance and shift to zero mean Parameters ---------- adata Annotated data matrix. n_top_genes Number of genes to keep. log Take logarithm. plot Show a plot of the gene dispersion vs. mean relation. copy Return a copy of `adata` instead of updating it. Returns ------- Returns or updates `adata` depending on `copy`. """ start = logg.info('running recipe zheng17') if copy: adata = adata.copy() # only consider genes with more than 1 count pp.filter_genes(adata, min_counts=1) # normalize with total UMI count per cell normalize_total(adata, key_added='n_counts_all') filter_result = filter_genes_dispersion( adata.X, flavor='cell_ranger', n_top_genes=n_top_genes, log=False ) if plot: # should not import at the top of the file from ..plotting import _preprocessing as ppp ppp.filter_genes_dispersion(filter_result, log=True) # actually filter the genes, the following is the inplace version of # adata = adata[:, filter_result.gene_subset] adata._inplace_subset_var(filter_result.gene_subset) # filter genes normalize_total(adata) # renormalize after filtering if log: pp.log1p(adata) # log transform: X = log(X + 1) pp.scale(adata) logg.info(' finished', time=start) return adata if copy else None
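# Hedged usage sketch (illustrative only). It assumes `adata.X` holds raw,
# non-logarithmized UMI counts, as the docstring above requires; the dataset
# loader below is just an example choice.
def _demo_recipe_zheng17():
    import scanpy as sc

    adata = sc.datasets.pbmc3k()  # raw 10x counts
    sc.pp.recipe_zheng17(adata, n_top_genes=1000)
    # adata is now filtered to 1000 highly variable genes, normalized,
    # log1p-transformed and scaled, ready for downstream PCA / clustering
    sc.tl.pca(adata)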
def regress_out( adata: AnnData, keys: Union[str, Sequence[str]], n_jobs: Optional[int] = None, copy: bool = False, ) -> Optional[AnnData]: """\ Regress out (mostly) unwanted sources of variation. Uses simple linear regression. This is inspired by Seurat's `regressOut` function in R [Satija15]. Note that this function tends to overcorrect in certain circumstances as described in :issue:`526`. Parameters ---------- adata The annotated data matrix. keys Keys for observation annotation on which to regress on. n_jobs Number of jobs for parallel computation. `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`. copy Determines whether a copy of `adata` is returned. Returns ------- Depending on `copy` returns or updates `adata` with the corrected data matrix. """ start = logg.info(f'regressing out {keys}') if issparse(adata.X): logg.info(' sparse input is densified and may ' 'lead to high memory use') adata = adata.copy() if copy else adata sanitize_anndata(adata) if isinstance(keys, str): keys = [keys] if issparse(adata.X): adata.X = adata.X.toarray() n_jobs = sett.n_jobs if n_jobs is None else n_jobs # regress on a single categorical variable variable_is_categorical = False if keys[0] in adata.obs_keys() and is_categorical_dtype( adata.obs[keys[0]]): if len(keys) > 1: raise ValueError('If providing categorical variable, ' 'only a single one is allowed. For this one ' 'we regress on the mean for each category.') logg.debug('... regressing on per-gene means within categories') regressors = np.zeros(adata.X.shape, dtype='float32') for category in adata.obs[keys[0]].cat.categories: mask = (category == adata.obs[keys[0]]).values for ix, x in enumerate(adata.X.T): regressors[mask, ix] = x[mask].mean() variable_is_categorical = True # regress on one or several ordinal variables else: # create data frame with selected keys (if given) if keys: regressors = adata.obs[keys] else: regressors = adata.obs.copy() # add column of ones at index 0 (first column) regressors.insert(0, 'ones', 1.0) len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int) n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int) tasks = [] # split the adata.X matrix by columns in chunks of size n_chunk # (the last chunk could be of smaller size than the others) chunk_list = np.array_split(adata.X, n_chunks, axis=1) if variable_is_categorical: regressors_chunk = np.array_split(regressors, n_chunks, axis=1) for idx, data_chunk in enumerate(chunk_list): # each task is a tuple of a data_chunk eg. (adata.X[:,0:100]) and # the regressors. This data will be passed to each of the jobs. if variable_is_categorical: regres = regressors_chunk[idx] else: regres = regressors tasks.append(tuple((data_chunk, regres, variable_is_categorical))) if n_jobs > 1 and n_chunks > 1: import multiprocessing pool = multiprocessing.Pool(n_jobs) res = pool.map_async(_regress_out_chunk, tasks).get(9999999) pool.close() else: res = list(map(_regress_out_chunk, tasks)) # res is a list of vectors (each corresponding to a regressed gene column). # The transpose is needed to get the matrix in the shape needed adata.X = np.vstack(res).T.astype(adata.X.dtype) logg.info(' finished', time=start) return adata if copy else None
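# Hedged usage sketch (illustrative only). It assumes that per-cell totals and
# mitochondrial fractions were computed beforehand and stored under the
# hypothetical keys 'n_counts' and 'percent_mito' in `adata.obs`.
def _demo_regress_out():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    # remove the linear effect of library size and mitochondrial content;
    # note the overcorrection caveat mentioned in the docstring (issue 526)
    sc.pp.regress_out(adata, ['n_counts', 'percent_mito'], n_jobs=4)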
def filter_genes_dispersion( data: AnnData, flavor: Literal['seurat', 'cell_ranger'] = 'seurat', min_disp: Optional[float] = None, max_disp: Optional[float] = None, min_mean: Optional[float] = None, max_mean: Optional[float] = None, n_bins: int = 20, n_top_genes: Optional[int] = None, log: bool = True, subset: bool = True, copy: bool = False, ): """\ Extract highly variable genes [Satija15]_ [Zheng17]_. .. warning:: .. deprecated:: 1.3.6 Use :func:`~scanpy.pp.highly_variable_genes` instead. The new function is equivalent to the present function, except that * the new function always expects logarithmized data * `subset=False` in the new function, it suffices to merely annotate the genes, tools like `pp.pca` will detect the annotation * you can now call: `sc.pl.highly_variable_genes(adata)` * `copy` is replaced by `inplace` If trying out parameters, pass the data matrix instead of AnnData. Depending on `flavor`, this reproduces the R-implementations of Seurat [Satija15]_ and Cell Ranger [Zheng17]_. The normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected. Use `flavor='cell_ranger'` with care and in the same way as in :func:`~scanpy.pp.recipe_zheng17`. Parameters ---------- data The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. flavor Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data – the logarithm of mean and dispersion is taken internally when `log` is at its default value `True`. For 'cell_ranger', this is usually called for logarithmized data – in this case you should set `log` to `False`. In their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes `n_top_genes`. min_mean max_mean min_disp max_disp If `n_top_genes` unequals `None`, these cutoffs for the means and the normalized dispersions are ignored. n_bins Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed about this if you set `settings.verbosity = 4`. n_top_genes Number of highly-variable genes to keep. log Use the logarithm of the mean to variance ratio. subset Keep highly-variable genes only (if True) else write a bool array for h ighly-variable genes while keeping all genes copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Returns ------- If an AnnData `adata` is passed, returns or updates `adata` depending on `copy`. It filters the `adata` and adds the annotations **means** : adata.var Means per gene. Logarithmized when `log` is `True`. **dispersions** : adata.var Dispersions per gene. Logarithmized when `log` is `True`. **dispersions_norm** : adata.var Normalized dispersions per gene. Logarithmized when `log` is `True`. If a data matrix `X` is passed, the annotation is returned as `np.recarray` with the same information stored in fields: `gene_subset`, `means`, `dispersions`, `dispersion_norm`. 
""" if (n_top_genes is not None and not all(x is None for x in [min_disp, max_disp, min_mean, max_mean])): logg.info('If you pass `n_top_genes`, all cutoffs are ignored.') if min_disp is None: min_disp = 0.5 if min_mean is None: min_mean = 0.0125 if max_mean is None: max_mean = 3 if isinstance(data, AnnData): adata = data.copy() if copy else data result = filter_genes_dispersion( adata.X, log=log, min_disp=min_disp, max_disp=max_disp, min_mean=min_mean, max_mean=max_mean, n_top_genes=n_top_genes, flavor=flavor, ) adata.var['means'] = result['means'] adata.var['dispersions'] = result['dispersions'] adata.var['dispersions_norm'] = result['dispersions_norm'] if subset: adata._inplace_subset_var(result['gene_subset']) else: adata.var['highly_variable'] = result['gene_subset'] return adata if copy else None start = logg.info('extracting highly variable genes') X = data # no copy necessary, X remains unchanged in the following mean, var = materialize_as_ndarray(_get_mean_var(X)) # now actually compute the dispersion mean[mean == 0] = 1e-12 # set entries equal to zero to small value dispersion = var / mean if log: # logarithmized mean as in Seurat dispersion[dispersion == 0] = np.nan dispersion = np.log(dispersion) mean = np.log1p(mean) # all of the following quantities are "per-gene" here df = pd.DataFrame() df['mean'] = mean df['dispersion'] = dispersion if flavor == 'seurat': df['mean_bin'] = pd.cut(df['mean'], bins=n_bins) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_mean_bin = disp_grouped.mean() disp_std_bin = disp_grouped.std(ddof=1) # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() gen_indices = np.where( one_gene_per_bin[df['mean_bin'].values])[0].tolist() if len(gen_indices) > 0: logg.debug( f'Gene indices {gen_indices} fell into a single bin: their ' 'normalized dispersion was set to 1.\n ' 'Decreasing `n_bins` will likely avoid this effect.') # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. 
disp_std_bin[one_gene_per_bin] = disp_mean_bin[ one_gene_per_bin.values].values disp_mean_bin[one_gene_per_bin] = 0 # actually do the normalization df['dispersion_norm'] = ( df['dispersion'].values # use values here as index differs - disp_mean_bin[df['mean_bin'].values].values ) / disp_std_bin[df['mean_bin'].values].values elif flavor == 'cell_ranger': from statsmodels import robust df['mean_bin'] = pd.cut( df['mean'], np.r_[-np.inf, np.percentile(df['mean'], np.arange(10, 105, 5)), np.inf]) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_median_bin = disp_grouped.median() # the next line raises the warning: "Mean of empty slice" with warnings.catch_warnings(): warnings.simplefilter('ignore') disp_mad_bin = disp_grouped.apply(robust.mad) df['dispersion_norm'] = np.abs( df['dispersion'].values - disp_median_bin[df['mean_bin'].values].values) / disp_mad_bin[ df['mean_bin'].values].values else: raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"') dispersion_norm = df['dispersion_norm'].values.astype('float32') if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[::-1].sort( ) # interestingly, np.argpartition is slightly slower disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = df['dispersion_norm'].values >= disp_cut_off logg.debug(f'the {n_top_genes} top genes correspond to a ' f'normalized dispersion cutoff of {disp_cut_off}') else: max_disp = np.inf if max_disp is None else max_disp dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce(( mean > min_mean, mean < max_mean, dispersion_norm > min_disp, dispersion_norm < max_disp, )) logg.info(' finished', time=start) return np.rec.fromarrays(( gene_subset, df['mean'].values, df['dispersion'].values, df['dispersion_norm'].values.astype('float32', copy=False), ), dtype=[ ('gene_subset', bool), ('means', 'float32'), ('dispersions', 'float32'), ('dispersions_norm', 'float32'), ])
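# Hedged usage sketch (illustrative only) for the deprecated matrix interface:
# when a plain matrix is passed instead of an AnnData, the annotation comes
# back as a recarray, as described in the docstring above.
def _demo_filter_genes_dispersion():
    import scanpy as sc

    adata = sc.datasets.pbmc3k()
    sc.pp.normalize_per_cell(adata)  # total-count normalize first
    filter_result = sc.pp.filter_genes_dispersion(
        adata.X, flavor='cell_ranger', n_top_genes=1000, log=False
    )
    adata = adata[:, filter_result.gene_subset]  # keep highly variable genes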
def leiden( adata: AnnData, resolution: float = 1, *, restrict_to: Optional[Tuple[str, Sequence[str]]] = None, random_state: Optional[Union[int, RandomState]] = 0, key_added: str = 'leiden', adjacency: Optional[sparse.spmatrix] = None, directed: bool = True, use_weights: bool = True, n_iterations: int = -1, partition_type: Optional[Type[MutableVertexPartition]] = None, copy: bool = False, **partition_kwargs, ) -> Optional[AnnData]: """\ Cluster cells into subgroups [Traag18]_. Cluster cells using the Leiden algorithm [Traag18]_, an improved version of the Louvain algorithm [Blondel08]_. It has been proposed for single-cell analysis by [Levine15]_. This requires having ran :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first. Parameters ---------- adata The annotated data matrix. resolution A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters. Set to `None` if overriding `partition_type` to one that doesn’t accept a `resolution_parameter`. random_state Change the initialization of the optimization. restrict_to Restrict the clustering to the categories within the key for sample annotation, tuple needs to contain `(obs_key, list_of_categories)`. key_added `adata.obs` key under which to add the cluster labels. adjacency Sparse adjacency matrix of the graph, defaults to `adata.uns['neighbors']['connectivities']`. directed Whether to treat the graph as directed or undirected. use_weights If `True`, edge weights from the graph are used in the computation (placing more emphasis on stronger edges). n_iterations How many iterations of the Leiden clustering algorithm to perform. Positive values above 2 define the total number of iterations to perform, -1 has the algorithm run until it reaches its optimal clustering. partition_type Type of partition to use. Defaults to :class:`~leidenalg.RBConfigurationVertexPartition`. For the available options, consult the documentation for :func:`~leidenalg.find_partition`. copy Whether to copy `adata` or modify it inplace. **partition_kwargs Any further arguments to pass to `~leidenalg.find_partition` (which in turn passes arguments to the `partition_type`). Returns ------- `adata.obs[key_added]` Array of dim (number of samples) that stores the subgroup id (`'0'`, `'1'`, ...) for each cell. `adata.uns['leiden']['params']` A dict with the values for the parameters `resolution`, `random_state`, and `n_iterations`. """ try: import leidenalg except ImportError: raise ImportError( 'Please install the leiden algorithm: `pip3 install leidenalg`.') partition_kwargs = dict(partition_kwargs) start = logg.info('running Leiden clustering') adata = adata.copy() if copy else adata # are we clustering a user-provided graph or the default AnnData one? if adjacency is None: if 'neighbors' not in adata.uns: raise ValueError('You need to run `pp.neighbors` first ' 'to compute a neighborhood graph.') adjacency = adata.uns['neighbors']['connectivities'] if restrict_to is not None: restrict_key, restrict_categories = restrict_to adjacency, restrict_indices = restrict_adjacency( adata, restrict_key, restrict_categories, adjacency, ) # convert it to igraph g = _utils.get_igraph_from_adjacency(adjacency, directed=directed) # flip to the default partition type if not overriden by the user if partition_type is None: partition_type = leidenalg.RBConfigurationVertexPartition # Prepare find_partition arguments as a dictionary, # appending to whatever the user provided. 
    # It needs to be this way as this allows for the accounting of a None
    # resolution (in the case of a partition variant that doesn't take it on input).
    if use_weights:
        partition_kwargs['weights'] = np.array(g.es['weight']).astype(np.float64)
    partition_kwargs['n_iterations'] = n_iterations
    partition_kwargs['seed'] = random_state
    if resolution is not None:
        partition_kwargs['resolution_parameter'] = resolution
    # clustering proper
    part = leidenalg.find_partition(g, partition_type, **partition_kwargs)
    # store output into adata.obs
    groups = np.array(part.membership)
    if restrict_to is not None:
        if key_added == 'leiden':  # append '_R' so the restricted result gets its own key
            key_added += '_R'
        groups = rename_groups(
            adata,
            key_added,
            restrict_key,
            restrict_categories,
            restrict_indices,
            groups,
        )
    adata.obs[key_added] = pd.Categorical(
        values=groups.astype('U'),
        categories=natsorted(np.unique(groups).astype('U')),
    )
    # store information on the clustering parameters
    adata.uns['leiden'] = {}
    adata.uns['leiden']['params'] = dict(
        resolution=resolution,
        random_state=random_state,
        n_iterations=n_iterations,
    )
    logg.info(
        '    finished',
        time=start,
        deep=(
            f'found {len(np.unique(groups))} clusters and added\n'
            f'    {key_added!r}, the cluster labels (adata.obs, categorical)'
        ),
    )
    return adata if copy else None
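# Hedged usage sketch (illustrative only). It assumes the optional `leidenalg`
# dependency is installed and that a neighbor graph is computed first.
def _demo_leiden():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    sc.pp.neighbors(adata, n_neighbors=15)  # required before clustering
    sc.tl.leiden(adata, resolution=1.0, key_added='leiden')
    print(adata.obs['leiden'].value_counts())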
def lineages(
    adata: AnnData,
    lineages: Optional[Union[str, Iterable[str]]] = None,
    final: bool = True,
    cluster_key: Optional[str] = None,
    mode: str = "embedding",
    time_key: str = "latent_time",
    cmap: Union[str, mpl.colors.ListedColormap] = cm.viridis,
    **kwargs,
) -> None:
    """
    Plot lineages that were uncovered using :func:`cellrank.tl.lineages`.

    For each lineage, we show all cells in an embedding (UMAP by default, but any
    embedding can be used) and color them by their probability of belonging to this
    lineage. For cells that are already committed, this probability will be one for
    their respective lineage and zero otherwise. For naive cells, these probabilities
    will be more balanced, reflecting the fact that naive cells have the potential to
    develop towards multiple endpoints.

    .. image:: https://raw.githubusercontent.com/theislab/cellrank/master/resources/images/lineages.png
       :width: 400px
       :align: center

    Parameters
    ----------
    adata : :class:`anndata.AnnData`
        Annotated data object.
    lineages
        Only show these lineages. If `None`, plot all lineages.
    final
        Whether to consider cells going to final states or vice versa.
    cluster_key
        If given, plot cluster annotations left of the lineage probabilities.
    mode
        Can be either `'embedding'` or `'time'`.

        - If `'embedding'`, plots the embedding while coloring in the absorption probabilities.
        - If `'time'`, plots the pseudotime on the x-axis and the absorption probabilities on the y-axis.
    time_key
        Key from `adata.obs` to use as a pseudotime ordering of the cells.
    cmap
        Colormap to use.
    kwargs
        Keyword arguments for :func:`scvelo.pl.scatter`.

    Returns
    -------
    None
        Just plots the lineage probabilities.
    """
    adata_dummy = adata.copy()
    # create a dummy kernel object
    vk = VelocityKernel(adata_dummy, backward=not final)
    vk.transition_matrix = csr_matrix((adata_dummy.n_obs, adata_dummy.n_obs))
    # use this to initialize an MC object
    mc = MarkovChain(vk)
    # plot using the MC object
    mc.plot_lin_probs(
        lineages=lineages,
        cluster_key=cluster_key,
        mode=mode,
        time_key=time_key,
        cmap=cmap,
        **kwargs,
    )
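# Hedged usage sketch (illustrative only). It assumes a CellRank-style workflow
# in which absorption probabilities have already been computed (e.g. via
# cellrank.tl.lineages) and that a 'clusters' annotation exists in `adata.obs`.
def _demo_lineages(adata):
    # color the embedding by the probability of reaching each endpoint,
    # with cluster annotations plotted alongside
    lineages(adata, cluster_key='clusters', mode='embedding')
    # alternatively, plot the probabilities against pseudotime
    lineages(adata, mode='time', time_key='latent_time')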
def score_genes( adata: AnnData, gene_list: Sequence[str], ctrl_size: int = 50, gene_pool: Optional[Sequence[str]] = None, n_bins: int = 25, score_name: str = 'score', random_state: AnyRandom = 0, copy: bool = False, use_raw: Optional[bool] = None, ) -> Optional[AnnData]: """\ Score a set of genes [Satija15]_. The score is the average expression of a set of genes subtracted with the average expression of a reference set of genes. The reference set is randomly sampled from the `gene_pool` for each binned expression value. This reproduces the approach in Seurat [Satija15]_ and has been implemented for Scanpy by Davide Cittaro. Parameters ---------- adata The annotated data matrix. gene_list The list of gene names used for score calculation. ctrl_size Number of reference genes to be sampled from each bin. If `len(gene_list)` is not too low, you can set `ctrl_size=len(gene_list)`. gene_pool Genes for sampling the reference set. Default is all genes. n_bins Number of expression level bins for sampling. score_name Name of the field to be added in `.obs`. random_state The random seed for sampling. copy Copy `adata` or modify it inplace. use_raw Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present. .. versionchanged:: 1.4.5 Default value changed from `False` to `None`. Returns ------- Depending on `copy`, returns or updates `adata` with an additional field `score_name`. Examples -------- See this `notebook <https://github.com/scverse/scanpy_usage/tree/master/180209_cell_cycle>`__. """ start = logg.info(f'computing score {score_name!r}') adata = adata.copy() if copy else adata use_raw = _check_use_raw(adata, use_raw) if random_state is not None: np.random.seed(random_state) gene_list_in_var = [] var_names = adata.raw.var_names if use_raw else adata.var_names genes_to_ignore = [] for gene in gene_list: if gene in var_names: gene_list_in_var.append(gene) else: genes_to_ignore.append(gene) if len(genes_to_ignore) > 0: logg.warning(f'genes are not in var_names and ignored: {genes_to_ignore}') gene_list = set(gene_list_in_var[:]) if len(gene_list) == 0: raise ValueError("No valid genes were passed for scoring.") if gene_pool is None: gene_pool = list(var_names) else: gene_pool = [x for x in gene_pool if x in var_names] if not gene_pool: raise ValueError("No valid genes were passed for reference set.") # Trying here to match the Seurat approach in scoring cells. # Basically we need to compare genes against random genes in a matched # interval of expression. _adata = adata.raw if use_raw else adata _adata_subset = ( _adata[:, gene_pool] if len(gene_pool) < len(_adata.var_names) else _adata ) if issparse(_adata_subset.X): obs_avg = pd.Series( np.array(_sparse_nanmean(_adata_subset.X, axis=0)).flatten(), index=gene_pool, ) # average expression of genes else: obs_avg = pd.Series( np.nanmean(_adata_subset.X, axis=0), index=gene_pool ) # average expression of genes obs_avg = obs_avg[ np.isfinite(obs_avg) ] # Sometimes (and I don't know how) missing data may be there, with nansfor n_items = int(np.round(len(obs_avg) / (n_bins - 1))) obs_cut = obs_avg.rank(method='min') // n_items control_genes = set() # now pick `ctrl_size` genes from every cut for cut in np.unique(obs_cut.loc[gene_list]): r_genes = np.array(obs_cut[obs_cut == cut].index) np.random.shuffle(r_genes) # uses full r_genes if ctrl_size > len(r_genes) control_genes.update(set(r_genes[:ctrl_size])) # To index, we need a list – indexing implies an order. 
control_genes = list(control_genes - gene_list) gene_list = list(gene_list) X_list = _adata[:, gene_list].X if issparse(X_list): X_list = np.array(_sparse_nanmean(X_list, axis=1)).flatten() else: X_list = np.nanmean(X_list, axis=1, dtype='float64') X_control = _adata[:, control_genes].X if issparse(X_control): X_control = np.array(_sparse_nanmean(X_control, axis=1)).flatten() else: X_control = np.nanmean(X_control, axis=1, dtype='float64') score = X_list - X_control adata.obs[score_name] = pd.Series( np.array(score).ravel(), index=adata.obs_names, dtype='float64' ) logg.info( ' finished', time=start, deep=( 'added\n' f' {score_name!r}, score of gene set (adata.obs).\n' f' {len(control_genes)} total control genes are used.' ), ) return adata if copy else None
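# Hedged usage sketch (illustrative only). The marker genes below are
# hypothetical examples, not taken from the original source.
def _demo_score_genes():
    import scanpy as sc

    adata = sc.datasets.pbmc68k_reduced()
    sc.tl.score_genes(
        adata,
        gene_list=['CD3D', 'CD3E', 'CD2'],  # assumed T-cell markers
        ctrl_size=50,
        score_name='tcell_score',
    )
    print(adata.obs['tcell_score'].describe())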
def critical_transition(
    adata: AnnData,
    root_milestone,
    milestones,
    n_map=1,
    n_jobs=None,
    layer: Optional[str] = None,
    w=100,
    step=30,
    loess_span=0.4,
    gamma=1.5,
    n_points=200,
    copy: bool = False,
):
    """\
    Estimates the local critical transition index along the trajectory.

    Based on the concept of pre-bifurcation structure from [Bargaje17]_. This study
    proposes the idea that a signature indicating the flattening of the quasi-potential
    landscape can be detected prior to bifurcation.

    To detect this signal, this function estimates the local critical transition index
    along the trajectory by calculating, over a moving window of cells, the following
    ratio:

    .. math::

        \\frac{\\langle \\left| R(g_i, g_j) \\right| \\rangle}{\\langle \\left| R(c_k, c_l) \\right| \\rangle}

    i.e. the mean of the absolute gene-by-gene correlations divided by the mean of the
    absolute cell-by-cell correlations.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    n_map
        number of probabilistic cells projection to use for estimates.
    n_jobs
        number of cpu processes to perform estimates (per mapping).
    layer
        adata layer to use for estimates.
    w
        local window, in number of cells, to estimate correlations.
    step
        steps, in number of cells, between local windows.
    loess_span
        fraction of points to take into account for the loess fit.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy, else it writes the following fields to `adata`

        for a bifurcation:

        `.uns['root_milestone->milestoneA<>milestoneB']['critical transition']`
            local critical transition index per window of cells.
        `.obs['root_milestone->milestoneA<>milestoneB CI']`
            local critical transition index projected onto cells prior to bifurcation.
        `.obs['root_milestone->milestoneA<>milestoneB CI fitted']`
            LOESS fit of the local critical transition index onto those cells.

        for a linear trajectory:

        `.uns['root_milestone->milestoneA']['critical transition']`
            local critical transition index per window of cells.
        `.obs['root_milestone->milestoneA CI']`
            local critical transition index projected onto cells along the path.
        `.obs['root_milestone->milestoneA CI fitted']`
            LOESS fit of the local critical transition index along the path.
""" adata = adata.copy() if copy else adata logg.info("Calculating local critical transition index", reset=True) graph = adata.uns["graph"] edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) uns_temp = adata.uns.copy() if "milestones_colors" in adata.uns: mlsc = adata.uns["milestones_colors"].copy() dct = graph["milestones"] keys = np.array(list(dct.keys())) vals = np.array(list(dct.values())) leaves = list(map(lambda leave: dct[leave], milestones)) root = dct[root_milestone] name = root_milestone + "->" + "<>".join(milestones) def critical_map(m, gamma, loess_span): df = adata.uns["pseudotime_list"][str(m)] edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) def critical_milestone(leave): cells = getpath(img, root, graph["tips"], leave, graph, df).index X = get_X(adata, cells, adata.var_names, layer) mat = pd.DataFrame(X, index=cells, columns=adata.var_names) mat = mat.iloc[adata.obs.t[mat.index].argsort().values, :] def slide_path(i): cls = mat.index[i:(i + w)] cor_gene = mat.loc[cls, :].corr(method="pearson").values cor_cell = mat.loc[cls, :].T.corr(method="pearson").values R_gene = np.nanmean( np.abs(cor_gene[np.triu_indices(cor_gene.shape[0], k=1)])) R_cell = np.nanmean( np.abs(cor_cell[np.triu_indices(cor_cell.shape[0], k=1)])) return [adata.obs.t[cls].mean(), R_gene / R_cell, cls] wins = np.arange(0, mat.shape[0] - w, step) stats = ProgressParallel( n_jobs=n_jobs, total=len(wins), use_tqdm=n_map == 1, file=sys.stdout, desc=" to " + str(keys[vals == leave][0]), )(delayed(slide_path)(i) for i in wins) cells_l = [s[2] for s in stats] stats = pd.DataFrame([[s[0], s[1]] for s in stats], columns=("t", "ci")) l = loess(stats.t, stats.ci, span=loess_span) l.fit() pred = l.predict(stats.t, stderror=True) conf = pred.confidence() stats["lowess"] = pred.values stats["ll"] = conf.lower stats["ul"] = conf.upper cell_stats = [ pd.DataFrame( np.repeat(stats.ci[i].reshape(-1, 1), len(cells_l[i])), index=cells_l[i], columns=["ci"], ) for i in range(stats.shape[0]) ] cell_stats = pd.concat(cell_stats, axis=1) cell_stats = cell_stats.T.groupby(level=0).mean().T cell_stats["t"] = adata.obs.loc[cell_stats.index, "t"] l = loess(cell_stats.t, cell_stats.ci, span=loess_span) pred = l.predict(cell_stats.t, stderror=True) cell_stats["fit"] = pred.values lspaced_stats = pd.DataFrame({ "t": np.linspace(cell_stats["t"].min(), cell_stats["t"].max(), n_points) }) pred = l.predict(lspaced_stats.t, stderror=True) lspaced_stats["fit"] = pred.values del cell_stats["t"] return stats, cell_stats, lspaced_stats res = list(map(critical_milestone, leaves)) cell_stats = pd.concat([r[1] for r in res]).groupby(level=0).mean() res_slide = dict(zip(milestones, [r[0] for r in res])) res_lspaced = dict(zip(milestones, [r[2] for r in res])) return cell_stats, res_slide, res_lspaced if n_map == 1: df, res_slide, res_lspaced = critical_map(0, gamma, loess_span) else: # TODO: adapt multimapping stats = Parallel(n_jobs=n_jobs)(delayed(critical_map)(i) for i in tqdm( range(n_map), file=sys.stdout, desc=" multi mapping ")) res_slides = pd.concat(stats) if name in adata.uns: adata.uns[name]["critical transition"] = { "LOESS": res_slide, "eLOESS": res_lspaced, } else: adata.uns[name] = { "critical transition": { "LOESS": 
            res_slide, "eLOESS": res_lspaced}}

    adata.obs.loc[df.index, name + " CI"] = df.ci.values
    adata.obs.loc[df.index, name + " CI fitted"] = df.fit.values

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['" + name + "']['critical transition'], df containing local critical "
        "transition index per window of cells.\n"
        "    .obs['" + name + " CI'], local critical transition index projected onto cells.\n"
        "    .obs['" + name + " CI fitted'], LOESS fit of the local critical transition index "
        "projected onto cells."
    )

    return adata if copy else None
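# Hedged usage sketch (illustrative only). It assumes an scFates-style object
# with a fitted principal graph and pseudotime; the milestone names 'Root',
# 'BranchA' and 'BranchB' are hypothetical.
def _demo_critical_transition(adata):
    critical_transition(
        adata,
        root_milestone='Root',
        milestones=['BranchA', 'BranchB'],
        w=100,
        step=30,
    )
    # per-window indices end up under
    # adata.uns['Root->BranchA<>BranchB']['critical transition']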
def criticality_drivers(
    adata: AnnData,
    root_milestone,
    milestones,
    t_span=None,
    confidence_level: float = 0.95,
    layer: Optional[str] = None,
    device="cpu",
    copy: bool = False,
):
    """\
    Calculates correlations between genes and the local critical transition index along the trajectory.

    The Fisher test for the correlations comes from the CellRank function `cr.tl.lineages_drivers`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    root_milestone
        tip defining progenitor branch.
    milestones
        tips defining the progenies branches.
    t_span
        restrict correlations to a window of pseudotime.
    confidence_level
        correlation confidence interval.
    layer
        adata layer to use for estimates.
    device
        whether to run the correlation matrix computation on a cpu or gpu.
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    adata : anndata.AnnData
        if `copy=True` it returns a copy, else it adds the following field to `adata`:

        `.uns['root_milestone->milestoneA<>milestoneB']['criticality drivers']`
            a df containing gene correlations with the critical transition index.
    """
    adata = adata.copy() if copy else adata

    logg.info("Calculating gene to critical transition index correlations", reset=True)

    name = root_milestone + "->" + "<>".join(milestones)
    obs_name = name + " CI fitted"

    if t_span is None:
        cells = adata.obs_names[~np.isnan(adata.obs[obs_name])]
    else:
        cells = adata.obs_names[(~np.isnan(adata.obs[obs_name]))
                                & (adata.obs.t > t_span[0])
                                & (adata.obs.t < t_span[1])]

    CI = adata[cells].obs[obs_name].values

    if layer is None:
        X = adata[cells].X
    else:
        X = adata[cells].layers[layer]

    if device == "cpu":
        from .utils import cor_mat_cpu

        X = X.A if sparse.issparse(X) else X
        corr = cor_mat_cpu(X, CI.reshape(-1, 1)).ravel()
    else:
        from .utils import cor_mat_gpu
        import cupy as cp
        from cupyx.scipy.sparse import csr_matrix as csr_gpu

        X = csr_gpu(X) if sparse.issparse(X) else cp.array(X)
        corr = cor_mat_gpu(X, cp.array(CI).reshape(-1, 1)).ravel().get()

    ### Fisher testing of correlations, CellRank implementation
    ### https://github.com/theislab/cellrank/blob/b6345d5e6dd148317782ffc9a9f96793ad98ead9/cellrank/tl/_utils.py#L488
    ### Copyright (c) 2019, Theis Lab
    n = adata.shape[0]
    ql = 1 - confidence_level - (1 - confidence_level) / 2.0
    qh = confidence_level + (1 - confidence_level) / 2.0

    mean, se = np.arctanh(corr), 1.0 / np.sqrt(n - 3)
    z_score = (np.arctanh(corr) - np.arctanh(0)) * np.sqrt(n - 3)

    z = norm.ppf(qh)
    corr_ci_low = np.tanh(mean - z * se)
    corr_ci_high = np.tanh(mean + z * se)
    pvals = 2 * norm.cdf(-np.abs(z_score))
    ###

    res = pd.DataFrame(
        {"corr": corr, "pval": pvals, "ci_low": corr_ci_low, "ci_high": corr_ci_high},
        index=adata.var_names,
    )
    res["q_val"] = np.nan
    res.loc[~np.isnan(pvals), "q_val"] = multipletests(
        res[~np.isnan(pvals)].pval.values, alpha=0.05, method="fdr_bh")[1]

    adata.uns[name]["criticality drivers"] = res.sort_values("corr", ascending=False).dropna()

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['" + name + "']['criticality drivers'], df containing gene correlations "
        "with the critical transition index."
    )

    return adata if copy else None
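# Hedged usage sketch (illustrative only). It assumes critical_transition()
# was run first for the same (hypothetical) milestones, since this function
# reads the fitted ' CI fitted' column that it creates.
def _demo_criticality_drivers(adata):
    criticality_drivers(adata, root_milestone='Root', milestones=['BranchA', 'BranchB'])
    drivers = adata.uns['Root->BranchA<>BranchB']['criticality drivers']
    print(drivers.head())  # genes most correlated with the transition index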
def synchro_path( adata: AnnData, root_milestone, milestones, genesetA: Optional[Iterable] = None, genesetB: Optional[Iterable] = None, n_map=1, n_jobs=None, layer: Optional[str] = None, perm=True, w=200, step=30, winp=10, loess_span=0.2, copy: bool = False, ): """\ Estimates pseudotime trends of local intra- and inter-module correlations of fates-specific modules. Parameters ---------- adata Annotated data matrix. root_milestone tip defining progenitor branch. milestones tips defining the progenies branches. n_map number of probabilistic cells projection to use for estimates. n_jobs number of cpu processes to perform estimates (per mapping). layer adata layer to use for estimates. perm estimate control trends for local permutations instead of real expression matrix. w local window, in number of cells, to estimate correlations. step steps, in number of cells, between local windows. winp window of permutation in cells. loess_span fraction of points to take in account for loess fit copy Return a copy instead of writing to adata. Returns ------- adata : anndata.AnnData if `copy=True` it returns subsetted or else subset (keeping only significant features) and add fields to `adata`: `.uns['root_milestone->milestoneA<>milestoneB']['synchro']` Dataframe containing mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules. `.obs['intercor root_milestone->milestoneA<>milestoneB']` loess fit of inter-module mean local gene-gene correlations prior to bifurcation """ adata = adata.copy() if copy else adata logg.info("computing local correlations", reset=True) graph = adata.uns["graph"] edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) uns_temp = adata.uns.copy() if "milestones_colors" in adata.uns: mlsc = adata.uns["milestones_colors"].copy() dct = graph["milestones"] keys = np.array(list(dct.keys())) vals = np.array(list(dct.values())) leaves = list(map(lambda leave: dct[leave], milestones)) root = dct[root_milestone] name = root_milestone + "->" + "<>".join(milestones) if genesetA is None: bif = adata.uns[name]["fork"] genesetA = bif.index[(bif.module == "early") & (bif.branch == milestones[0])] genesetB = bif.index[(bif.module == "early") & (bif.branch == milestones[1])] genesets = np.concatenate([genesetA, genesetB]) if n_map == 1: logg.info(" single mapping") def synchro_map(m): df = adata.uns["pseudotime_list"][str(m)] edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) def synchro_milestone(leave): cells = getpath(img, root, graph["tips"], leave, graph, df) cells = cells.sort_values("t").index X = get_X(adata, cells, genesets, layer) mat = pd.DataFrame(X, index=cells, columns=genesets) if permut == True: winperm = np.min([winp, mat.shape[0]]) for i in np.arange(0, mat.shape[0] - winperm, winperm): mat.iloc[i:(i + winperm), :] = (mat.iloc[i:( i + winperm), :].apply(np.random.permutation, axis=0).values) def slide_path(i): cls = mat.index[i:(i + w)] cor = mat.loc[cls, :].corr(method="spearman") corA = cor.loc[:, genesetA].mean(axis=1) corB = cor.loc[:, genesetB].mean(axis=1) corA[genesetA] = ((corA[genesetA] - 1 / len(genesetA)) * len(genesetA) / (len(genesetA) - 1)) corB[genesetB] = ((corB[genesetB] - 1 / len(genesetB)) * 
len(genesetB) / (len(genesetB) - 1)) return pd.Series({ "t": adata.obs.t[cls].mean(), "dist": (corA[genesetA].mean() - corA[genesetB].mean())**2 + (corB[genesetA].mean() - corB[genesetB].mean())**2, "corAA": corA[genesetA].mean(), "corBB": corB[genesetB].mean(), "corAB": corA[genesetB].mean(), "n_map": m, }) ww = np.arange(0, mat.shape[0] - w, step) res = ProgressParallel( n_jobs=n_jobs, total=len(ww), use_tqdm=n_map == 1, file=sys.stdout, desc=" to " + str(keys[vals == leave][0]), )(delayed(slide_path)(i) for i in ww) return pd.concat(res, axis=1).T return pd.concat(list(map(synchro_milestone, leaves)), keys=milestones) if n_map > 1: permut = False stats = ProgressParallel(n_jobs=n_jobs, total=n_map, file=sys.stdout, desc=" multi mapping")( delayed(synchro_map)(i) for i in range(n_map)) allcor_r = pd.concat(stats) if perm: permut = True stats = ProgressParallel( n_jobs=n_jobs, total=n_map, file=sys.stdout, desc=" multi mapping permutations", )(delayed(synchro_map)(i) for i in range(n_map)) allcor_p = pd.concat(stats) allcor = pd.concat([allcor_r, allcor_p], keys=["real", "permuted"]) else: allcor = pd.concat([allcor_r], keys=["real"]) else: permut = False allcor_r = pd.concat(list(map(synchro_map, range(n_map)))) if perm: permut = True allcor_p = pd.concat(list(map(synchro_map, range(n_map)))) allcor = pd.concat([allcor_r, allcor_p], keys=["real", "permuted"]) else: allcor = pd.concat([allcor_r], keys=["real"]) runs = pd.DataFrame(allcor.to_records())["level_0"].unique() dct_cormil = dict( zip( ["corAA", "corBB", "corAB"], [ milestones[0] + "\nintra-module", milestones[1] + "\nintra-module" ] + [milestones[0] + " vs " + milestones[1] + "\ninter-module"], )) logg.info(" done, computing LOESS fit") for cc in ["corAA", "corBB", "corAB"]: allcor[cc + "_lowess"] = 0 allcor[cc + "_ll"] = 0 allcor[cc + "_ul"] = 0 for r in range(len(runs)): for mil in milestones: res = allcor.loc[runs[r]].loc[mil] l = loess(res.t, res[cc], span=loess_span) l.fit() pred = l.predict(res.t, stderror=True) conf = pred.confidence() allcor.loc[(runs[r], mil), cc + "_lowess"] = pred.values allcor.loc[(runs[r], mil), cc + "_ll"] = conf.lower allcor.loc[(runs[r], mil), cc + "_ul"] = conf.upper fork = list( set(img.get_shortest_paths(str(root), str(leaves[0]))[0]).intersection( img.get_shortest_paths(str(root), str(leaves[1]))[0])) fork = np.array(img.vs["name"], dtype=int)[fork] fork_t = adata.uns["graph"]["pp_info"].loc[fork, "time"].max() res = allcor.loc[allcor.t < fork_t, :] res = res[~res.t.duplicated()] l = loess(res.t, res["corAB"], span=loess_span) l.fit() pred = l.predict(res.t, stderror=True) tval = adata.obs.t.copy() tval[tval > fork_t] = np.nan def inter_values(tv): if ~np.isnan(tv): return pred.values[np.argmin(np.abs(res.t.values - tv))] else: return tv adata.obs["inter_cor " + name] = list(map(inter_values, tval)) df = adata.uns["pseudotime_list"][str(0)] cells = np.concatenate([ getpath(img, root, graph["tips"], leaves[0], graph, df).index, getpath(img, root, graph["tips"], leaves[1], graph, df).index, ]) adata.obs.loc[~adata.obs_names.isin(cells), "inter_cor " + name] = np.nan adata.uns = uns_temp allcor = dict( zip( allcor.index.levels[0], [ dict( zip( allcor.loc[l1].index.levels[0], [ allcor.loc[l1].loc[l2] for l2 in allcor.loc[l1].index.levels[0] ], )) for l1 in allcor.index.levels[0] ], )) if name in adata.uns: adata.uns[name]["synchro"] = allcor else: adata.uns[name] = {"synchro": allcor} logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n") logg.hint( "added \n" " .uns['" + 
name + "']['synchro'], mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules.\n" " .obs['inter_cor " + name + "'], loess fit of inter-module mean local gene-gene correlations prior to bifurcation." ) return adata if copy else None
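# Hedged usage sketch (illustrative only). It assumes the early fate-specific
# modules were already identified (stored under adata.uns[name]['fork']) and
# uses hypothetical milestone names.
def _demo_synchro_path(adata):
    synchro_path(
        adata,
        root_milestone='Root',
        milestones=['BranchA', 'BranchB'],
        w=200,
        step=30,
        perm=False,  # skip the permutation control to keep the run short
    )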
def synchro_path_multi(adata: AnnData, root_milestone, milestones, copy=False, **kwargs): """\ Wrapper that calls `tl.synchro_path` on each pairwise combination of the selected branches. Parameters ---------- adata Annotated data matrix. root_milestone tip defining the progenitor branch. milestones tips defining the progeny branches. kwargs arguments to pass to tl.synchro_path. Returns ------- adata : anndata.AnnData if `copy=True` it returns a modified copy, otherwise it adds fields to `adata` in place: `.uns['root_milestone->milestoneA<>milestoneB']['synchro']` DataFrame containing mean local gene-gene correlations of all possible gene pairs inside one module, or between the two modules. `.obs['inter_cor root_milestone->milestoneA<>milestoneB']` loess fit of inter-module mean local gene-gene correlations prior to bifurcation. """ adata = adata.copy() if copy else adata logg.info("computing local correlations", reset=True) graph = adata.uns["graph"] edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) uns_temp = adata.uns.copy() if "milestones_colors" in adata.uns: mlsc = adata.uns["milestones_colors"].copy() dct = graph["milestones"] keys = np.array(list(dct.keys())) vals = np.array(list(dct.values())) leaves = list(map(lambda leave: dct[leave], milestones)) root = dct[root_milestone] name = root_milestone + "->" + "<>".join(milestones) bif = adata.uns[name]["fork"] genesets = dict( zip( milestones, [ bif.index[(bif.module == "early") & (bif.branch == m)] for m in milestones ], )) pairs = list(itertools.combinations(milestones, 2)) for m_pair in pairs: synchro_path(adata, root_milestone, m_pair, genesetA=genesets[m_pair[0]], genesetB=genesets[m_pair[1]], **kwargs)
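# A minimal, hedged usage sketch for the two correlation tools above. It assumes they
# are exposed as scFates tools (scf.tl.synchro_path / scf.tl.synchro_path_multi), that
# `adata` already carries a fitted principal graph with named milestones, and that the
# bifurcation analysis producing `.uns[...]['fork']` has been run; the milestone names
# 'Root', 'BranchA' and 'BranchB' are hypothetical placeholders.
import scFates as scf

scf.tl.synchro_path_multi(
    adata,                                  # AnnData with a fitted trajectory (assumed)
    root_milestone='Root',                  # hypothetical progenitor milestone
    milestones=['BranchA', 'BranchB'],      # hypothetical progeny milestones
    n_map=1,                                # forwarded to tl.synchro_path via **kwargs
)
# Results are stored as described in the docstrings above:
#   adata.uns['Root->BranchA<>BranchB']['synchro']
#   adata.obs['inter_cor Root->BranchA<>BranchB']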
def scanvi( train_adata: anndata.AnnData, test_adata: anndata.AnnData, cell_type_col: str, n_per_class: int = 100, **kwargs, ) -> (np.ndarray, pd.DataFrame, anndata.AnnData, scvi.model.scanvi.SCANVI): '''Use SCANVI to transfer annotations. Parameters ---------- train_adata : anndata.AnnData [Cells, Genes] for training. test_adata : anndata.AnnData [Cells, Genes] for testing. cell_type_col : str column labeling ground truth cell types in `train_adata` and `test_adata`. n_per_class : int number of training examples per class. scANVI authors recommend `100` (default). Returns ------- predictions : np.ndarray [Cells,] cell type label predictions. probabilities : pd.DataFrame [Cells, Class] probabilities. classes are column labels. adata : anndata.AnnData [Cells, Genes] concatenation of `train_adata` and `test_adata` with the learned scANVI latent space in `.obsm['X_scANVI']`. class predictions are in `.obs['C_scANVI']`. lvae : scvi.model.scanvi.SCANVI a trained scANVI model object. Notes ----- This implementation exactly follows the working example from the `scvi` authors. https://www.scvi-tools.org/en/0.7.0-alpha.4/user_guide/notebooks/harmonization.html ''' # check that train_adata and test_adata contain # raw counts tr_int = isinteger(train_adata.X) te_int = isinteger(test_adata.X) if not (tr_int and te_int): # check if the raw counts are in `.raw` tr_int = isinteger(train_adata.raw.X) te_int = isinteger(test_adata.raw.X) if tr_int and te_int: train_adata = train_adata.copy() test_adata = test_adata.copy() # set raw counts to `X` train_adata.X = train_adata.raw[:, train_adata.var_names].X test_adata.X = test_adata.raw[:, test_adata.var_names].X else: msg = 'Integer raw counts not found.' raise ValueError(msg) else: # raw counts are already set to X pass # `.concatenate()` creates batch labels in `.obs['batch']` adata = train_adata.concatenate(test_adata) # store raw counts in a new layer # normalize and select highly variable genes # # scVI uses only a set of highly variable genes # to perform data integration in the latent space adata.layers["counts"] = adata.X.copy() sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) adata.raw = adata # keep full dimension safe sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000, layer="counts", batch_key="batch", subset=True) # assign cell type labels for training # scANVI uses a special token `"Unknown"` for cells that are not labeled # the authors recommend using 100 cells per cell type # from the training set to balance classes adata.obs[cell_type_col] = pd.Categorical(adata.obs[cell_type_col], ) labels = np.repeat("Unknown", adata.shape[0]) labels = labels.astype("<U43") for x in np.unique(adata.obs[cell_type_col]): idx = np.where((adata.obs[cell_type_col] == x) & (adata.obs["batch"] == "0"))[0] sampled = np.random.choice(idx, np.min([n_per_class, len(idx)])) labels[sampled] = adata.obs[cell_type_col][sampled] adata.obs["celltype_scanvi"] = labels # setup scANVI for training scvi.data.setup_anndata( adata, layer="counts", batch_key="batch", labels_key="celltype_scanvi", ) # fit the semi-supervised scANVI model lvae = scvi.model.SCANVI( adata, "Unknown", use_cuda=True, n_latent=30, n_layers=2, ) lvae.train(n_epochs_semisupervised=100) # extract labels adata.obs["C_scANVI"] = lvae.predict(adata) adata.obsm["X_scANVI"] = lvae.get_latent_representation(adata) predictions = np.array(adata.obs.loc[adata.obs['batch'] == '1', "C_scANVI"]) # returns a [Cells, Classes] data frame with class # names as column labels and cell 
barcodes as indices probabilities = lvae.predict(adata, soft=True) probabilities = probabilities.loc[adata.obs['batch'] == '1'] # scANVI will add the "Unknown" token as a class, usually # with very low probability # here we drop it, then renorm probabilities to == 1 probabilities = probabilities.drop(columns=['Unknown']) probabilities = probabilities / np.tile( np.array(probabilities.sum(1)).reshape(-1, 1), (1, probabilities.shape[1])) # check probability normalization eq1 = np.allclose( probabilities.sum(1), np.ones(probabilities.shape[0]), ) if not eq1: msg = 'Not all sum(probabilities) are close to 1.' n = np.sum(probabilities.sum(1) != 1.) msg += f'{n} cells have probabilities != 1.' raise ValueError(msg) r = ( predictions, probabilities, adata, lvae, ) return r
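# Hedged usage sketch for the `scanvi` wrapper above. `train_adata` and `test_adata`
# are assumed to be AnnData objects holding integer raw counts in `.X` or `.raw.X` and
# sharing a ground-truth column; the column name 'cell_type' is a hypothetical example.
# Note the wrapper builds the model with `use_cuda=True`, so a CUDA GPU is assumed.
predictions, probabilities, joint_adata, lvae = scanvi(
    train_adata,
    test_adata,
    cell_type_col='cell_type',   # hypothetical label column present in both objects
    n_per_class=100,             # training examples kept per class, per the docstring
)
# `predictions` follows the order of the test cells (batch '1' after concatenation);
# `probabilities` is a [cells, classes] DataFrame with the "Unknown" token removed.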
def scrublet( adata: AnnData, adata_sim: Optional[AnnData] = None, batch_key: str = None, sim_doublet_ratio: float = 2.0, expected_doublet_rate: float = 0.05, stdev_doublet_rate: float = 0.02, synthetic_doublet_umi_subsampling: float = 1.0, knn_dist_metric: str = 'euclidean', normalize_variance: bool = True, log_transform: bool = False, mean_center: bool = True, n_prin_comps: int = 30, use_approx_neighbors: bool = True, get_doublet_neighbor_parents: bool = False, n_neighbors: Optional[int] = None, threshold: Optional[float] = None, verbose: bool = True, copy: bool = False, random_state: int = 0, ) -> Optional[AnnData]: """\ Predict doublets using Scrublet [Wolock19]_. Predict cell doublets using a nearest-neighbor classifier of observed transcriptomes and simulated doublets. Works best if the input is a raw (unnormalized) counts matrix from a single sample or a collection of similar samples from the same experiment. This function is a wrapper around functions that pre-process using Scanpy and directly call functions of Scrublet(). You may also undertake your own preprocessing, simulate doublets with scanpy.external.pp.scrublet_simulate_doublets(), and run the core scrublet function scanpy.external.pp.scrublet.scrublet(). .. note:: More information and bug reports `here <https://github.com/swolock/scrublet>`__. Parameters ---------- adata The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows correspond to cells and columns to genes. Expected to be un-normalised where adata_sim is not supplied, in which case doublets will be simulated and pre-processing applied to both objects. If adata_sim is supplied, this should be the observed transcriptomes processed consistently (filtering, transform, normalisaton, hvg) with adata_sim. adata_sim (Advanced use case) Optional annData object generated by sc.external.pp.scrublet_simulate_doublets(), with same number of vars as adata. This should have been built from adata_obs after filtering genes and cells and selcting highly-variable genes. batch_key Optional `adata.obs` column name discriminating between batches. sim_doublet_ratio Number of doublets to simulate relative to the number of observed transcriptomes. expected_doublet_rate Where adata_sim not suplied, the estimated doublet rate for the experiment. stdev_doublet_rate Where adata_sim not suplied, uncertainty in the expected doublet rate. synthetic_doublet_umi_subsampling Where adata_sim not suplied, rate for sampling UMIs when creating synthetic doublets. If 1.0, each doublet is created by simply adding the UMI counts from two randomly sampled observed transcriptomes. For values less than 1, the UMI counts are added and then randomly sampled at the specified rate. knn_dist_metric Distance metric used when finding nearest neighbors. For list of valid values, see the documentation for annoy (if `use_approx_neighbors` is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors` is False). normalize_variance If True, normalize the data such that each gene has a variance of 1. `sklearn.decomposition.TruncatedSVD` will be used for dimensionality reduction, unless `mean_center` is True. log_transform Whether to use :func:``~scanpy.pp.log1p`` to log-transform the data prior to PCA. mean_center If True, center the data such that each gene has a mean of 0. `sklearn.decomposition.PCA` will be used for dimensionality reduction. n_prin_comps Number of principal components used to embed the transcriptomes prior to k-nearest-neighbor graph construction. 
use_approx_neighbors Use approximate nearest neighbor method (annoy) for the KNN classifier. get_doublet_neighbor_parents If True, return (in .uns) the parent transcriptomes that generated the doublet neighbors of each observed transcriptome. This information can be used to infer the cell states that generated a given doublet state. n_neighbors Number of neighbors used to construct the KNN graph of observed transcriptomes and simulated doublets. If ``None``, this is automatically set to ``np.round(0.5 * np.sqrt(n_obs))``. threshold Doublet score threshold for calling a transcriptome a doublet. If `None`, this is set automatically by looking for the minimum between the two modes of the `doublet_scores_sim_` histogram. It is best practice to check the threshold visually using the `doublet_scores_sim_` histogram and/or based on co-localization of predicted doublets in a 2-D embedding. verbose If True, print progress updates. copy If ``True``, return a copy of the input ``adata`` with Scrublet results added. Otherwise, Scrublet results are added in place. random_state Initial state for doublet simulation and nearest neighbors. Returns ------- adata : anndata.AnnData if ``copy=True`` it returns or else adds fields to ``adata``. Those fields: ``.obs['doublet_score']`` Doublet scores for each observed transcriptome ``.obs['predicted_doublets']`` Boolean indicating predicted doublet status ``.uns['scrublet']['doublet_scores_sim']`` Doublet scores for each simulated doublet transcriptome ``.uns['scrublet']['doublet_parents']`` Pairs of ``.obs_names`` used to generate each simulated doublet transcriptome ``.uns['scrublet']['parameters']`` Dictionary of Scrublet parameters See also -------- :func:`~scanpy.external.pp.scrublet_simulate_doublets`: Run Scrublet's doublet simulation separately for advanced usage. :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet scores for observed transcriptomes and simulated doublets. """ try: import scrublet as sl except ImportError: raise ImportError( 'Please install scrublet: `pip install scrublet` or `conda install scrublet`.' ) if copy: adata = adata.copy() start = logg.info('Running Scrublet') adata_obs = adata.copy() def _run_scrublet(ad_obs, ad_sim=None): # With no adata_sim we assume the regular use case, starting with raw # counts and simulating doublets if ad_sim is None: pp.filter_genes(ad_obs, min_cells=3) pp.filter_cells(ad_obs, min_genes=3) # Doublet simulation will be based on the un-normalised counts, but on the # selection of genes following normalisation and variability filtering. So # we need to save the raw and subset at the same time. ad_obs.layers['raw'] = ad_obs.X.copy() pp.normalize_total(ad_obs) # HVG process needs log'd data. logged = pp.log1p(ad_obs, copy=True) pp.highly_variable_genes(logged) ad_obs = ad_obs[:, logged.var['highly_variable']] # Simulate the doublets based on the raw expressions from the normalised # and filtered object. 
ad_sim = scrublet_simulate_doublets( ad_obs, layer='raw', sim_doublet_ratio=sim_doublet_ratio, synthetic_doublet_umi_subsampling= synthetic_doublet_umi_subsampling, ) if log_transform: pp.log1p(ad_obs) pp.log1p(ad_sim) # Now normalise simulated and observed in the same way pp.normalize_total(ad_obs, target_sum=1e6) pp.normalize_total(ad_sim, target_sum=1e6) ad_obs = _scrublet_call_doublets( adata_obs=ad_obs, adata_sim=ad_sim, n_neighbors=n_neighbors, expected_doublet_rate=expected_doublet_rate, stdev_doublet_rate=stdev_doublet_rate, mean_center=mean_center, normalize_variance=normalize_variance, n_prin_comps=n_prin_comps, use_approx_neighbors=use_approx_neighbors, knn_dist_metric=knn_dist_metric, get_doublet_neighbor_parents=get_doublet_neighbor_parents, threshold=threshold, random_state=random_state, verbose=verbose, ) return {'obs': ad_obs.obs, 'uns': ad_obs.uns['scrublet']} if batch_key is not None: if batch_key not in adata.obs.keys(): raise ValueError( '`batch_key` must be a column of .obs in the input annData object.' ) # Run Scrublet independently on batches and return just the # scrublet-relevant parts of the objects to add to the input object batches = np.unique(adata.obs[batch_key]) scrubbed = [ _run_scrublet( adata_obs[adata_obs.obs[batch_key] == batch, ], adata_sim, ) for batch in batches ] scrubbed_obs = pd.concat([scrub['obs'] for scrub in scrubbed]) # Now reset the obs to get the scrublet scores adata.obs = scrubbed_obs.loc[adata.obs_names.values] # Save the .uns from each batch separately adata.uns['scrublet'] = {} adata.uns['scrublet']['batches'] = dict( zip(batches, [scrub['uns'] for scrub in scrubbed])) # Record that we've done batched analysis, so e.g. the plotting # function knows what to do. adata.uns['scrublet']['batched_by'] = batch_key else: scrubbed = _run_scrublet(adata_obs, adata_sim) # Copy outcomes to input object from our processed version adata.obs['doublet_score'] = scrubbed['obs']['doublet_score'] adata.obs['predicted_doublet'] = scrubbed['obs']['predicted_doublet'] adata.uns['scrublet'] = scrubbed['uns'] logg.info(' Scrublet finished', time=start) if copy: return adata else: return None
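# Minimal usage sketch for the Scrublet wrapper above, assuming it is exposed as
# sc.external.pp.scrublet and that the `scrublet` package is installed. pbmc3k()
# provides raw, un-normalised counts, which is what the default workflow expects.
import scanpy as sc

adata = sc.datasets.pbmc3k()
sc.external.pp.scrublet(adata, expected_doublet_rate=0.06)
print(adata.obs[['doublet_score', 'predicted_doublet']].head())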
def magic( adata: AnnData, name_list: Union[str, Sequence[str], None] = None, k: int = 10, a: int = 15, t: str = 'auto', n_pca: int = 100, knn_dist: str = 'euclidean', random_state: Optional[Union[int, RandomState]] = None, n_jobs: Optional[int] = None, verbose: bool = False, copy: Optional[bool] = None, **kwargs, ) -> Optional[AnnData]: """\ Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_. MAGIC is an algorithm for denoising and transcript recover of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold. More information and bug reports `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit <https://krishnaswamylab.org/get-help>. Parameters ---------- adata An anndata file with `.raw` attribute representing raw counts. name_list Denoised genes to return. The default `'all_genes'`/`None` may require a large amount of memory if the input data is sparse. Another possibility is `'pca_only'`. k number of nearest neighbors on which to build kernel a sets decay rate of kernel tails. If None, alpha decaying kernel is not used t power to which the diffusion operator is powered. This sets the level of diffusion. If 'auto', t is selected according to the Procrustes disparity of the diffused data n_pca Number of principal components to use for calculating neighborhoods. For extremely large datasets, using n_pca < 20 allows neighborhoods to be calculated in roughly log(n_samples) time. knn_dist recommended values: 'euclidean', 'cosine', 'precomputed' Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. If 'precomputed', `data` should be an n_samples x n_samples distance or affinity matrix random_state Random seed. Defaults to the global `numpy` random number generator n_jobs Number of threads to use in training. All cores are used by default. verbose If `True` or an integer `>= 2`, print status messages. If `None`, `sc.settings.verbosity` is used. copy If true, a copy of anndata is returned. If `None`, `copy` is True if `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data will otherwise have different column names from the input data. kwargs Additional arguments to `magic.MAGIC` Returns ------- If `copy` is True, AnnData object is returned. If `subset_genes` is not `all_genes`, PCA on MAGIC values of cells are stored in `adata.obsm['X_magic']` and `adata.X` is not modified. The raw counts are stored in `.raw` attribute of AnnData object. 
Examples -------- >>> import scanpy as sc >>> import scanpy.external as sce >>> adata = sc.datasets.paul15() >>> sc.pp.normalize_per_cell(adata) >>> sc.pp.sqrt(adata) # or sc.pp.log1p(adata) >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], k=5) >>> adata_magic.shape (2730, 3) >>> sce.pp.magic(adata, name_list='pca_only', k=5) >>> adata.obsm['X_magic'].shape (2730, 100) >>> sce.pp.magic(adata, name_list='all_genes', k=5) >>> adata.X.shape (2730, 3451) """ try: from magic import MAGIC except ImportError: raise ImportError( 'Please install magic package via `pip install --user ' 'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`' ) start = logg.info('computing MAGIC') all_or_pca = isinstance(name_list, (str, type(None))) if all_or_pca and name_list not in {"all_genes", "pca_only", None}: raise ValueError("Invalid string value for `name_list`: " "Only `'all_genes'` and `'pca_only'` are allowed.") if copy is None: copy = not all_or_pca elif not all_or_pca and not copy: raise ValueError( "Can only perform MAGIC in-place with `name_list=='all_genes'` or " f"`name_list=='pca_only'` (got {name_list}). Consider setting " "`copy=True`") adata = adata.copy() if copy else adata n_jobs = settings.n_jobs if n_jobs is None else n_jobs X_magic = MAGIC( k=k, a=a, t=t, n_pca=n_pca, knn_dist=knn_dist, random_state=random_state, n_jobs=n_jobs, verbose=verbose, **kwargs, ).fit_transform(adata, genes=name_list) logg.info( ' finished', time=start, deep=("added\n 'X_magic', PCA on MAGIC coordinates (adata.obsm)" if name_list == "pca_only" else ''), ) # update AnnData instance if name_list == "pca_only": # special case - update adata.obsm with smoothed values adata.obsm["X_magic"] = X_magic.X elif copy: # just return X_magic X_magic.raw = adata adata = X_magic else: # replace data with smoothed data adata.raw = adata adata.X = X_magic.X if copy: return adata
def umap(adata: AnnData, min_dist: float = 0.5, spread: float = 1.0, n_components: int = 2, maxiter: Optional[int] = None, alpha: float = 1.0, gamma: float = 1.0, negative_sample_rate: int = 5, init_pos: Union[_InitPos, np.ndarray, None] = 'spectral', random_state: Optional[Union[int, RandomState]] = 0, a: Optional[float] = None, b: Optional[float] = None, copy: bool = False, method: Literal['umap', 'rapids'] = 'umap') -> Optional[AnnData]: """\ Embed the neighborhood graph using UMAP [McInnes18]_. UMAP (Uniform Manifold Approximation and Projection) is a manifold learning technique suitable for visualizing high-dimensional data. Besides tending to be faster than tSNE, it optimizes the embedding such that it best reflects the topology of the data, which we represent throughout Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the distribution of nearest-neighbor distances in the embedding such that these best match the distribution of distances in the high-dimensional space. We use the implementation of `umap-learn <https://github.com/lmcinnes/umap>`__ [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint <https://doi.org/10.1101/298430>`__. Parameters ---------- adata Annotated data matrix. min_dist The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. The default of in the `umap-learn` package is 0.1. spread The effective scale of embedded points. In combination with `min_dist` this determines how clustered/clumped the embedded points are. n_components The number of dimensions of the embedding. maxiter The number of iterations (epochs) of the optimization. Called `n_epochs` in the original UMAP. alpha The initial learning rate for the embedding optimization. gamma Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. negative_sample_rate The number of negative edge/1-simplex samples to use per positive edge/1-simplex sample in optimizing the low dimensional embedding. init_pos How to initialize the low dimensional embedding. Called `init` in the original UMAP. Options are: * Any key for `adata.obsm`. * 'paga': positions from :func:`~scanpy.pl.paga`. * 'spectral': use a spectral embedding of the graph. * 'random': assign initial embedding positions at random. * A numpy array of initial embedding positions. random_state If `int`, `random_state` is the seed used by the random number generator; If `RandomState`, `random_state` is the random number generator; If `None`, the random number generator is the `RandomState` instance used by `np.random`. a More specific parameters controlling the embedding. If `None` these values are set automatically as determined by `min_dist` and `spread`. b More specific parameters controlling the embedding. If `None` these values are set automatically as determined by `min_dist` and `spread`. copy Return a copy instead of writing to adata. method Use the original 'umap' implementation, or 'rapids' (experimental, GPU only) Returns ------- Depending on `copy`, returns or updates `adata` with the following fields. **X_umap** : `adata.obsm` field UMAP coordinates of data. 
""" adata = adata.copy() if copy else adata if 'neighbors' not in adata.uns: raise ValueError( 'Did not find \'neighbors/connectivities\'. Run `sc.pp.neighbors` first.' ) start = logg.info('computing UMAP') if ('params' not in adata.uns['neighbors'] or adata.uns['neighbors']['params']['method'] != 'umap'): logg.warning( 'neighbors/connectivities have not been computed using umap') from umap.umap_ import find_ab_params, simplicial_set_embedding if a is None or b is None: a, b = find_ab_params(spread, min_dist) else: a = a b = b adata.uns['umap'] = {'params': {'a': a, 'b': b}} if isinstance(init_pos, str) and init_pos in adata.obsm.keys(): init_coords = adata.obsm[init_pos] elif isinstance(init_pos, str) and init_pos == 'paga': init_coords = get_init_pos_from_paga(adata, random_state=random_state) else: init_coords = init_pos # Let umap handle it if hasattr(init_coords, "dtype"): init_coords = check_array(init_coords, dtype=np.float32, accept_sparse=False) random_state = check_random_state(random_state) neigh_params = adata.uns['neighbors']['params'] X = _choose_representation(adata, neigh_params.get('use_rep', None), neigh_params.get('n_pcs', None), silent=True) if method == 'umap': # the data matrix X is really only used for determining the number of connected components # for the init condition in the UMAP embedding n_epochs = 0 if maxiter is None else maxiter X_umap = simplicial_set_embedding( X, adata.uns['neighbors']['connectivities'].tocoo(), n_components, alpha, a, b, gamma, negative_sample_rate, n_epochs, init_coords, random_state, neigh_params.get('metric', 'euclidean'), neigh_params.get('metric_kwds', {}), verbose=settings.verbosity > 3, ) elif method == 'rapids': metric = neigh_params.get('metric', 'euclidean') if metric != 'euclidean': raise ValueError( f'`sc.pp.neighbors` was called with `metric` {metric!r}, ' "but umap `method` 'rapids' only supports the 'euclidean' metric." ) from cuml import UMAP n_neighbors = adata.uns['neighbors']['params']['n_neighbors'] n_epochs = 500 if maxiter is None else maxiter # 0 is not a valid value for rapids, unlike original umap X_contiguous = np.ascontiguousarray(X, dtype=np.float32) umap = UMAP( n_neighbors=n_neighbors, n_components=n_components, n_epochs=n_epochs, learning_rate=alpha, init=init_pos, min_dist=min_dist, spread=spread, negative_sample_rate=negative_sample_rate, a=a, b=b, verbose=settings.verbosity > 3, ) X_umap = umap.fit_transform(X_contiguous) adata.obsm['X_umap'] = X_umap # annotate samples with UMAP coordinates logg.info( ' finished', time=start, deep=('added\n' " 'X_umap', UMAP coordinates (adata.obsm)"), ) return adata if copy else None
def louvain( adata: AnnData, resolution: Optional[float] = None, random_state: Optional[Union[int, RandomState]] = 0, restrict_to: Optional[Tuple[str, Sequence[str]]] = None, key_added: Optional[str] = 'louvain', adjacency: Optional[spmatrix] = None, flavor: str = 'vtraag', directed: bool = True, use_weights: bool = False, partition_type: Optional[Type[MutableVertexPartition]] = None, partition_kwargs: Optional[Mapping[str, Any]] = None, copy: bool = False, ) -> Optional[AnnData]: """Cluster cells into subgroups [Blondel08]_ [Levine15]_ [Traag17]_. Cluster cells using the Louvain algorithm [Blondel08]_ in the implementation of [Traag17]_. The Louvain algorithm has been proposed for single-cell analysis by [Levine15]_. This requires having ran :func:`~scanpy.pp.neighbors` or :func:`~scanpy.external.pp.bbknn` first, or explicitly passing a ``adjacency`` matrix. Parameters ---------- adata The annotated data matrix. resolution For the default flavor (``'vtraag'``), you can provide a resolution (higher resolution means finding more and smaller clusters), which defaults to 1.0. See “Time as a resolution parameter” in [Lambiotte09]_. random_state Change the initialization of the optimization. restrict_to Restrict the clustering to the categories within the key for sample annotation, tuple needs to contain ``(obs_key, list_of_categories)``. key_added Key under which to add the cluster labels. (default: ``'louvain'``) adjacency Sparse adjacency matrix of the graph, defaults to ``adata.uns['neighbors']['connectivities']``. flavor : {``'vtraag'``, ``'igraph'``, ``'rapids'``} Choose between to packages for computing the clustering. ``'vtraag'`` is much more powerful, and the default. directed Interpret the ``adjacency`` matrix as directed graph? use_weights Use weights from knn graph. partition_type Type of partition to use. Only a valid argument if ``flavor`` is ``'vtraag'``. partition_kwargs Key word arguments to pass to partitioning, if ``vtraag`` method is being used. copy Copy adata or modify it inplace. Returns ------- :obj:`None` By default (``copy=False``), updates ``adata`` with the following fields: ``adata.obs['louvain']`` (:class:`pandas.Series`, dtype ``category``) Array of dim (number of samples) that stores the subgroup id (``'0'``, ``'1'``, ...) for each cell. :class:`~anndata.AnnData` When ``copy=True`` is set, a copy of ``adata`` with those fields is returned. """ start = logg.info('running Louvain clustering') if (flavor != 'vtraag') and (partition_type is not None): raise ValueError( '`partition_type` is only a valid argument when `flavour` is "vtraag"' ) adata = adata.copy() if copy else adata if adjacency is None and 'neighbors' not in adata.uns: raise ValueError( 'You need to run `pp.neighbors` first to compute a neighborhood graph.' 
) if adjacency is None: adjacency = adata.uns['neighbors']['connectivities'] if restrict_to is not None: restrict_key, restrict_categories = restrict_to adjacency, restrict_indices = restrict_adjacency( adata, restrict_key, restrict_categories, adjacency, ) if flavor in {'vtraag', 'igraph'}: if flavor == 'igraph' and resolution is not None: logg.warning( '`resolution` parameter has no effect for flavor "igraph"') if directed and flavor == 'igraph': directed = False if not directed: logg.debug(' using the undirected graph') g = _utils.get_igraph_from_adjacency(adjacency, directed=directed) if use_weights: weights = np.array(g.es["weight"]).astype(np.float64) else: weights = None if flavor == 'vtraag': import louvain if partition_kwargs is None: partition_kwargs = {} if partition_type is None: partition_type = louvain.RBConfigurationVertexPartition if resolution is not None: partition_kwargs["resolution_parameter"] = resolution if use_weights: partition_kwargs["weights"] = weights logg.info(' using the "louvain" package of Traag (2017)') louvain.set_rng_seed(random_state) part = louvain.find_partition( g, partition_type, **partition_kwargs, ) # adata.uns['louvain_quality'] = part.quality() else: part = g.community_multilevel(weights=weights) groups = np.array(part.membership) elif flavor == 'rapids': # nvLouvain only works with undirected graphs, and `adjacency` must have a directed edge in both directions import cudf import cugraph offsets = cudf.Series(adjacency.indptr) indices = cudf.Series(adjacency.indices) if use_weights: sources, targets = adjacency.nonzero() weights = adjacency[sources, targets] if isinstance(weights, np.matrix): weights = weights.A1 weights = cudf.Series(weights) else: weights = None g = cugraph.Graph() g.add_adj_list(offsets, indices, weights) logg.info(' using the "louvain" package of rapids') louvain_parts, _ = cugraph.nvLouvain(g) groups = louvain_parts.to_pandas().sort_values('vertex')[[ 'partition' ]].to_numpy().ravel() elif flavor == 'taynaud': # this is deprecated import networkx as nx import community g = nx.Graph(adjacency) partition = community.best_partition(g) groups = np.zeros(len(partition), dtype=int) for k, v in partition.items(): groups[k] = v else: raise ValueError( '`flavor` needs to be "vtraag" or "igraph" or "taynaud".') if restrict_to is not None: if key_added == 'louvain': key_added += '_R' groups = rename_groups( adata, key_added, restrict_key, restrict_categories, restrict_indices, groups, ) adata.obs[key_added] = pd.Categorical( values=groups.astype('U'), categories=natsorted(np.unique(groups).astype('U')), ) adata.uns['louvain'] = {} adata.uns['louvain']['params'] = { 'resolution': resolution, 'random_state': random_state } logg.info( ' finished', time=start, deep=( f'found {len(np.unique(groups))} clusters and added\n' f' {key_added!r}, the cluster labels (adata.obs, categorical)'), ) return adata if copy else None
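# Minimal usage sketch for the Louvain clustering function above (sc.tl.louvain);
# the default 'vtraag' flavor requires the `louvain` python package, and a
# neighborhood graph must exist (sc.pp.neighbors) unless `adjacency` is passed.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.louvain(adata, resolution=1.0, key_added='louvain_r1.0')
print(adata.obs['louvain_r1.0'].value_counts())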
def embedding_density(adata: AnnData, basis: str, key: str, *, group: Optional[str] = None, color_map: Union[Colormap, str] = 'YlOrRd', bg_dotsize: Optional[int] = 80, fg_dotsize: Optional[int] = 180, vmax: Optional[int] = 1, vmin: Optional[int] = 0, save: Union[bool, str, None] = None, **kwargs): """Plot the density of cells in an embedding (per condition) Plots the gaussian kernel density estimates (over condition) from the `sc.tl.embedding_density()` output. This function was written by Sophie Tritschler and implemented into Scanpy by Malte Luecken. Parameters ---------- adata The annotated data matrix. basis The embedding over which the density was calculated. This embedded representation should be found in `adata.obsm['X_[basis]']``. key Name of the `.obs` covariate that contains the density estimates group The category in the categorical observation annotation to be plotted. For example, 'G1' in the cell cycle 'phase' covariate. color_map Matplolib color map to use for density plotting. bg_dotsize Dot size for background data points not in the `group`. fg_dotsize Dot size for foreground data points in the `group`. vmax Density that corresponds to color bar maximum. vmin Density that corresponds to color bar minimum. {show_save_ax} Examples -------- >>> adata = sc.datasets.pbmc68k_reduced() >>> sc.tl.umap(adata) >>> sc.tl.embedding_density(adata, basis='umap', groupby='phase') >>> sc.pl.embedding_density(adata, basis='umap', key='umap_density_phase', ... group='G1') >>> sc.pl.embedding_density(adata, basis='umap', key='umap_density_phase', ... group='S') """ sanitize_anndata(adata) # Test user inputs basis = basis.lower() if basis == 'fa': basis = 'draw_graph_fa' if 'X_' + basis not in adata.obsm_keys(): raise ValueError( 'Cannot find the embedded representation `adata.obsm[X_{!r}]`. ' 'Compute the embedding first.'.format(basis)) if key not in adata.obs: raise ValueError( 'Please run `sc.tl.embedding_density()` first and specify the correct key.' ) if key + '_params' not in adata.uns: raise ValueError( 'Please run `sc.tl.embedding_density()` first and specify the correct key.' ) if 'components' in kwargs: logg.warn('Components were specified, but will be ignored. Only the ' 'components used to calculate the density can be plotted.') del kwargs['components'] components = adata.uns[key + '_params']['components'] groupby = adata.uns[key + '_params']['covariate'] if (group is None) and (groupby is not None): raise ValueError('Densities were calculated over an `.obs` covariate. 
' 'Please specify a group from this covariate to plot.') if (group is not None) and (group not in adata.obs[groupby].cat.categories): raise ValueError( 'Please specify a group from the `.obs` category over which the density ' 'was calculated.') if (np.min(adata.obs[key]) < 0) or (np.max(adata.obs[key]) > 1): raise ValueError('Densities should be scaled between 0 and 1.') # Define plotting data dens_values = -np.ones(adata.n_obs) dot_sizes = np.ones(adata.n_obs) * bg_dotsize if group is not None: group_mask = (adata.obs[groupby] == group) dens_values[group_mask] = adata.obs[key][group_mask] dot_sizes[group_mask] = np.ones(sum(group_mask)) * fg_dotsize else: dens_values = adata.obs[key] dot_sizes = np.ones(adata.n_obs) * fg_dotsize # Make the color map if isinstance(color_map, str): cmap = cm.get_cmap(color_map) else: cmap = color_map #norm = colors.Normalize(vmin=-1, vmax=1) adata_vis = adata.copy() adata_vis.obs['Density'] = dens_values norm = colors.Normalize(vmin=vmin, vmax=vmax) cmap.set_over('black') cmap.set_under('lightgray') # Ensure title is blank as default if 'title' not in kwargs: title = "" else: title = kwargs.pop('title') # Plot the graph return plot_scatter(adata_vis, basis, components=components, color='Density', color_map=cmap, norm=norm, size=dot_sizes, vmax=vmax, vmin=vmin, save=save, title=title, **kwargs)
def score_genes( adata: AnnData, gene_list: Sequence[str], ctrl_size: int = 50, gene_pool: Optional[Sequence[str]] = None, n_bins: int = 25, score_name: str = 'score', random_state: Optional[Union[int, RandomState]] = 0, copy: bool = False, use_raw: bool = False, ): # we use the scikit-learn convention of calling the seed "random_state" """Score a set of genes [Satija15]_. The score is the average expression of a set of genes subtracted with the average expression of a reference set of genes. The reference set is randomly sampled from the `gene_pool` for each binned expression value. This reproduces the approach in Seurat [Satija15]_ and has been implemented for Scanpy by Davide Cittaro. Parameters ---------- adata The annotated data matrix. gene_list The list of gene names used for score calculation. ctrl_size Number of reference genes to be sampled. If `len(gene_list)` is not too low, you can set `ctrl_size=len(gene_list)`. gene_pool Genes for sampling the reference set. Default is all genes. n_bins Number of expression level bins for sampling. score_name Name of the field to be added in `.obs`. random_state The random seed for sampling. copy Copy `adata` or modify it inplace. use_raw Use `raw` attribute of `adata` if present. Returns ------- Depending on `copy`, returns or updates `adata` with an additional field `score_name`. Examples -------- See this `notebook <https://github.com/theislab/scanpy_usage/tree/master/180209_cell_cycle>`__. """ start = logg.info(f'computing score {score_name!r}') adata = adata.copy() if copy else adata if random_state is not None: np.random.seed(random_state) gene_list_in_var = [] var_names = adata.raw.var_names if use_raw else adata.var_names for gene in gene_list: if gene in var_names: gene_list_in_var.append(gene) else: logg.warning( f'gene: {gene} is not in adata.var_names and will be ignored') gene_list = set(gene_list_in_var[:]) if not gene_pool: gene_pool = list(var_names) else: gene_pool = [x for x in gene_pool if x in var_names] # Trying here to match the Seurat approach in scoring cells. # Basically we need to compare genes against random genes in a matched # interval of expression. _adata = adata.raw if use_raw else adata # TODO: this densifies the whole data matrix for `gene_pool` if issparse(_adata.X): obs_avg = pd.Series(np.nanmean(_adata[:, gene_pool].X.toarray(), axis=0), index=gene_pool) # average expression of genes else: obs_avg = pd.Series(np.nanmean(_adata[:, gene_pool].X, axis=0), index=gene_pool) # average expression of genes obs_avg = obs_avg[np.isfinite( obs_avg )] # Sometimes (and I don't know how) missing data may be there, with nansfor n_items = int(np.round(len(obs_avg) / (n_bins - 1))) obs_cut = obs_avg.rank(method='min') // n_items control_genes = set() # now pick `ctrl_size` genes from every cut for cut in np.unique(obs_cut.loc[gene_list]): r_genes = np.array(obs_cut[obs_cut == cut].index) np.random.shuffle(r_genes) control_genes.update(set(r_genes[:ctrl_size]) ) # uses full r_genes if ctrl_size > len(r_genes) # To index, we need a list - indexing implies an order. 
control_genes = list(control_genes - gene_list) gene_list = list(gene_list) X_list = _adata[:, gene_list].X if issparse(X_list): X_list = X_list.toarray() X_control = _adata[:, control_genes].X if issparse(X_control): X_control = X_control.toarray() X_control = np.nanmean(X_control, axis=1) if len(gene_list) == 0: # We shouldn't even get here, but just in case logg.hint(f'could not add \n' f' {score_name!r}, score of gene set (adata.obs)') return adata if copy else None elif len(gene_list) == 1: score = _adata[:, gene_list].X - X_control else: score = np.nanmean(X_list, axis=1) - X_control adata.obs[score_name] = pd.Series(np.array(score).ravel(), index=adata.obs_names) logg.info( ' finished', time=start, deep=('added\n' f' {score_name!r}, score of gene set (adata.obs)'), ) return adata if copy else None
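# Small usage sketch for the gene-scoring function above (sc.tl.score_genes). The gene
# set below is deliberately arbitrary (the first 20 var_names) so the snippet runs on
# the bundled dataset; in practice you would pass a curated signature instead.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
gene_set = list(adata.var_names[:20])   # placeholder gene set, purely illustrative
sc.tl.score_genes(adata, gene_list=gene_set, score_name='demo_score', ctrl_size=20)
print(adata.obs['demo_score'].describe())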
def score_genes_cell_cycle( adata: AnnData, s_genes: Sequence[str], g2m_genes: Sequence[str], copy: bool = False, **kwargs, ) -> Optional[AnnData]: """\ Score cell cycle genes [Satija15]_. Given two lists of genes associated to S phase and G2M phase, calculates scores and assigns a cell cycle phase (G1, S or G2M). See :func:`~scanpy.tl.score_genes` for more explanation. Parameters ---------- adata The annotated data matrix. s_genes List of genes associated with S phase. g2m_genes List of genes associated with G2M phase. copy Copy `adata` or modify it inplace. **kwargs Are passed to :func:`~scanpy.tl.score_genes`. `ctrl_size` is not possible, as it's set as `min(len(s_genes), len(g2m_genes))`. Returns ------- Depending on `copy`, returns or updates `adata` with the following fields. **S_score** : `adata.obs`, dtype `object` The score for S phase for each cell. **G2M_score** : `adata.obs`, dtype `object` The score for G2M phase for each cell. **phase** : `adata.obs`, dtype `object` The cell cycle phase (`S`, `G2M` or `G1`) for each cell. See also -------- score_genes Examples -------- See this `notebook <https://github.com/scverse/scanpy_usage/tree/master/180209_cell_cycle>`__. """ logg.info('calculating cell cycle phase') adata = adata.copy() if copy else adata ctrl_size = min(len(s_genes), len(g2m_genes)) # add s-score score_genes( adata, gene_list=s_genes, score_name='S_score', ctrl_size=ctrl_size, **kwargs ) # add g2m-score score_genes( adata, gene_list=g2m_genes, score_name='G2M_score', ctrl_size=ctrl_size, **kwargs, ) scores = adata.obs[['S_score', 'G2M_score']] # default phase is S phase = pd.Series('S', index=scores.index) # if G2M is higher than S, it's G2M phase[scores.G2M_score > scores.S_score] = 'G2M' # if all scores are negative, it's G1... phase[np.all(scores < 0, axis=1)] = 'G1' adata.obs['phase'] = phase logg.hint(' \'phase\', cell cycle phase (adata.obs)') return adata if copy else None
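# Hedged usage sketch for the cell-cycle wrapper above. The two lists below are
# placeholders taken from var_names so the snippet runs anywhere; for real analyses
# use the S-phase and G2M-phase signatures referenced in the linked notebook.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
s_genes = list(adata.var_names[:10])      # placeholder S-phase "signature"
g2m_genes = list(adata.var_names[10:20])  # placeholder G2M-phase "signature"
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
print(adata.obs[['S_score', 'G2M_score', 'phase']].head())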
def rank_genes_groups( adata: AnnData, groupby: str, use_raw: Optional[bool] = None, groups: Union[Literal['all'], Iterable[str]] = 'all', reference: str = 'rest', n_genes: Optional[int] = None, rankby_abs: bool = False, pts: bool = False, key_added: Optional[str] = None, copy: bool = False, method: _Method = None, corr_method: _CorrMethod = 'benjamini-hochberg', tie_correct: bool = False, layer: Optional[str] = None, **kwds, ) -> Optional[AnnData]: """\ Rank genes for characterizing groups. Expects logarithmized data. Parameters ---------- adata Annotated data matrix. groupby The key of the observations grouping to consider. use_raw Use `raw` attribute of `adata` if present. layer Key from `adata.layers` whose value will be used to perform tests on. groups Subset of groups, e.g. [`'g1'`, `'g2'`, `'g3'`], to which comparison shall be restricted, or `'all'` (default), for all groups. reference If `'rest'`, compare each group to the union of the rest of the group. If a group identifier, compare with respect to this group. n_genes The number of genes that appear in the returned tables. Defaults to all genes. method The default method is `'t-test'`, `'t-test_overestim_var'` overestimates variance of each group, `'wilcoxon'` uses Wilcoxon rank-sum, `'logreg'` uses logistic regression. See [Ntranos18]_, `here <https://github.com/scverse/scanpy/issues/95>`__ and `here <http://www.nxn.se/valent/2018/3/5/actionable-scrna-seq-clusters>`__, for why this is meaningful. corr_method p-value correction method. Used only for `'t-test'`, `'t-test_overestim_var'`, and `'wilcoxon'`. tie_correct Use tie correction for `'wilcoxon'` scores. Used only for `'wilcoxon'`. rankby_abs Rank genes by the absolute value of the score, not by the score. The returned scores are never the absolute values. pts Compute the fraction of cells expressing the genes. key_added The key in `adata.uns` information is saved to. **kwds Are passed to test methods. Currently this affects only parameters that are passed to :class:`sklearn.linear_model.LogisticRegression`. For instance, you can pass `penalty='l1'` to try to come up with a minimal set of genes that are good predictors (sparse solution meaning few non-zero fitted coefficients). Returns ------- **names** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the gene names. Ordered according to scores. **scores** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the z-score underlying the computation of a p-value for each gene for each group. Ordered according to scores. **logfoldchanges** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Structured array to be indexed by group id storing the log2 fold change for each gene for each group. Ordered according to scores. Only provided if method is 't-test' like. Note: this is an approximation calculated from mean-log values. **pvals** : structured `np.ndarray` (`.uns['rank_genes_groups']`) p-values. **pvals_adj** : structured `np.ndarray` (`.uns['rank_genes_groups']`) Corrected p-values. **pts** : `pandas.DataFrame` (`.uns['rank_genes_groups']`) Fraction of cells expressing the genes for each group. **pts_rest** : `pandas.DataFrame` (`.uns['rank_genes_groups']`) Only if `reference` is set to `'rest'`. Fraction of cells from the union of the rest of each group expressing the genes. Notes ----- There are slight inconsistencies depending on whether sparse or dense data are passed. 
See `here <https://github.com/scverse/scanpy/blob/master/scanpy/tests/test_rank_genes_groups.py>`__. Examples -------- >>> import scanpy as sc >>> adata = sc.datasets.pbmc68k_reduced() >>> sc.tl.rank_genes_groups(adata, 'bulk_labels', method='wilcoxon') >>> # to visualize the results >>> sc.pl.rank_genes_groups(adata) """ if use_raw is None: use_raw = adata.raw is not None elif use_raw is True and adata.raw is None: raise ValueError("Received `use_raw=True`, but `adata.raw` is empty.") if method is None: logg.warning( "Default of the method has been changed to 't-test' from 't-test_overestim_var'" ) method = 't-test' if 'only_positive' in kwds: rankby_abs = not kwds.pop('only_positive') # backwards compat start = logg.info('ranking genes') avail_methods = {'t-test', 't-test_overestim_var', 'wilcoxon', 'logreg'} if method not in avail_methods: raise ValueError(f'Method must be one of {avail_methods}.') avail_corr = {'benjamini-hochberg', 'bonferroni'} if corr_method not in avail_corr: raise ValueError(f'Correction method must be one of {avail_corr}.') adata = adata.copy() if copy else adata _utils.sanitize_anndata(adata) # for clarity, rename variable if groups == 'all': groups_order = 'all' elif isinstance(groups, (str, int)): raise ValueError('Specify a sequence of groups') else: groups_order = list(groups) if isinstance(groups_order[0], int): groups_order = [str(n) for n in groups_order] if reference != 'rest' and reference not in set(groups_order): groups_order += [reference] if reference != 'rest' and reference not in adata.obs[ groupby].cat.categories: cats = adata.obs[groupby].cat.categories.tolist() raise ValueError( f'reference = {reference} needs to be one of groupby = {cats}.') if key_added is None: key_added = 'rank_genes_groups' adata.uns[key_added] = {} adata.uns[key_added]['params'] = dict( groupby=groupby, reference=reference, method=method, use_raw=use_raw, layer=layer, corr_method=corr_method, ) test_obj = _RankGenes(adata, groups_order, groupby, reference, use_raw, layer, pts) if check_nonnegative_integers(test_obj.X) and method != 'logreg': logg.warning( "It seems you use rank_genes_groups on the raw count data. 
" "Please logarithmize your data before calling rank_genes_groups.") # for clarity, rename variable n_genes_user = n_genes # make sure indices are not OoB in case there are less genes than n_genes # defaults to all genes if n_genes_user is None or n_genes_user > test_obj.X.shape[1]: n_genes_user = test_obj.X.shape[1] logg.debug(f'consider {groupby!r} groups:') logg.debug( f'with sizes: {np.count_nonzero(test_obj.groups_masks, axis=1)}') test_obj.compute_statistics(method, corr_method, n_genes_user, rankby_abs, tie_correct, **kwds) if test_obj.pts is not None: groups_names = [str(name) for name in test_obj.groups_order] adata.uns[key_added]['pts'] = pd.DataFrame(test_obj.pts.T, index=test_obj.var_names, columns=groups_names) if test_obj.pts_rest is not None: adata.uns[key_added]['pts_rest'] = pd.DataFrame( test_obj.pts_rest.T, index=test_obj.var_names, columns=groups_names) test_obj.stats.columns = test_obj.stats.columns.swaplevel() dtypes = { 'names': 'O', 'scores': 'float32', 'logfoldchanges': 'float32', 'pvals': 'float64', 'pvals_adj': 'float64', } for col in test_obj.stats.columns.levels[0]: adata.uns[key_added][col] = test_obj.stats[col].to_records( index=False, column_dtypes=dtypes[col]) logg.info( ' finished', time=start, deep= (f'added to `.uns[{key_added!r}]`\n' " 'names', sorted np.recarray to be indexed by group ids\n" " 'scores', sorted np.recarray to be indexed by group ids\n" + (" 'logfoldchanges', sorted np.recarray to be indexed by group ids\n" " 'pvals', sorted np.recarray to be indexed by group ids\n" " 'pvals_adj', sorted np.recarray to be indexed by group ids" if method in {'t-test', 't-test_overestim_var', 'wilcoxon'} else '')), ) return adata if copy else None
def dpt( adata: AnnData, n_dcs: int = 10, n_branchings: int = 0, min_group_size: float = 0.01, allow_kendall_tau_shift: bool = True, neighbors_key: Optional[str] = None, copy: bool = False, ) -> Optional[AnnData]: """\ Infer progression of cells through geodesic distance along the graph [Haghverdi16]_ [Wolf19]_. Reconstruct the progression of a biological process from snapshot data. `Diffusion Pseudotime` has been introduced by [Haghverdi16]_ and implemented within Scanpy [Wolf18]_. Here, we use a further developed version, which is able to deal with disconnected graphs [Wolf19]_ and can be run in a `hierarchical` mode by setting the parameter `n_branchings>1`. We recommend, however, to only use :func:`~scanpy.tl.dpt` for computing pseudotime (`n_branchings=0`) and to detect branchings via :func:`~scanpy.tl.paga`. For pseudotime, you need to annotate your data with a root cell. For instance:: adata.uns['iroot'] = np.flatnonzero(adata.obs['cell_types'] == 'Stem')[0] This requires to run :func:`~scanpy.pp.neighbors`, first. In order to reproduce the original implementation of DPT, use `method=='gauss'` in this. Using the default `method=='umap'` only leads to minor quantitative differences, though. .. versionadded:: 1.1 :func:`~scanpy.tl.dpt` also requires to run :func:`~scanpy.tl.diffmap` first. As previously, :func:`~scanpy.tl.dpt` came with a default parameter of ``n_dcs=10`` but :func:`~scanpy.tl.diffmap` has a default parameter of ``n_comps=15``, you need to pass ``n_comps=10`` in :func:`~scanpy.tl.diffmap` in order to exactly reproduce previous :func:`~scanpy.tl.dpt` results. Parameters ---------- adata Annotated data matrix. n_dcs The number of diffusion components to use. n_branchings Number of branchings to detect. min_group_size During recursive splitting of branches ('dpt groups') for `n_branchings` > 1, do not consider groups that contain less than `min_group_size` data points. If a float, `min_group_size` refers to a fraction of the total number of data points. allow_kendall_tau_shift If a very small branch is detected upon splitting, shift away from maximum correlation in Kendall tau criterion of [Haghverdi16]_ to stabilize the splitting. neighbors_key If not specified, dpt looks .uns['neighbors'] for neighbors settings and .obsp['connectivities'], .obsp['distances'] for connectivities and distances respectively (default storage places for pp.neighbors). If specified, dpt looks .uns[neighbors_key] for neighbors settings and .obsp[.uns[neighbors_key]['connectivities_key']], .obsp[.uns[neighbors_key]['distances_key']] for connectivities and distances respectively. copy Copy instance before computation and return a copy. Otherwise, perform computation inplace and return `None`. Returns ------- Depending on `copy`, returns or updates `adata` with the following fields. If `n_branchings==0`, no field `dpt_groups` will be written. `dpt_pseudotime` : :class:`pandas.Series` (`adata.obs`, dtype `float`) Array of dim (number of samples) that stores the pseudotime of each cell, that is, the DPT distance with respect to the root cell. `dpt_groups` : :class:`pandas.Series` (`adata.obs`, dtype `category`) Array of dim (number of samples) that stores the subgroup id ('0', '1', ...) for each cell. The groups typically correspond to 'progenitor cells', 'undecided cells' or 'branches' of a process. Notes ----- The tool is similar to the R package `destiny` of [Angerer16]_. """ # standard errors, warnings etc. 
adata = adata.copy() if copy else adata if neighbors_key is None: neighbors_key = 'neighbors' if neighbors_key not in adata.uns: raise ValueError( 'You need to run `pp.neighbors` and `tl.diffmap` first.') if 'iroot' not in adata.uns and 'xroot' not in adata.var: logg.warning( 'No root cell found. To compute pseudotime, pass the index or ' 'expression vector of a root cell, one of:\n' ' adata.uns[\'iroot\'] = root_cell_index\n' ' adata.var[\'xroot\'] = adata[root_cell_name, :].X') if 'X_diffmap' not in adata.obsm.keys(): logg.warning( 'Trying to run `tl.dpt` without prior call of `tl.diffmap`. ' 'Falling back to `tl.diffmap` with default parameters.') _diffmap(adata, neighbors_key=neighbors_key) # start with the actual computation dpt = DPT(adata, n_dcs=n_dcs, min_group_size=min_group_size, n_branchings=n_branchings, allow_kendall_tau_shift=allow_kendall_tau_shift, neighbors_key=neighbors_key) start = logg.info(f'computing Diffusion Pseudotime using n_dcs={n_dcs}') if n_branchings > 1: logg.info(' this uses a hierarchical implementation') if dpt.iroot is not None: dpt._set_pseudotime() # pseudotimes are distances from root point adata.uns[ 'iroot'] = dpt.iroot # update iroot, might have changed when subsampling, for example adata.obs['dpt_pseudotime'] = dpt.pseudotime # detect branchings and partition the data into segments if n_branchings > 0: dpt.branchings_segments() adata.obs['dpt_groups'] = pd.Categorical( values=dpt.segs_names.astype('U'), categories=natsorted(np.array(dpt.segs_names_unique).astype('U'))) # the "change points" separate segments in the ordering above adata.uns['dpt_changepoints'] = dpt.changepoints # the tip points of segments adata.uns['dpt_grouptips'] = dpt.segs_tips # the ordering according to segments and pseudotime ordering_id = np.zeros(adata.n_obs, dtype=int) for count, idx in enumerate(dpt.indices): ordering_id[idx] = count adata.obs['dpt_order'] = ordering_id adata.obs['dpt_order_indices'] = dpt.indices logg.info( ' finished', time=start, deep=('added\n' + (" 'dpt_pseudotime', the pseudotime (adata.obs)" if dpt.iroot is not None else '') + ("\n 'dpt_groups', the branching subgroups of dpt (adata.obs)" "\n 'dpt_order', cell order (adata.obs)" if n_branchings > 0 else '')), ) return adata if copy else None
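# Minimal usage sketch for the DPT function above: pseudotime only (n_branchings=0),
# with an arbitrary root cell chosen purely for illustration. As the docstring notes,
# sc.pp.neighbors and sc.tl.diffmap must be run first.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)
sc.tl.diffmap(adata, n_comps=10)
adata.uns['iroot'] = 0          # arbitrary root cell index, purely for illustration
sc.tl.dpt(adata, n_dcs=10)
print(adata.obs['dpt_pseudotime'].describe())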
def filter_genes( data: AnnData, min_counts: Optional[int] = None, min_cells: Optional[int] = None, max_counts: Optional[int] = None, max_cells: Optional[int] = None, inplace: bool = True, copy: bool = False, ) -> Union[AnnData, None, Tuple[np.ndarray, np.ndarray]]: """\ Filter genes based on number of cells or counts. Keep genes that have at least `min_counts` counts or are expressed in at least `min_cells` cells or have at most `max_counts` counts or are expressed in at most `max_cells` cells. Only provide one of the optional parameters `min_counts`, `min_cells`, `max_counts`, `max_cells` per call. Parameters ---------- data An annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. min_counts Minimum number of counts required for a gene to pass filtering. min_cells Minimum number of cells expressed required for a gene to pass filtering. max_counts Maximum number of counts required for a gene to pass filtering. max_cells Maximum number of cells expressed required for a gene to pass filtering. inplace Perform computation inplace or return result. Returns ------- Depending on `inplace`, returns the following arrays or directly subsets and annotates the data matrix gene_subset Boolean index mask that does filtering. `True` means that the gene is kept. `False` means the gene is removed. number_per_gene Depending on what was tresholded (`counts` or `cells`), the array stores `n_counts` or `n_cells` per gene. """ if copy: logg.warning('`copy` is deprecated, use `inplace` instead.') n_given_options = sum( option is not None for option in [min_cells, min_counts, max_cells, max_counts]) if n_given_options != 1: raise ValueError( 'Only provide one of the optional parameters `min_counts`, ' '`min_cells`, `max_counts`, `max_cells` per call.') if isinstance(data, AnnData): adata = data.copy() if copy else data gene_subset, number = materialize_as_ndarray( filter_genes(adata.X, min_cells=min_cells, min_counts=min_counts, max_cells=max_cells, max_counts=max_counts)) if not inplace: return gene_subset, number if min_cells is None and max_cells is None: adata.var['n_counts'] = number else: adata.var['n_cells'] = number adata._inplace_subset_var(gene_subset) return adata if copy else None X = data # proceed with processing the data matrix min_number = min_counts if min_cells is None else min_cells max_number = max_counts if max_cells is None else max_cells number_per_gene = np.sum( X if min_cells is None and max_cells is None else X > 0, axis=0) if issparse(X): number_per_gene = number_per_gene.A1 if min_number is not None: gene_subset = number_per_gene >= min_number if max_number is not None: gene_subset = number_per_gene <= max_number s = np.sum(~gene_subset) if s > 0: msg = f'filtered out {s} genes that are detected ' if min_cells is not None or min_counts is not None: msg += 'in less than ' msg += f'{min_cells} cells' if min_counts is None else f'{min_counts} counts' if max_cells is not None or max_counts is not None: msg += 'in more than ' msg += f'{max_cells} cells' if max_counts is None else f'{max_counts} counts' logg.info(msg) return gene_subset, number_per_gene
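# Small usage sketch for the gene filter above (sc.pp.filter_genes) on a toy matrix;
# remember that only one of the four threshold options may be given per call.
import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.poisson(0.3, size=(100, 50)).astype(np.float32))
sc.pp.filter_genes(adata, min_cells=3)       # keep genes detected in >= 3 cells
print(adata.n_vars, adata.var['n_cells'].min())
# with inplace=False the boolean mask and per-gene counts are returned instead
mask, n_cells = sc.pp.filter_genes(adata, min_cells=3, inplace=False)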
def test_recipe_plotting(): sc.settings.autoshow = False adata = AnnData(np.random.randint(0, 1000, (1000, 1000))) # These shouldn't throw an error sc.pp.recipe_seurat(adata.copy(), plot=True) sc.pp.recipe_zheng17(adata.copy(), plot=True)
def filter_cells(
    data: AnnData,
    min_counts: Optional[int] = None,
    min_genes: Optional[int] = None,
    max_counts: Optional[int] = None,
    max_genes: Optional[int] = None,
    inplace: bool = True,
    copy: bool = False,
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """\
    Filter cell outliers based on counts and numbers of genes expressed.

    For instance, only keep cells with at least `min_counts` counts or
    `min_genes` genes expressed. This is to filter measurement outliers,
    i.e. “unreliable” observations.

    Only provide one of the optional parameters `min_counts`, `min_genes`,
    `max_counts`, `max_genes` per call.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    min_counts
        Minimum number of counts required for a cell to pass filtering.
    min_genes
        Minimum number of genes expressed required for a cell to pass filtering.
    max_counts
        Maximum number of counts allowed for a cell to pass filtering.
    max_genes
        Maximum number of genes expressed allowed for a cell to pass filtering.
    inplace
        Perform computation inplace or return result.

    Returns
    -------
    Depending on `inplace`, returns the following arrays or directly subsets
    and annotates the data matrix:

    cells_subset
        Boolean index mask that does filtering. `True` means that the
        cell is kept. `False` means the cell is removed.
    number_per_cell
        Depending on what was thresholded (`counts` or `genes`),
        the array stores `n_counts` or `n_genes` per cell.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = sc.datasets.krumsiek11()
    >>> adata.n_obs
    640
    >>> adata.var_names
    ['Gata2' 'Gata1' 'Fog1' 'EKLF' 'Fli1' 'SCL' 'Cebpa' 'Pu.1' 'cJun' 'EgrNab' 'Gfi1']
    >>> # add some true zeros
    >>> adata.X[adata.X < 0.3] = 0
    >>> # simply compute the number of genes per cell
    >>> sc.pp.filter_cells(adata, min_genes=0)
    >>> adata.n_obs
    640
    >>> adata.obs['n_genes'].min()
    1
    >>> # filter manually
    >>> adata_copy = adata[adata.obs['n_genes'] >= 3]
    >>> adata_copy.n_obs
    554
    >>> adata_copy.obs['n_genes'].min()
    3
    >>> # actually do some filtering
    >>> sc.pp.filter_cells(adata, min_genes=3)
    >>> adata.n_obs
    554
    >>> adata.obs['n_genes'].min()
    3
    """
    if copy:
        logg.warning('`copy` is deprecated, use `inplace` instead.')
    n_given_options = sum(
        option is not None
        for option in [min_genes, min_counts, max_genes, max_counts])
    if n_given_options != 1:
        raise ValueError(
            'Only provide one of the optional parameters `min_counts`, '
            '`min_genes`, `max_counts`, `max_genes` per call.')

    if isinstance(data, AnnData):
        adata = data.copy() if copy else data
        cell_subset, number = materialize_as_ndarray(
            filter_cells(adata.X, min_counts, min_genes, max_counts, max_genes))
        if not inplace:
            return cell_subset, number
        if min_genes is None and max_genes is None:
            adata.obs['n_counts'] = number
        else:
            adata.obs['n_genes'] = number
        adata._inplace_subset_obs(cell_subset)
        return adata if copy else None

    X = data  # proceed with processing the data matrix
    min_number = min_counts if min_genes is None else min_genes
    max_number = max_counts if max_genes is None else max_genes
    number_per_cell = np.sum(
        X if min_genes is None and max_genes is None else X > 0, axis=1)
    if issparse(X):
        number_per_cell = number_per_cell.A1
    if min_number is not None:
        cell_subset = number_per_cell >= min_number
    if max_number is not None:
        cell_subset = number_per_cell <= max_number

    s = np.sum(~cell_subset)
    if s > 0:
        msg = f'filtered out {s} cells that have '
        if min_genes is not None or min_counts is not None:
            msg += 'less than '
            msg += f'{min_genes} genes expressed' if min_counts is None else f'{min_counts} counts'
        if max_genes is not None or max_counts is not None:
            msg += 'more than '
            msg += f'{max_genes} genes expressed' if max_counts is None else f'{max_counts} counts'
        logg.info(msg)
    return cell_subset, number_per_cell
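# Complementary sketch to the docstring examples above: with inplace=False the
# mask and per-cell statistic are returned and adata is left untouched
# (pbmc3k and the min_genes threshold are illustrative).
import scanpy as sc

adata = sc.datasets.pbmc3k()
cell_mask, n_genes_per_cell = sc.pp.filter_cells(adata, min_genes=200, inplace=False)
print(f'keeping {cell_mask.sum()} of {adata.n_obs} cells')
adata = adata[cell_mask].copy()               # apply the mask manually
adata.obs['n_genes'] = n_genes_per_cell[cell_mask]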
def neighbors(adata: AnnData, n_neighbors: int = 15, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, knn: bool = True, random_state: Optional[Union[int, RandomState]] = 0, method: str = 'umap', metric: Union[str, Metric] = 'euclidean', metric_kwds: Mapping[str, Any] = {}, copy: bool = False) -> Optional[AnnData]: """\ Compute a neighborhood graph of observations [McInnes18]_. The neighbor search efficiency of this heavily relies on UMAP [McInnes18]_, which also provides a method for estimating connectivities of data points - the connectivity of the manifold (`method=='umap'`). If `method=='gauss'`, connectivities are computed according to [Coifman05]_, in the adaption of [Haghverdi16]_. Parameters ---------- adata Annotated data matrix. n_neighbors The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. If `knn` is `True`, number of nearest neighbors to be searched. If `knn` is `False`, a Gaussian kernel width is set to the distance of the `n_neighbors` neighbor. {n_pcs} {use_rep} knn If `True`, use a hard threshold to restrict the number of neighbors to `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian Kernel to assign low weights to neighbors more distant than the `n_neighbors` nearest neighbor. random_state A numpy random seed. method : {{'umap', 'gauss', `None`}} (default: `'umap'`) Use 'umap' [McInnes18]_ or 'gauss' (Gauss kernel following [Coifman05]_ with adaptive width [Haghverdi16]_) for computing connectivities. metric A known metric’s name or a callable that returns a distance. metric_kwds Options for the metric. copy Return a copy instead of writing to adata. Returns ------- Depending on `copy`, updates or returns `adata` with the following: **connectivities** : sparse matrix (`.uns['neighbors']`, dtype `float32`) Weighted adjacency matrix of the neighborhood graph of data points. Weights should be interpreted as connectivities. **distances** : sparse matrix (`.uns['neighbors']`, dtype `float32`) Instead of decaying weights, this stores distances for each pair of neighbors. """ start = logg.info('computing neighbors') adata = adata.copy() if copy else adata if adata.isview: # we shouldn't need this here... adata._init_as_actual(adata.copy()) neighbors = Neighbors(adata) neighbors.compute_neighbors( n_neighbors=n_neighbors, knn=knn, n_pcs=n_pcs, use_rep=use_rep, method=method, metric=metric, metric_kwds=metric_kwds, random_state=random_state, ) adata.uns['neighbors'] = {} adata.uns['neighbors']['params'] = { 'n_neighbors': n_neighbors, 'method': method } adata.uns['neighbors']['params']['metric'] = metric if metric_kwds: adata.uns['neighbors']['params']['metric_kwds'] = metric_kwds if use_rep is not None: adata.uns['neighbors']['params']['use_rep'] = use_rep if n_pcs is not None: adata.uns['neighbors']['params']['n_pcs'] = n_pcs adata.uns['neighbors']['distances'] = neighbors.distances adata.uns['neighbors']['connectivities'] = neighbors.connectivities if neighbors.rp_forest is not None: adata.uns['neighbors']['rp_forest'] = neighbors.rp_forest logg.info( ' finished', time=start, deep=('added to `.uns[\'neighbors\']`\n' ' \'distances\', distances for each pair of neighbors\n' ' \'connectivities\', weighted adjacency matrix'), ) return adata if copy else None
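# Usage sketch for neighbors (pbmc68k_reduced already ships with a PCA
# representation, so no extra preprocessing is needed; parameter values are
# illustrative):
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=40, method='umap')
print(adata.uns['neighbors']['params'])
# With the version shown above the graph matrices live in
# adata.uns['neighbors']['distances'] / ['connectivities'];
# newer scanpy releases store them in adata.obsp instead.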
def modules( adata: AnnData, root_milestone, milestones, color: str = "milestones", show_traj: bool = False, layer: Optional[str] = None, smooth: bool = False, show: Optional[bool] = None, save: Union[str, bool, None] = None, **kwargs, ): """\ Plot the mean expression of the early and late modules. Parameters ---------- adata Annotated data matrix. root_milestone tip defining progenitor branch. milestones tips defining the progenies branches. color color the cells with variable from adata.obs. show_traj show trajectory on the early module plot. layer layer to use to compute mean of module. show show the plot. save save the plot. kwargs arguments to pass to :func:`scFates.pl.trajectory` if `show_traj=True`, else to :func:`scanpy.pl.embedding` Returns ------- If `show==False` a tuple of :class:`~matplotlib.axes.Axes` """ plt.rcParams["axes.grid"] = False graph = adata.uns["graph"] dct = graph["milestones"] leaves = list(map(lambda leave: dct[leave], milestones)) root = dct[root_milestone] name = root_milestone + "->" + "<>".join(milestones) stats = adata.uns[name]["fork"] if "milestones_colors" not in adata.uns or len( adata.uns["milestones_colors"]) == 1: from . import palette_tools palette_tools._set_default_colors_for_categorical_obs( adata, "milestones") mlsc = adata.uns["milestones_colors"].copy() mls = adata.obs.milestones.cat.categories.tolist() dct = dict(zip(mls, mlsc)) df = adata.obs.copy(deep=True) edges = graph["pp_seg"][["from", "to"]].astype(str).apply(tuple, axis=1).values img = igraph.Graph() img.add_vertices( np.unique(graph["pp_seg"][["from", "to"]].values.flatten().astype(str))) img.add_edges(edges) cells = np.unique( np.concatenate([ getpath(img, root, adata.uns["graph"]["tips"], leaves[0], graph, df).index, getpath(img, root, adata.uns["graph"]["tips"], leaves[1], graph, df).index, ])) if layer is None: if sparse.issparse(adata.X): X = pd.DataFrame( np.array(adata[:, stats.index].X.A), index=adata.obs_names, columns=stats.index, ) else: X = pd.DataFrame( np.array(adata[:, stats.index].X), index=adata.obs_names, columns=stats.index, ) else: if sparse.issparse(adata.layers[layer]): X = pd.DataFrame( np.array(adata[:, stats.index].layers[layer].A), index=adata.obs_names, columns=stats.index, ) else: X = pd.DataFrame( np.array(adata[:, stats.index].layers[layer]), index=adata.obs_names, columns=stats.index, ) early_1 = (stats.branch.values == milestones[0]) & (stats.module.values == "early") late_1 = (stats.branch.values == milestones[0]) & (stats.module.values == "late") early_2 = (stats.branch.values == milestones[1]) & (stats.module.values == "early") late_2 = (stats.branch.values == milestones[1]) & (stats.module.values == "late") X_early = pd.DataFrame({ "early_" + milestones[0]: X.loc[:, early_1].mean(axis=1), "early_" + milestones[1]: X.loc[:, early_2].mean(axis=1), }) X_late = pd.DataFrame({ "late_" + milestones[0]: X.loc[:, late_1].mean(axis=1), "late_" + milestones[1]: X.loc[:, late_2].mean(axis=1), }) adata_c = adata.copy() adata_c.obsm["X_early"] = X_early.values adata_c.obsm["X_late"] = X_late.values if smooth: adata_c.obsm["X_early"] = adata_c.obsp["connectivities"].dot( adata_c.obsm["X_early"]) adata_c.obsm["X_late"] = adata_c.obsp["connectivities"].dot( adata_c.obsm["X_late"]) axs, _, _, _ = setup_axes(panels=[0, 1]) if show_traj: plot_trajectory( adata_c, basis="early", root_milestone=root_milestone, milestones=milestones, color_cells=color, show=False, title="", legend_loc="none", ax=axs[0], **kwargs, ) else: sc.pl.embedding( adata_c[cells], basis="early", 
color=color, legend_loc="none", title="", show=False, ax=axs[0], **kwargs, ) sc.pl.embedding( adata_c[cells], basis="late", color=color, legend_loc="none", show=False, title="", ax=axs[1], **kwargs, ) axs[0].set_xlabel("early " + milestones[0]) axs[0].set_ylabel("early " + milestones[1]) axs[1].set_xlabel("late " + milestones[0]) axs[1].set_ylabel("late " + milestones[1]) savefig_or_show("modules", show=show, save=save)
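# Hypothetical usage of the plotting function above, assuming it is exposed as
# scFates.pl.modules and that adata has already gone through the scFates
# trajectory and fork analysis (so adata.uns["graph"] and the fork statistics
# for this bifurcation exist). "Root", "BranchA" and "BranchB" are placeholder
# milestone labels, not names from any real dataset.
import scFates as scf

scf.pl.modules(
    adata,
    root_milestone="Root",
    milestones=["BranchA", "BranchB"],
    color="milestones",
    show_traj=True,   # overlay the fitted trajectory on the early-module panel
)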