def _replace_with_layout(adata: AnnData, layout: str) -> Dict[str, utt.Matrix]:
    replaced: Dict[str, utt.Matrix] = {}

    matrix: utt.Matrix = adata.X
    if not utt.is_layout(matrix, layout):
        replaced["__x__"] = matrix
        adata.X = get_vo_proper(adata, "__x__", layout=layout)

    for name in adata.layers:
        matrix = adata.layers[name]
        if not utt.is_layout(matrix, layout):
            replaced[name] = matrix
            adata.layers[name] = get_vo_proper(adata, name, layout=layout)

    return replaced
def process_transpose(ad: AnnData, min_cells: int = 10, min_genes: int = 200,
                      max_genes: int = 2500, max_pct_mito: int = 30):
    ad = ad.copy()
    ad.X = ad.raw.X
    ad = _generic_preprocess(ad, min_cells, min_genes, max_genes, max_pct_mito)
    sc.pp.log1p(ad)
    sc.pp.highly_variable_genes(ad, batch_key="sample")
    ad = ad.transpose()
    sc.pp.pca(ad, n_comps=50)
    sc.pp.neighbors(ad)
    sc.tl.umap(ad)
    return ad
def test_x_is_none():
    # test setter and getter
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4]))
    adata.X = None
    assert adata.X is None

    # test setter and deleter
    adata.X = np.array([[4, 5, 6], [1, 2, 3]])
    assert adata.X is not None
    del adata.X
    assert adata.X is None

    # test initialiser
    shape = (3, 5)
    adata = AnnData(None, uns=dict(test=np.array((3, 3))), shape=shape)
    assert adata.X is None
    assert adata.shape == shape

    # test transpose
    adataT = adata.transpose()
    assert_equal(adataT.shape, (5, 3))
    assert_equal(adataT.obsp.keys(), adata.varp.keys())
    assert_equal(adataT.T, adata)
def recipe_weinreb17(
    adata: AnnData,
    log: bool = True,
    mean_threshold: float = 0.01,
    cv_threshold: int = 2,
    n_pcs: int = 50,
    svd_solver='randomized',
    random_state: AnyRandom = 0,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Normalization and filtering as of [Weinreb17]_.

    Expects non-logarithmized data.
    If using logarithmized data, pass `log=False`.

    Parameters
    ----------
    adata
        Annotated data matrix.
    log
        Logarithmize data?
    copy
        Return a copy if true.
    """
    from ._deprecated import (
        normalize_per_cell_weinreb16_deprecated,
        zscore_deprecated,
    )
    from ._deprecated.highly_variable_genes import filter_genes_cv_deprecated
    from scipy.sparse import issparse

    if issparse(adata.X):
        raise ValueError('`recipe_weinreb17` does not support sparse matrices.')
    if copy:
        adata = adata.copy()
    if log:
        pp.log1p(adata)
    adata.X = normalize_per_cell_weinreb16_deprecated(
        adata.X, max_fraction=0.05, mult_with_mean=True
    )
    gene_subset = filter_genes_cv_deprecated(adata.X, mean_threshold, cv_threshold)
    adata._inplace_subset_var(gene_subset)  # this modifies the object itself
    X_pca = pp.pca(
        zscore_deprecated(adata.X),
        n_comps=n_pcs,
        svd_solver=svd_solver,
        random_state=random_state,
    )
    # update adata
    adata.obsm['X_pca'] = X_pca
    return adata if copy else None
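# Usage sketch for recipe_weinreb17: a minimal call, assuming the recipe is exposed
# as sc.pp.recipe_weinreb17 (it is in released scanpy) and that adata.X is dense;
# the pbmc3k dataset is illustrative, any dense count matrix works.
import scanpy as sc

adata = sc.datasets.pbmc3k()
adata.X = adata.X.toarray()  # the recipe raises on sparse matrices
sc.pp.recipe_weinreb17(adata, log=True)
print(adata.obsm['X_pca'].shape)  # (n_obs, n_pcs); n_pcs defaults to 50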
def get_example_data(*, sparse=False):
    # create test object
    adata = AnnData(
        np.multiply(binomial(1, 0.15, (100, 20)), negative_binomial(2, 0.25, (100, 20)))
    )
    # adapt marker_genes for cluster (so as to have some form of reasonable input)
    adata.X[0:10, 0:5] = np.multiply(
        binomial(1, 0.9, (10, 5)), negative_binomial(1, 0.5, (10, 5))
    )

    # The following construction is inefficient, but makes sure that the same data
    # is used in the sparse case
    if sparse:
        adata.X = sp.csr_matrix(adata.X)

    # Create cluster according to groups
    adata.obs['true_groups'] = pd.Categorical(np.concatenate((
        np.zeros((10,), dtype=int),
        np.ones((90,), dtype=int),
    )))
    return adata
def import_10X_mtx(directory):
    start = time.time()
    X = load_mtx(os.path.join(directory, 'matrix.mtx'))
    genes = pd.read_csv(os.path.join(directory, 'genes.tsv'), header=None, sep='\t')
    if len(genes) == X.shape[0]:
        # transpose if matrix is genes x cells
        a = AnnData(X.T)
    else:
        a = AnnData(X)
    a.var_names = genes[1]
    a.var['gene_ids'] = genes[0].values
    a.obs_names = pd.read_csv(os.path.join(directory, 'barcodes.tsv'), header=None)[0]
    a.uns['network'] = np.ones([a.X.shape[1], a.X.shape[1]])
    return a
def assert_adata(adata: AnnData, attempFix=True):
    """Asserts that an adata object contains the information needed for the besca
    pipeline to run and export results. This is particularly useful when loading
    public data.

    The parameter attempFix will try to fix the issues itself. However, we advise
    users to check for themselves what the underlying problem is.

    Parameters
    ----------
    adata: AnnData
    attempFix: `bool`
        if True will transform adata object to match requirements.

    Returns
    -------
    returns an AnnData object
    """
    if not 'CELL' in adata.obs.columns:
        if attempFix:
            adata.obs['CELL'] = adata.obs.index
            print('Creating column CELL in adata.obs using adata.obs.index.')
        else:
            raise Exception('Required CELL column in observations')
    if not all(adata.obs_names == adata.obs['CELL']):
        raise Exception('Required indexing of adata.obs by CELL column')
    if not issparse(adata.X):
        if attempFix:
            print('Required count matrix to be sparse, X transformed to sparse')
            try:
                adata.X = sparse.csr_matrix(adata.X.copy())
            except Exception:
                raise Exception('X transformation to sparse failed.')
        else:
            raise Exception('adata.X needs to be sparse.')
    # checking adata.var concordance
    for x in ['SYMBOL', 'ENSEMBL']:
        adata = add_var_column(adata, x, attempFix)
        if not all(isinstance(el, str) for el in adata.var.get(x)):
            raise Exception(
                f'In {x}, non-string values will create an issue for export')
    return adata
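# Usage sketch for assert_adata on a toy object. Assumptions: the besca-style helper
# add_var_column (called above) leaves pre-existing SYMBOL/ENSEMBL columns untouched,
# and the gene_*/ENSG* identifiers below are purely hypothetical.
import numpy as np
from anndata import AnnData

toy = AnnData(np.random.poisson(1.0, size=(20, 10)).astype(np.float32))
toy.var['SYMBOL'] = [f'gene_{i}' for i in range(10)]        # hypothetical symbols
toy.var['ENSEMBL'] = [f'ENSG{i:011d}' for i in range(10)]   # hypothetical IDs
toy = assert_adata(toy, attempFix=True)  # adds CELL column and sparsifies X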
def clr(adata: AnnData, inplace: bool = True, axis: int = 0) -> Union[None, AnnData]:
    """
    Apply the centered log ratio (CLR) transformation to normalize counts in adata.X.

    Args:
        adata: AnnData object with protein expression counts.
        inplace: Whether to update adata.X in place.
        axis: Axis across which CLR is performed.
    """
    if axis not in [0, 1]:
        raise ValueError("Invalid value for `axis` provided. Admissible options are `0` and `1`.")

    if not inplace:
        adata = adata.copy()

    if issparse(adata.X) and axis == 0 and not isinstance(adata.X, csc_matrix):
        warn("adata.X is sparse but not in CSC format. Converting to CSC.")
        x = csc_matrix(adata.X)
    elif issparse(adata.X) and axis == 1 and not isinstance(adata.X, csr_matrix):
        warn("adata.X is sparse but not in CSR format. Converting to CSR.")
        x = csr_matrix(adata.X)
    else:
        x = adata.X

    if issparse(x):
        x.data /= np.repeat(
            np.exp(np.log1p(x).sum(axis=axis).A / x.shape[axis]), x.getnnz(axis=axis)
        )
        np.log1p(x.data, out=x.data)
    else:
        np.log1p(
            x / np.exp(np.log1p(x).sum(axis=axis, keepdims=True) / x.shape[axis]),
            out=x,
        )

    adata.X = x
    return None if inplace else adata
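# Worked CLR example on a tiny dense matrix: for axis=0 the dense branch above
# computes log1p(x / exp(mean(log1p(x), axis=0))), i.e. counts are scaled by a
# geometric-mean-style factor per column before re-applying log1p. This manual
# computation should match clr(AnnData(x), inplace=False, axis=0).X.
import numpy as np

x = np.array([[1.0, 4.0], [3.0, 9.0]])
geo = np.exp(np.log1p(x).sum(axis=0, keepdims=True) / x.shape[0])
manual = np.log1p(x / geo)
print(manual)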
def set_modality(
    adata: AnnData,
    new_value: Union[ndarray, spmatrix, DataFrame],
    modality: Optional[str] = None,
    inplace: bool = True,
) -> Optional[AnnData]:
    """Set modality of annotated data object to new value.

    Arguments
    ---------
    adata
        Annotated data object.
    new_value
        New value of modality.
    modality
        Modality to overwrite with new value. Defaults to `None`.
    inplace
        Boolean flag to indicate whether setting of modality should be inplace
        or not. Defaults to `True`.

    Returns
    -------
    Optional[AnnData]
        Copy of annotated data `adata` with updated modality if
        `inplace=False`, `None` otherwise.
    """
    if not inplace:
        adata = adata.copy()
    if (modality == "X") or (modality is None):
        adata.X = new_value
    elif modality in adata.layers.keys():
        adata.layers[modality] = new_value
    elif modality in adata.obsm.keys():
        adata.obsm[modality] = new_value
    if not inplace:
        return adata
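# Usage sketch for set_modality: overwrite X on a copy, then a named layer in place.
import numpy as np
from anndata import AnnData

demo = AnnData(np.zeros((3, 2)))
demo.layers['spliced'] = np.ones((3, 2))
new = set_modality(demo, np.full((3, 2), 7.0), inplace=False)  # returns a copy with X set
set_modality(demo, np.full((3, 2), 2.0), modality='spliced')   # edits the layer in place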
def load_file(path):
    """
    Load single cell dataset from file
    """
    if os.path.exists(DATA_PATH + path + '.h5ad'):
        adata = sc.read_h5ad(DATA_PATH + path + '.h5ad')
    elif os.path.isdir(path):  # mtx format
        adata = read_mtx(path)
    elif os.path.isfile(path):
        if path.endswith(('.csv', '.csv.gz')):
            adata = sc.read_csv(path).T
        elif path.endswith(('.txt', '.txt.gz', '.tsv', '.tsv.gz')):
            df = pd.read_csv(path, sep='\t', index_col=0).T
            adata = AnnData(df.values,
                            dict(obs_names=df.index.values),
                            dict(var_names=df.columns.values))
        elif path.endswith('.h5ad'):
            adata = sc.read_h5ad(path)
    else:
        raise ValueError("File {} does not exist".format(path))

    if not issparse(adata.X):
        adata.X = scipy.sparse.csr_matrix(adata.X)
    adata.var_names_make_unique()
    return adata
def filter_cells(
    adata: AnnData,
    min_counts: int = -1,
    max_counts: int = -1,
    max_mt_ratio: int = 20,
    # doublet_detection: bool = False,
    # scrublet_kwargs: dict = {
    #     "total_counts": None,
    #     "sim_doublet_ratio": 2.0,
    #     "n_neighbors": None,
    #     "expected_doublet_rate": 0.1,
    #     "stdev_doublet_rate": 0.02,
    #     "random_state": 0,
    # },
    verbose=True,
):
    """Filter problematic cells in an AnnData

    Args:
        adata(AnnData): The AnnData object to be pre-processed.
        min_counts(int): Minimum number of counts required for a cell to pass
            filtering. `-1` -> median(counts) - std(counts)
        max_counts(int): Maximum number of counts required for a cell to pass
            filtering. `-1` -> median(counts) + std(counts)
        max_mt_ratio(int): Maximum proportion of mitochondrial genes in a cell
            to pass filtering.
        verbose: (Default value = True)

    Returns:
        None. Subsets `adata` in place, keeping only the cells that pass the
        count and mitochondrial-content filters.
    """
    # doublet_detection(bool): Uses doublet detection instead of max counts to remove doublets
    # scrublet_kwargs(dict): Arguments passed to Scrublet for doublet detection

    # -- sparse -> array
    if 'ndarray' not in str(type(adata.X)):
        adata.X = adata.X.toarray()

    # -- Mitochondrial content
    adata.var["mt"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
    )

    # -- min/max suggestion
    counts = adata.X.sum(axis=1)
    md = np.median(counts)
    sd = np.std(counts)
    if min_counts == -1:
        min_counts = max(0, md - sd)
    if max_counts == -1:
        max_counts = md + sd

    # # -- Doublet detection
    # if doublet_detection:
    #     scrub = scr.Scrublet(
    #         adata.X,
    #         total_counts=scrublet_kwargs["total_counts"],
    #         sim_doublet_ratio=scrublet_kwargs["sim_doublet_ratio"],
    #         n_neighbors=scrublet_kwargs["n_neighbors"],
    #         expected_doublet_rate=scrublet_kwargs["expected_doublet_rate"],
    #         stdev_doublet_rate=scrublet_kwargs["stdev_doublet_rate"],
    #         random_state=scrublet_kwargs["random_state"],
    #     )
    #     (
    #         adata.obs["doublet_scores"],
    #         adata.obs["predicted_doublets"],
    #     ) = scrub.scrub_doublets()
    #     inds1 = np.where(
    #         (~adata.obs["predicted_doublets"].values)
    #         & (adata.obs["total_counts"] < max_counts)
    #         & (adata.obs["total_counts"] > min_counts)
    #     )
    #     del scrub
    # else:
    inds1 = np.where(
        (adata.obs["total_counts"] > min_counts)
        & (adata.obs["total_counts"] < max_counts))
    inds2 = np.where(adata.obs["pct_counts_mt"] < max_mt_ratio)

    if verbose:
        # if doublet_detection:
        #     print(np.sum(adata.obs["predicted_doublets"]), "doublets encountered")
        #     print(len(inds1[0]), "cells pass the doublet and counts filters.")
        # else:
        print(len(inds1[0]), "cells pass the count filter")
        print(len(inds2[0]), "cells pass the mt filter")

    ind_cells = np.intersect1d(inds1[0], inds2[0])
    if verbose:
        print("Cells selected", len(ind_cells))
    adata._inplace_subset_obs(ind_cells)
    gc.collect()
def balanced_pca(adata: anndata.AnnData, groups: str = "pre_clusters",
                 max_cell_prop=0.1, n_comps=200, scale=False):
    """
    Given a categorical variable (e.g., a pre-clustering label), perform balanced
    PCA by downsampling cells in the large categories to make the overall
    population more balanced, so the PCs are expected to represent more variance
    among small categories.

    Parameters
    ----------
    adata
        adata after preprocessing and feature selection steps
    groups
        the name of the categorical variable in adata.obs
    max_cell_prop
        any single category with more cells than `n_cell * max_cell_prop` will be
        downsampled to this number.
    n_comps
        Number of components in PCA
    scale
        whether to scale the input matrix before PCA

    Returns
    -------
    adata with PC information stored in obsm, varm and uns, as
    :func:`scanpy.tl.pca` does.
    """
    # downsample large clusters
    use_cells = []
    size_to_downsample = max(int(adata.shape[0] * max_cell_prop), 50)
    for cluster, sub_df in adata.obs.groupby(groups):
        if sub_df.shape[0] > size_to_downsample:
            use_cells += sub_df.sample(size_to_downsample, random_state=0).index.tolist()
        else:
            use_cells += sub_df.index.tolist()

    # get training adata
    if len(use_cells) == adata.shape[0]:
        downsample = False
        adata_train = adata
    else:
        downsample = True
        adata_train = adata[use_cells, :].copy()

    # in case there are fewer cells than n_comps
    n_comps = min(min(adata_train.shape), n_comps)

    # scale (optional)
    if scale:
        scaler = StandardScaler()
        adata_train.X = scaler.fit_transform(adata_train.X)
    else:
        scaler = None

    # pca
    sc.tl.pca(
        adata_train,
        n_comps=n_comps,
        zero_center=True,
        svd_solver="arpack",
        random_state=0,
        return_info=False,
        use_highly_variable=None,
        dtype="float32",
        copy=False,
        chunked=False,
        chunk_size=None,
    )

    # transfer PCA result to full adata
    if downsample:
        if scale:
            # scale all cells with the same scaler
            adata.X = scaler.transform(adata.X)
        adata.varm["PCs"] = adata_train.varm["PCs"]
        adata.obsm["X_pca"] = adata.X @ adata_train.varm["PCs"]
        adata.uns["pca"] = adata_train.uns["pca"]
    return adata
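# Usage sketch for balanced_pca, assuming adata already holds selected features and
# some pre-clustering label; here the bundled pbmc68k_reduced dataset's 'louvain'
# column stands in for the 'pre_clusters' label the function expects by default.
import scanpy as sc

demo = sc.datasets.pbmc68k_reduced()
demo.obs['pre_clusters'] = demo.obs['louvain']   # stand-in label for the demo
demo = balanced_pca(demo, groups='pre_clusters', n_comps=30)
print(demo.obsm['X_pca'].shape)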
def combat(adata: AnnData, key: str = 'batch', inplace: bool = True):
    """
    ComBat function for batch effect correction [Johnson07]_ [Leek12]_.

    Corrects for batch effects by fitting linear models, gains statistical power
    via an EB framework where information is borrowed across genes. This uses the
    implementation of `ComBat <https://github.com/brentp/combat.py>`__ [Pedersen12]_.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix
    key: `str`, optional (default: `"batch"`)
        Key to a categorical annotation from adata.obs that will be used for
        batch effect removal
    inplace: bool, optional (default: `True`)
        Whether to replace adata.X or to return the corrected data

    Returns
    -------
    Depending on the value of inplace, either returns an updated AnnData object
    or modifies the passed one.
    """

    # check the input
    if key not in adata.obs.keys():
        raise ValueError('Could not find the key {!r} in adata.obs'.format(key))

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(data=X, index=adata.var_names, columns=adata.obs_names)

    # construct a pandas series of the batch annotation
    batch = pd.Series(adata.obs[key])
    model = pd.DataFrame({'batch': batch})
    batch_items = model.groupby("batch").groups.items()
    batch_info = [v for k, v in batch_items]
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    sys.stderr.write("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = stand_data(model, data)

    # fitting the parameters on the standardized data
    sys.stderr.write("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = np.dot(
        np.dot(la.inv(np.dot(batch_design.T, batch_design)), batch_design.T),
        s_data.T)
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data[batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(aprior, delta_hat))
    b_prior = list(map(bprior, delta_hat))

    sys.stderr.write("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # temp stores our estimates for the batch effect parameters.
        # temp[0] is the additive batch effect
        # temp[1] is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data[batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )
        gamma_star.append(gamma)
        delta_star.append(delta)

    sys.stderr.write("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from above
    # loop over all batches in the data
    for j, batch_idxs in enumerate(batch_info):
        # we basically subtract the additive batch effect, rescale by the ratio
        # of multiplicative batch effect to pooled variance and add the overall
        # gene-wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(bayesdata[batch_idxs]
                         - np.dot(batch_design.loc[batch_idxs], gamma_star).T)
        bayesdata[batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
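# The per-batch adjustment above, restated on a toy array: subtract the EB additive
# effect, divide by the square root of the EB multiplicative effect, then (in the
# real function) restore the pooled scale and gene-wise mean. Shapes follow the
# code's genes x cells convention; the random values are purely illustrative.
import numpy as np

s = np.random.randn(4, 6)                  # standardized data, one batch of 6 cells
gamma = np.random.randn(4)                 # EB additive effect per gene
delta = np.abs(np.random.randn(4)) + 0.5   # EB multiplicative effect per gene
adjusted = (s - gamma[:, None]) / np.sqrt(delta)[:, None]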
def test_set_x_is_none():
    # test setter and getter
    adata = AnnData(np.array([[1, 2, 3], [4, 5, 6]]), dict(o1=[1, 2], o2=[3, 4]))
    adata.X = None
    assert adata.X is None
def _replace_back(adata: AnnData, replaced: Dict[str, utt.Matrix]) -> None:
    for name, matrix in replaced.items():
        if name == "__x__":
            adata.X = matrix
        else:
            adata.layers[name] = matrix
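# The two helpers above are meant to bracket a layout-sensitive computation: force
# every matrix into the required layout, run, then restore the originals. A minimal
# sketch, assuming the metacells-style utt/get_vo_proper utilities they already use;
# with_layout is a hypothetical wrapper name.
def with_layout(adata, layout, compute):
    replaced = _replace_with_layout(adata, layout)
    try:
        return compute(adata)
    finally:
        _replace_back(adata, replaced)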
def normalize_total(
    adata: AnnData,
    target_sum: Optional[float] = None,
    exclude_highly_expressed: bool = False,
    max_fraction: float = 0.05,
    key_added: Optional[str] = None,
    layers: Union[Literal['all'], Iterable[str]] = None,
    layer_norm: Optional[str] = None,
    inplace: bool = True,
) -> Optional[Dict[str, np.ndarray]]:
    """\
    Normalize counts per cell.

    If choosing `target_sum=1e6`, this is CPM normalization.

    If `exclude_highly_expressed=True`, very highly expressed genes are excluded
    from the computation of the normalization factor (size factor) for each cell.
    This is meaningful as these can strongly influence the resulting normalized
    values for all other genes [Weinreb17]_.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Params
    ------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    target_sum
        If `None`, after normalization, each observation (cell) has a total
        count equal to the median of total counts for observations (cells)
        before normalization.
    exclude_highly_expressed
        Exclude (very) highly expressed genes for the computation of the
        normalization factor (size factor) for each cell. A gene is considered
        highly expressed, if it has more than `max_fraction` of the total counts
        in at least one cell. The not-excluded genes will sum up to `target_sum`.
    max_fraction
        If `exclude_highly_expressed=True`, consider a gene as highly expressed
        if it has more counts than `max_fraction` of the original total counts
        in at least one cell.
    key_added
        Name of the field in `adata.obs` where the normalization factor is
        stored.
    layers
        List of layers to normalize. Set to `'all'` to normalize all layers.
    layer_norm
        Specifies how to normalize layers:

        * If `None`, after normalization, for each layer in *layers* each cell
          has a total count equal to the median of the *counts_per_cell* before
          normalization of the layer.
        * If `'after'`, for each layer in *layers* each cell has a total count
          equal to `target_sum`.
        * If `'X'`, for each layer in *layers* each cell has a total count
          equal to the median of total counts for observations (cells) of
          `adata.X` before normalization.
    inplace
        Whether to update `adata` or return dictionary with normalized copies
        of `adata.X` and `adata.layers`.

    Returns
    -------
    Returns dictionary with normalized copies of `adata.X` and `adata.layers` or
    updates `adata` with normalized version of the original `adata.X` and
    `adata.layers`, depending on `inplace`.

    Example
    --------
    >>> from anndata import AnnData
    >>> import scanpy as sc
    >>> sc.settings.verbosity = 2
    >>> np.set_printoptions(precision=2)
    >>> adata = AnnData(np.array([
    ...     [3, 3, 3, 6, 6],
    ...     [1, 1, 1, 2, 2],
    ...     [1, 22, 1, 2, 2],
    ... ]))
    >>> adata.X
    array([[ 3.,  3.,  3.,  6.,  6.],
           [ 1.,  1.,  1.,  2.,  2.],
           [ 1., 22.,  1.,  2.,  2.]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(adata, target_sum=1, inplace=False)['X']
    >>> X_norm
    array([[0.14, 0.14, 0.14, 0.29, 0.29],
           [0.14, 0.14, 0.14, 0.29, 0.29],
           [0.04, 0.79, 0.04, 0.07, 0.07]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(
    ...     adata, target_sum=1, exclude_highly_expressed=True,
    ...     max_fraction=0.2, inplace=False
    ... )['X']
    The following highly-expressed genes are not considered during normalization factor computation:
    ['1', '3', '4']
    >>> X_norm
    array([[ 0.5,  0.5,  0.5,  1. ,  1. ],
           [ 0.5,  0.5,  0.5,  1. ,  1. ],
           [ 0.5, 11. ,  0.5,  1. ,  1. ]], dtype=float32)
    """
    if max_fraction < 0 or max_fraction > 1:
        raise ValueError('Choose max_fraction between 0 and 1.')

    if layers == 'all':
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        raise ValueError(
            f"`layers` needs to be a list of strings or 'all', not {layers!r}")

    view_to_actual(adata)

    gene_subset = None
    msg = 'normalizing counts per cell'
    if exclude_highly_expressed:
        counts_per_cell = adata.X.sum(1)  # original counts per cell
        counts_per_cell = np.ravel(counts_per_cell)

        # at least one cell has more than max_fraction of counts per cell
        gene_subset = (adata.X > counts_per_cell[:, None] * max_fraction).sum(0)
        gene_subset = (np.ravel(gene_subset) == 0)
        msg += (
            ' The following highly-expressed genes are not considered during '
            f'normalization factor computation:\n{adata.var_names[~gene_subset].tolist()}'
        )
    start = logg.info(msg)

    # counts per cell for subset, if max_fraction != 1
    X = adata.X if gene_subset is None else adata[:, gene_subset].X
    counts_per_cell = X.sum(1)
    # get rid of adata view
    counts_per_cell = np.ravel(counts_per_cell).copy()

    cell_subset = counts_per_cell > 0
    if not np.all(cell_subset):
        logg.warning('Some cells have total count of genes equal to zero')

    if layer_norm == 'after':
        after = target_sum
    elif layer_norm == 'X':
        after = np.median(counts_per_cell[cell_subset])
    elif layer_norm is None:
        after = None
    else:
        raise ValueError('layer_norm should be "after", "X" or None')
    del cell_subset

    if inplace:
        if key_added is not None:
            adata.obs[key_added] = counts_per_cell
        adata.X = _normalize_data(adata.X, counts_per_cell, target_sum)
    else:
        # not recarray because need to support sparse
        dat = dict(
            X=_normalize_data(adata.X, counts_per_cell, target_sum, copy=True),
            norm_factor=counts_per_cell,
        )

    for layer_name in (layers or ()):
        layer = adata.layers[layer_name]
        counts = np.ravel(layer.sum(1))
        if inplace:
            adata.layers[layer_name] = _normalize_data(layer, counts, after)
        else:
            dat[layer_name] = _normalize_data(layer, counts, after, copy=True)

    logg.info(
        '    finished ({time_passed})',
        time=start,
    )
    if key_added is not None:
        logg.debug(
            f'and added {key_added!r}, counts per cell before normalization (adata.obs)'
        )

    return dat if not inplace else None
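# Usage sketch: CPM via target_sum=1e6, or median-based size factors by default
# (this is the released scanpy API; the tiny matrix is illustrative).
import numpy as np
import scanpy as sc
from anndata import AnnData

demo = AnnData(np.array([[3.0, 3.0, 6.0], [1.0, 1.0, 2.0]]))
cpm = sc.pp.normalize_total(demo, target_sum=1e6, inplace=False)['X']  # normalized copy
sc.pp.normalize_total(demo)  # in place; each cell now sums to the median total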
def magic(
    adata: AnnData,
    name_list: Union[str, Sequence[str], None] = None,
    k: int = 10,
    a: int = 15,
    t: str = 'auto',
    n_pca: int = 100,
    knn_dist: str = 'euclidean',
    random_state: Optional[Union[int, RandomState]] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    copy: Optional[bool] = None,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_.

    MAGIC is an algorithm for denoising and transcript recovery of single cells
    applied to single-cell sequencing data. MAGIC builds a graph from the data
    and uses diffusion to smooth out noise and recover the data manifold.

    More information and bug reports
    `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit
    <https://krishnaswamylab.org/get-help>.

    Parameters
    ----------
    adata
        An anndata file with `.raw` attribute representing raw counts.
    name_list
        Denoised genes to return. The default `'all_genes'`/`None` may require
        a large amount of memory if the input data is sparse. Another
        possibility is `'pca_only'`.
    k
        number of nearest neighbors on which to build kernel
    a
        sets decay rate of kernel tails. If None, alpha decaying kernel is not
        used
    t
        power to which the diffusion operator is powered. This sets the level
        of diffusion. If 'auto', t is selected according to the Procrustes
        disparity of the diffused data
    n_pca
        Number of principal components to use for calculating neighborhoods.
        For extremely large datasets, using n_pca < 20 allows neighborhoods to
        be calculated in roughly log(n_samples) time.
    knn_dist
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used as distance metric
        for building the kNN graph. If 'precomputed', `data` should be an
        n_samples x n_samples distance or affinity matrix
    random_state
        Random seed. Defaults to the global `numpy` random number generator
    n_jobs
        Number of threads to use in training. All cores are used by default.
    verbose
        If `True` or an integer `>= 2`, print status messages. If `None`,
        `sc.settings.verbosity` is used.
    copy
        If true, a copy of anndata is returned. If `None`, `copy` is True if
        `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False
        if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data
        will otherwise have different column names from the input data.
    kwargs
        Additional arguments to `magic.MAGIC`

    Returns
    -------
    If `copy` is True, AnnData object is returned.

    If `subset_genes` is not `all_genes`, PCA on MAGIC values of cells are
    stored in `adata.obsm['X_magic']` and `adata.X` is not modified.

    The raw counts are stored in `.raw` attribute of AnnData object.

    Examples
    --------
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.paul15()
    >>> sc.pp.normalize_per_cell(adata)
    >>> sc.pp.sqrt(adata)  # or sc.pp.log1p(adata)
    >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], k=5)
    >>> adata_magic.shape
    (2730, 3)
    >>> sce.pp.magic(adata, name_list='pca_only', k=5)
    >>> adata.obsm['X_magic'].shape
    (2730, 100)
    >>> sce.pp.magic(adata, name_list='all_genes', k=5)
    >>> adata.X.shape
    (2730, 3451)
    """
    try:
        from magic import MAGIC
    except ImportError:
        raise ImportError(
            'Please install magic package via `pip install --user '
            'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`'
        )

    start = logg.info('computing MAGIC')
    all_or_pca = isinstance(name_list, (str, type(None)))
    if all_or_pca and name_list not in {"all_genes", "pca_only", None}:
        raise ValueError(
            "Invalid string value for `name_list`: "
            "Only `'all_genes'` and `'pca_only'` are allowed."
        )
    if copy is None:
        copy = not all_or_pca
    elif not all_or_pca and not copy:
        raise ValueError(
            "Can only perform MAGIC in-place with `name_list=='all_genes'` or "
            f"`name_list=='pca_only'` (got {name_list}). Consider setting "
            "`copy=True`"
        )
    adata = adata.copy() if copy else adata
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs

    X_magic = MAGIC(
        k=k,
        a=a,
        t=t,
        n_pca=n_pca,
        knn_dist=knn_dist,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata, genes=name_list)
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_magic', PCA on MAGIC coordinates (adata.obsm)"
              if name_list == "pca_only" else ''),
    )

    # update AnnData instance
    if name_list == "pca_only":
        # special case - update adata.obsm with smoothed values
        adata.obsm["X_magic"] = X_magic.X
    elif copy:
        # just return X_magic
        X_magic.raw = adata
        adata = X_magic
    else:
        # replace data with smoothed data
        adata.raw = adata
        adata.X = X_magic.X

    if copy:
        return adata
def generate_synthetic_dataset(adata: AnnData, sim_type: str = "avg", seed: int = 42):
    """Create cell-aggregate samples for ground-truth spatial decomposition task.

    Parameters
    ----------
    adata : AnnData
        Anndata object.
    sim_type : str
        Simulation type: either average `'avg'` or per cell `'cell'`.
    seed: int
        Seed for rng.

    Returns
    -------
    AnnData with:
        - `adata_spatial.obsm["proportions_true"]`: true proportion values.
        - `adata_spatial.X`: simulated counts (aggregate of sc dataset).
        - `adata_spatial.uns["sc_reference"]`: original sc adata for reference.

    The cell type labels are stored in adata_sc.obs["label"].
    """
    rng = np.random.default_rng(seed)
    adata.obs["label"] = adata.obs.label.astype("category")

    if isinstance(adata.X, csr_matrix):
        adata.X = adata.X.toarray()

    n_genes = adata.shape[1]
    n_cells = adata.shape[0]
    n_types = len(set(adata.obs["label"].values))

    # TODO(make these arguments)
    bead_depth = 1000
    num_of_beads = n_cells * 2

    # generate proportion values
    props = rng.dirichlet(np.ones(n_types), num_of_beads)
    true_proportion = np.zeros((num_of_beads, n_types))
    bead_to_gene_matrix = np.zeros((num_of_beads, n_genes))

    # if sim_type avg, generate from avg profiles
    if sim_type == "avg":
        profile_mean = obs_means(adata, "label")
        sc.pp.normalize_total(profile_mean, target_sum=1, inplace=True)
        # run for each bead
        for bead_index in range(num_of_beads):
            allocation = rng.multinomial(bead_depth, props[bead_index, :], size=1)[0]
            true_proportion[bead_index, :] = allocation.copy()
            for j in range(n_types):
                # trick to make sum(arr) < 1.0
                profile_mean.X[j, :] /= (profile_mean.X[j, :].sum() + 1e-5)
                gene_exp = rng.multinomial(allocation[j], profile_mean.X[j, :], size=1)[0]
                bead_to_gene_matrix[bead_index, :] += gene_exp
    elif sim_type == "cell":
        # generate from cells: assign beads to actual cells
        # cell_ids with this cluster
        cells_to_sample_from_celltype = []
        grouped = adata.obs.groupby("label")
        for idx in grouped.indices.values():
            cells_to_sample_from_celltype += [idx]

        # Actual cells assigned randomly
        cell_association = np.zeros((num_of_beads, n_types)).astype(int)
        for j in range(n_types):
            cell_association[:, j] = rng.integers(
                low=0, high=len(cells_to_sample_from_celltype[j]), size=num_of_beads)

        counts = np.array(adata.X)
        rowSums = counts.sum(axis=1, keepdims=True)
        X_norm_prof = np.divide(counts, rowSums, where=rowSums > 0)

        for bead_index in range(num_of_beads):
            allocation = rng.multinomial(bead_depth, props[bead_index, :], size=1)[0]
            true_proportion[bead_index, :] = allocation.copy()
            for j in range(n_types):
                cell_index = cells_to_sample_from_celltype[j][cell_association[bead_index, j]]
                gene_exp = rng.multinomial(allocation[j], X_norm_prof[cell_index, :], size=1)[0]
                bead_to_gene_matrix[bead_index, :] += gene_exp
    else:
        raise ValueError(f"{sim_type} is not a valid key for `sim_type`.")

    bead_barcodes = np.arange(num_of_beads)
    adata_spatial = AnnData(
        bead_to_gene_matrix,
        obs=dict(obs_names=bead_barcodes),
        var=dict(var_names=adata.var_names),
    )
    true_proportion = true_proportion / true_proportion.sum(1)[:, np.newaxis].astype("float64")

    # fake coordinates
    adata_spatial.obsm["spatial"] = rng.random((adata_spatial.shape[0], 2))
    adata_spatial.obsm["proportions_true"] = true_proportion
    adata_spatial.uns["sc_reference"] = adata.copy()
    return adata_spatial
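# Usage sketch for generate_synthetic_dataset, assuming a labelled single-cell
# reference (adata_sc is hypothetical): 'label' must exist in adata_sc.obs, and
# obs_means (used by sim_type='avg') must be importable from the same module.
adata_sp = generate_synthetic_dataset(adata_sc, sim_type="cell", seed=0)
props = adata_sp.obsm["proportions_true"]   # rows sum to 1
counts = adata_sp.X                         # simulated bead counts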
def prep_simple(
    adata: AnnData,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = False,
    log_transform: bool = True,
    division_factor: float = 1,
    score_cc: bool = True,
    verbose: bool = True,
):
    """Pre-processes AnnData without pooling. Should be done only once.

    Parameters
    ----------
    adata: AnnData
        The raw AnnData object to be pre-processed
    normalize_counts: bool
        Set it to False if library does not need normalization
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after highly variable filter. Used if
        `filter_var_genes` is True. Passed to sc.pp.highly_variable_genes.
    for_pooling: bool
        Set to True if the function is called by the `prep_pooling` function.
        Changes the return object parameters.
    log_transform: bool
        Set it to false if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor, divides the counts matrix by this value.
    score_cc: bool
        If True, cell cycle scores will be added.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    `None`
    """
    assert division_factor != 0, "Null division factor. Terminating..."

    if 'ndarray' not in str(type(adata.X)):
        adata.X = adata.X.toarray()
    adata.X = adata.X / division_factor

    if "total_counts" not in adata.obs.keys():
        adata.obs["total_counts"] = adata.X.sum(1)

    # Normalization step
    if normalize_counts:
        sc.pp.normalize_total(adata, target_sum=np.median(adata.obs["total_counts"]))

    # Score cell cycle (multiple signatures)
    if score_cc:
        if verbose:
            print("Scoring cell cycle...")
        _score_cell_cycle(adata, g1s_markers, "G1S_Tirosh")
        _score_cell_cycle(adata, g2m_markers, "G2M_Tirosh")
        _score_cell_cycle(adata, G1S_genes_Freeman, "G1S_Freeman")
        _score_cell_cycle(adata, G2M_genes_Freeman, "G2M_Freeman")
        _score_cell_cycle(adata, g1s_markers_short, "G1S_short")
        _score_cell_cycle(adata, g2m_markers_short, "G2M_short")
        _score_cell_cycle(adata, histone_markers, "Histones")
        adata.obs['G1-S'] = adata.obs['G1S_Tirosh']
        adata.obs['G2-M'] = adata.obs['G2M_Tirosh']

    # Highly variable genes filtering
    if filter_var_genes:
        variances = np.var(adata.X, axis=0)
        inds = np.flip(np.argsort(variances))
        ind_genes = inds[0:n_top_genes]
        if 0 in variances[ind_genes]:
            ind_first_zero = np.argwhere(variances[ind_genes] == 0)[0][0]
            ind_genes = ind_genes[0:ind_first_zero]
        adata._inplace_subset_var(ind_genes)

    # Logarithmization
    if log_transform:
        sc.pp.log1p(adata, base=10)

    if not for_pooling:
        adata.uns["scycle"] = {
            "preprocess": {
                "method": "simple",
                "n_top_genes": n_top_genes,
                "normalize_counts": normalize_counts,
                "filter_var_genes": filter_var_genes,
                "division_factor": division_factor,
                "log_transform": log_transform,
            }
        }
    gc.collect()
def embedding(
    data: Union[AnnData, MuData],
    basis: str,
    color: Optional[Union[str, Sequence[str]]] = None,
    use_raw: Optional[bool] = None,
    layer: Optional[str] = None,
    **kwargs,
):
    """
    Scatter plot for .obs

    Produce a scatter plot in the defined basis, which can also be a basis
    inside any modality, e.g. ``"rna:X_pca"``.

    See :func:`scanpy.pl.embedding` for details.

    Parameters
    ----------
    data : Union[AnnData, MuData]
        MuData or AnnData object
    basis : str
        Name of the `obsm` basis to use
    color : Optional[Union[str, typing.Sequence[str]]], optional (default: None)
        Keys for variables or annotations of observations (.obs columns).
        Can be from any modality.
    use_raw : Optional[bool], optional (default: None)
        Use `.raw` attribute of the modality where a feature (from `color`) is
        derived from. If `None`, defaults to `True` if `.raw` is present and a
        valid `layer` is not provided.
    layer : Optional[str], optional (default: None)
        Name of the layer in the modality where a feature (from `color`) is
        derived from. No layer is used by default. If a valid `layer` is
        provided, this takes precedence over `use_raw=True`.
    """
    if isinstance(data, AnnData):
        return sc.pl.embedding(data, basis=basis, color=color,
                               use_raw=use_raw, layer=layer, **kwargs)

    # `data` is MuData
    if basis not in data.obsm and "X_" + basis in data.obsm:
        basis = "X_" + basis

    if basis in data.obsm:
        adata = data
        basis_mod = basis
    else:
        # basis is not a joint embedding
        try:
            mod, basis_mod = basis.split(":")
        except ValueError:
            raise ValueError(f"Basis {basis} is not present in the MuData object (.obsm)")

        if mod not in data.mod:
            raise ValueError(
                f"Modality {mod} is not present in the MuData object with modalities {', '.join(data.mod)}"
            )

        adata = data.mod[mod]
        if basis_mod not in adata.obsm:
            if "X_" + basis_mod in adata.obsm:
                basis_mod = "X_" + basis_mod
            elif len(adata.obsm) > 0:
                raise ValueError(
                    f"Basis {basis_mod} is not present in the modality {mod} with embeddings {', '.join(adata.obsm)}"
                )
            else:
                raise ValueError(
                    f"Basis {basis_mod} is not present in the modality {mod} with no embeddings"
                )

    obs = data.obs.loc[adata.obs.index.values]

    if color is None:
        ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp)
        return sc.pl.embedding(ad, basis=basis_mod, **kwargs)

    # Some `color` has been provided
    if isinstance(color, str):
        keys = [color]
    elif isinstance(color, Iterable):
        keys = color
    else:
        raise TypeError("Expected color to be a string or an iterable.")

    # Fetch respective features
    if not all([key in obs for key in keys]):
        # {'rna': [True, False], 'prot': [False, True]}
        keys_in_mod = {m: [key in data.mod[m].var_names for key in keys] for m in data.mod}

        # .raw slots might have exclusive var_names
        if use_raw is None or use_raw:
            for i, k in enumerate(keys):
                for m in data.mod:
                    if not keys_in_mod[m][i] and data.mod[m].raw is not None:
                        keys_in_mod[m][i] = k in data.mod[m].raw.var_names

        for m in data.mod:
            if np.sum(keys_in_mod[m]) > 0:
                mod_keys = np.array(keys)[keys_in_mod[m]]

                if use_raw is None or use_raw:
                    if data.mod[m].raw is not None:
                        keysidx = data.mod[m].raw.var.index.get_indexer_for(mod_keys)
                        fmod_adata = AnnData(
                            X=data.mod[m].raw.X[:, keysidx],
                            var=pd.DataFrame(index=mod_keys),
                            obs=data.mod[m].obs,
                        )
                    else:
                        if use_raw:
                            warnings.warn(
                                f"Attribute .raw is None for the modality {m}, using .X instead"
                            )
                        fmod_adata = data.mod[m][:, mod_keys]
                else:
                    fmod_adata = data.mod[m][:, mod_keys]

                if layer is not None:
                    if layer in data.mod[m].layers:
                        fmod_adata.X = data.mod[m][:, mod_keys].layers[layer]
                        if use_raw:
                            warnings.warn(f"Layer='{layer}' superseded use_raw={use_raw}")
                    else:
                        warnings.warn(
                            f"Layer {layer} is not present for the modality {m}, using count matrix instead"
                        )

                x = fmod_adata.X.toarray() if issparse(fmod_adata.X) else fmod_adata.X
                obs = obs.join(
                    pd.DataFrame(x, columns=mod_keys, index=fmod_adata.obs_names),
                    how="left",
                )

    ad = AnnData(obs=obs, obsm=adata.obsm, obsp=adata.obsp, uns=adata.uns)
    return sc.pl.embedding(ad, basis=basis_mod, color=color, **kwargs)
def sqrt_cpm(adata: ad.AnnData) -> ad.AnnData:
    """Normalize data to sqrt counts per million."""
    _cpm(adata)
    adata.X = scprep.transform.sqrt(adata.X)
    return adata
def _high_dim(adata: AnnData) -> np.ndarray:
    adata.X = adata.layers["counts"]
    adata = log_cpm_hvg(adata)
    high_dim = adata.X
    return high_dim.A if issparse(high_dim) else high_dim
def magic(
    adata: AnnData,
    name_list: Union[Literal['all_genes', 'pca_only'], Sequence[str], None] = None,
    *,
    knn: int = 5,
    decay: Optional[float] = 1,
    knn_max: Optional[int] = None,
    t: Union[Literal['auto'], int] = 3,
    n_pca: Optional[int] = 100,
    solver: Literal['exact', 'approximate'] = 'exact',
    knn_dist: str = 'euclidean',
    random_state: Optional[Union[int, RandomState]] = None,
    n_jobs: Optional[int] = None,
    verbose: bool = False,
    copy: Optional[bool] = None,
    **kwargs,
) -> Optional[AnnData]:
    """\
    Markov Affinity-based Graph Imputation of Cells (MAGIC) API [vanDijk18]_.

    MAGIC is an algorithm for denoising and transcript recovery of single cells
    applied to single-cell sequencing data. MAGIC builds a graph from the data
    and uses diffusion to smooth out noise and recover the data manifold.

    The algorithm implemented here has changed primarily in two ways compared
    to the algorithm described in [vanDijk18]_. Firstly, we use the adaptive
    kernel described in Moon et al, 2019 [Moon17]_ for improved stability.
    Secondly, data diffusion is applied in the PCA space, rather than the data
    space, for speed and memory improvements.

    More information and bug reports
    `here <https://github.com/KrishnaswamyLab/MAGIC>`__. For help, visit
    <https://krishnaswamylab.org/get-help>.

    Parameters
    ----------
    adata
        An anndata file with `.raw` attribute representing raw counts.
    name_list
        Denoised genes to return. The default `'all_genes'`/`None` may require
        a large amount of memory if the input data is sparse. Another
        possibility is `'pca_only'`.
    knn
        number of nearest neighbors on which to build kernel.
    decay
        sets decay rate of kernel tails. If None, alpha decaying kernel is not
        used.
    knn_max
        maximum number of nearest neighbors with nonzero connection. If `None`,
        will be set to 3 * `knn`.
    t
        power to which the diffusion operator is powered. This sets the level
        of diffusion. If 'auto', t is selected according to the Procrustes
        disparity of the diffused data.
    n_pca
        Number of principal components to use for calculating neighborhoods.
        For extremely large datasets, using n_pca < 20 allows neighborhoods to
        be calculated in roughly log(n_samples) time. If `None`, no PCA is
        performed.
    solver
        Which solver to use. "exact" uses the implementation described in
        van Dijk et al. (2018) [vanDijk18]_. "approximate" uses a faster
        implementation that performs imputation in the PCA space and then
        projects back to the gene space. Note, the "approximate" solver may
        return negative values.
    knn_dist
        recommended values: 'euclidean', 'cosine', 'precomputed'
        Any metric from `scipy.spatial.distance` can be used as distance metric
        for building the kNN graph. If 'precomputed', `data` should be an
        n_samples x n_samples distance or affinity matrix.
    random_state
        Random seed. Defaults to the global `numpy` random number generator.
    n_jobs
        Number of threads to use in training. All cores are used by default.
    verbose
        If `True` or an integer `>= 2`, print status messages. If `None`,
        `sc.settings.verbosity` is used.
    copy
        If true, a copy of anndata is returned. If `None`, `copy` is True if
        `genes` is not `'all_genes'` or `'pca_only'`. `copy` may only be False
        if `genes` is `'all_genes'` or `'pca_only'`, as the resultant data
        will otherwise have different column names from the input data.
    kwargs
        Additional arguments to `magic.MAGIC`.

    Returns
    -------
    If `copy` is True, AnnData object is returned.

    If `subset_genes` is not `all_genes`, PCA on MAGIC values of cells are
    stored in `adata.obsm['X_magic']` and `adata.X` is not modified.

    The raw counts are stored in `.raw` attribute of AnnData object.

    Examples
    --------
    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.paul15()
    >>> sc.pp.normalize_per_cell(adata)
    >>> sc.pp.sqrt(adata)  # or sc.pp.log1p(adata)
    >>> adata_magic = sce.pp.magic(adata, name_list=['Mpo', 'Klf1', 'Ifitm1'], knn=5)
    >>> adata_magic.shape
    (2730, 3)
    >>> sce.pp.magic(adata, name_list='pca_only', knn=5)
    >>> adata.obsm['X_magic'].shape
    (2730, 100)
    >>> sce.pp.magic(adata, name_list='all_genes', knn=5)
    >>> adata.X.shape
    (2730, 3451)
    """
    try:
        from magic import MAGIC, __version__
    except ImportError:
        raise ImportError(
            'Please install magic package via `pip install --user '
            'git+git://github.com/KrishnaswamyLab/MAGIC.git#subdirectory=python`'
        )
    else:
        if not version.parse(__version__) >= version.parse(MIN_VERSION):
            raise ImportError(
                'scanpy requires magic-impute >= '
                f'v{MIN_VERSION} (detected: v{__version__}). '
                'Please update magic package via `pip install --user '
                '--upgrade magic-impute`'
            )

    start = logg.info('computing MAGIC')
    all_or_pca = isinstance(name_list, (str, type(None)))
    if all_or_pca and name_list not in {"all_genes", "pca_only", None}:
        raise ValueError(
            "Invalid string value for `name_list`: "
            "Only `'all_genes'` and `'pca_only'` are allowed."
        )
    if copy is None:
        copy = not all_or_pca
    elif not all_or_pca and not copy:
        raise ValueError(
            "Can only perform MAGIC in-place with `name_list=='all_genes'` or "
            f"`name_list=='pca_only'` (got {name_list}). Consider setting "
            "`copy=True`"
        )
    adata = adata.copy() if copy else adata
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs

    X_magic = MAGIC(
        knn=knn,
        decay=decay,
        knn_max=knn_max,
        t=t,
        n_pca=n_pca,
        solver=solver,
        knn_dist=knn_dist,
        random_state=random_state,
        n_jobs=n_jobs,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata, genes=name_list)
    logg.info(
        '    finished',
        time=start,
        deep=(
            "added\n    'X_magic', PCA on MAGIC coordinates (adata.obsm)"
            if name_list == "pca_only"
            else ''
        ),
    )

    # update AnnData instance
    if name_list == "pca_only":
        # special case - update adata.obsm with smoothed values
        adata.obsm["X_magic"] = X_magic.X
    elif copy:
        # just return X_magic
        X_magic.raw = adata
        adata = X_magic
    else:
        # replace data with smoothed data
        adata.raw = adata
        adata.X = X_magic.X

    if copy:
        return adata
def pseudo_spot(
    adata: AnnData,
    tile_path: Union[Path, str] = Path("/tmp/tiles"),
    use_data: str = "raw",
    crop_size: Union[int, str] = "auto",
    platform: _PLATFORM = "Visium",
    weights: _WEIGHTING_MATRIX = "weights_matrix_all",
    copy: _COPY = "pseudo_spot_adata",
) -> Optional[AnnData]:
    """\
    Use spatial location (S), tissue morphological feature (M) and gene
    expression (E) information to impute the gap between spots and increase
    the resolution for gene detection.

    Parameters
    ----------
    adata
        Annotated data matrix.
    use_data
        Input data, can be `raw` counts, log transformed data or dimension
        reduced space (`X_pca` and `X_umap`)
    tile_path
        Path to save spot image tiles
    crop_size
        Size of tiles; if `auto`, automatically detect crop size
    weights
        Weighting matrix for imputation.
        If `weights_matrix_all`, the matrix combines all information from
        spatial location (S), tissue morphological feature (M) and gene
        expression (E).
        If `weights_matrix_pd_md`, the matrix combines information from
        spatial location (S) and tissue morphological feature (M).
    platform
        `Visium` or `Old_ST`
    copy
        Return the imputed Anndata if `pseudo_spot_adata`, or the merged
        Anndata of original and imputed data if `combined_adata`.

    Returns
    -------
    Anndata
    """
    from sklearn.linear_model import LinearRegression
    import math

    if platform == "Visium":
        img_row = adata.obs["imagerow"]
        img_col = adata.obs["imagecol"]
        array_row = adata.obs["array_row"]
        array_col = adata.obs["array_col"]
        rate = 3

        obs_df_ = adata.obs[["array_row", "array_col"]].copy()
        obs_df_.loc[:, "array_row"] = obs_df_["array_row"].apply(lambda x: x - 2 / 3)
        obs_df = adata.obs[["array_row", "array_col"]].copy()
        obs_df.loc[:, "array_row"] = obs_df["array_row"].apply(lambda x: x + 2 / 3)
        obs_df = pd.concat([obs_df, obs_df_]).reset_index()
        obs_df = obs_df.drop_duplicates(subset=["array_row", "array_col"], keep="last")
    elif platform == "Old_ST":
        img_row = adata.obs["imagerow"]
        img_col = adata.obs["imagecol"]
        array_row = adata.obs_names.map(lambda x: x.split("x")[1])
        array_col = adata.obs_names.map(lambda x: x.split("x")[0])
        rate = 1.5

        # half-step offsets towards the eight neighbouring positions
        offsets = [
            (-1 / 2, 0), (1 / 2, 0),            # left, right
            (0, -1 / 2), (0, 1 / 2),            # up, down
            (-1 / 2, -1 / 2), (1 / 2, -1 / 2),  # left-up, right-up
            (-1 / 2, 1 / 2), (1 / 2, 1 / 2),    # left-down, right-down
        ]
        shifted = []
        for d_row, d_col in offsets:
            df = pd.DataFrame(
                {"array_row": array_row.to_list(), "array_col": array_col.to_list()},
                dtype=np.float64,
            )
            df["array_row"] += d_row
            df["array_col"] += d_col
            shifted.append(df)
        obs_df = pd.concat(shifted).reset_index()
        obs_df = obs_df.drop_duplicates(subset=["array_row", "array_col"], keep="last")
    else:
        raise ValueError(f"Platform {platform!r} is not supported.")

    reg_row = LinearRegression().fit(array_row.values.reshape(-1, 1), img_row)
    reg_col = LinearRegression().fit(array_col.values.reshape(-1, 1), img_col)

    obs_df.loc[:, "imagerow"] = obs_df.loc[:, "array_row"] * reg_row.coef_ + reg_row.intercept_
    obs_df.loc[:, "imagecol"] = obs_df.loc[:, "array_col"] * reg_col.coef_ + reg_col.intercept_

    impute_coor = obs_df[["imagecol", "imagerow"]]
    coor = pd.concat([adata.obs[["imagecol", "imagerow"]], impute_coor])

    point_tree = scipy.spatial.cKDTree(coor)
    n_neighbour = []
    unit = math.sqrt(reg_row.coef_**2 + reg_col.coef_**2)
    for i in range(len(impute_coor)):
        current_neighbour = point_tree.query_ball_point(impute_coor.values[i], round(unit))
        current_neighbour = [x for x in current_neighbour if x < len(adata)]
        n_neighbour.append(len(current_neighbour))

    obs_df["n_neighbour"] = n_neighbour
    obs_df = obs_df.loc[obs_df["n_neighbour"] > 1, :].reset_index()
    obs_df.index = obs_df.index.map(lambda x: "Pseudo_Spot_" + str(x))

    impute_df = pd.DataFrame(0, index=obs_df.index, columns=adata.var_names)
    pseudo_spot_adata = AnnData(impute_df, obs=obs_df)
    pseudo_spot_adata.uns["spatial"] = adata.uns["spatial"]

    if crop_size == "auto":
        crop_size = round(unit / 2)
    stlearn.pp.tiling(pseudo_spot_adata, tile_path, crop_size=crop_size)
    stlearn.pp.extract_feature(pseudo_spot_adata)

    if use_data == "raw":
        if isinstance(adata.X, csr_matrix):
            count_embed = adata.X.toarray()
        elif isinstance(adata.X, np.ndarray):
            count_embed = adata.X
        elif isinstance(adata.X, pd.DataFrame):
            count_embed = adata.X.values
        else:
            print(f"{type(adata.X)} is not a valid type")
    else:
        count_embed = adata.obsm[use_data]

    calculate_weight_matrix(adata, pseudo_spot_adata, pseudo_spots=True, platform=platform)

    impute_neighbour(pseudo_spot_adata, count_embed=count_embed, weights=weights)

    assert pseudo_spot_adata.shape == pseudo_spot_adata.obsm["imputed_data"].shape
    pseudo_spot_adata.X = pseudo_spot_adata.obsm["imputed_data"]
    pseudo_spot_adata = pseudo_spot_adata[np.sum(pseudo_spot_adata.X, axis=1) > 0]

    print("Done")
    if copy == "pseudo_spot_adata":
        return pseudo_spot_adata
    else:
        return _merge(adata, pseudo_spot_adata)
def vlm_to_adata(vlm, trans_mats=None, cells_ixs=None, em_key=None):
    """Conversion function from the velocyto world to the scanpy world

    Parameters
    --------
    vlm: VelocytoLoom Object
    trans_mats: None or dict
        A dict of all relevant transition matrices
    cells_ixs: list of int
        These are the indices of the subsampled cells

    Output
    adata: AnnData object
    """
    # create the anndata object
    adata = AnnData(
        vlm.Sx_sz.T, vlm.ca, vlm.ra,
        layers=dict(unspliced=vlm.U.T, spliced=vlm.S.T, velocity=vlm.velocity.T),
        uns=dict(velocity_graph=vlm.corrcoef,
                 louvain_colors=list(np.unique(vlm.colorandum))))

    # add uns annotations
    if trans_mats is not None:
        for key, value in trans_mats.items():
            adata.uns[key] = trans_mats[key]
    if cells_ixs is not None:
        adata.uns['cell_ixs'] = cells_ixs

    # rename clusters to louvain
    try:
        ix = np.where(adata.obs.columns == 'Clusters')[0][0]
        obs_names = list(adata.obs.columns)
        obs_names[ix] = 'louvain'
        adata.obs.columns = obs_names

        # make louvain a categorical field
        adata.obs['louvain'] = pd.Categorical(adata.obs['louvain'])
    except IndexError:
        print('Could not find a field \'Clusters\' in vlm.ca.')

    # save the pca embedding
    adata.obsm['X_pca'] = vlm.pcs[:, range(50)]

    # transfer the embedding
    if em_key is not None:
        adata.obsm['X_' + em_key] = vlm.ts
        adata.obsm['velocity_' + em_key] = vlm.delta_embedding

    # make things sparse
    adata.X = scp.sparse.csr_matrix(adata.X)
    adata.uns['velocity_graph'] = scp.sparse.csr_matrix(adata.uns['velocity_graph'])

    # make the layers sparse
    adata.layers['unspliced'] = scp.sparse.csr_matrix(adata.layers['unspliced'])
    adata.layers['spliced'] = scp.sparse.csr_matrix(adata.layers['spliced'])
    adata.layers['velocity'] = scp.sparse.csr_matrix(adata.layers['velocity'])

    return adata
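# Usage sketch for vlm_to_adata, assuming a velocyto.VelocytoLoom that has already
# run the standard pipeline so Sx_sz, U, S, velocity, corrcoef, pcs and colorandum
# are populated; "sample.loom" is a placeholder path.
import velocyto

vlm = velocyto.VelocytoLoom("sample.loom")
# ... normalization, kNN imputation and velocity estimation go here ...
adata = vlm_to_adata(vlm, em_key="tsne")
adata.write("sample_velocity.h5ad")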
def prep_simple(
    adata: AnnData,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = True,
    log_transform: bool = True,
    division_factor: float = 1,
    verbose: bool = True,
):
    """Pre-processes AnnData without pooling. Should be done only once.

    Parameters
    ----------
    adata: AnnData
        The raw AnnData object to be pre-processed
    normalize_counts: bool
        Set it to False if library does not need normalization
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after highly variable filter. Used if
        `filter_var_genes` is True. Passed to sc.pp.highly_variable_genes.
    for_pooling: bool
        Set to True if the function is called by the `prep_pooling` function.
        Changes the return object parameters.
    log_transform: bool
        Set it to false if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor, divides the counts matrix by this value.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    None
    """
    assert division_factor != 0, "Null division factor. Terminating..."
    adata.X = adata.X / division_factor

    # Normalization step
    if normalize_counts:
        sc.pp.normalize_total(adata, target_sum=np.median(adata.obs["total_counts"]))

    # Highly variable genes filtering
    if filter_var_genes:
        variances = np.var(adata.X, axis=0)
        inds = np.flip(np.argsort(variances))
        ind_genes = inds[0:n_top_genes]
        if 0 in variances[ind_genes]:
            ind_first_zero = np.argwhere(variances[ind_genes] == 0)[0][0]
            ind_genes = ind_genes[0:ind_first_zero]
        adata._inplace_subset_var(ind_genes)

    # Logarithmization
    if log_transform:
        sc.pp.log1p(adata, base=10)

    if not for_pooling:
        adata.uns["scycle"] = {
            "preprocess": {
                "method": "simple",
                "n_top_genes": n_top_genes,
                "normalize_counts": normalize_counts,
                "filter_var_genes": filter_var_genes,
                "division_factor": division_factor,
                "log_transform": log_transform,
            }
        }
def combat(
    adata: AnnData,
    key: str = 'batch',
    covariates: Optional[Collection[str]] = None,
    inplace: bool = True,
) -> Union[AnnData, np.ndarray, None]:
    """\
    ComBat function for batch effect correction [Johnson07]_ [Leek12]_
    [Pedersen12]_.

    Corrects for batch effects by fitting linear models, gains statistical
    power via an EB framework where information is borrowed across genes.

    This uses the implementation `combat.py`_ [Pedersen12]_.

    .. _combat.py: https://github.com/brentp/combat.py

    Parameters
    ----------
    adata
        Annotated data matrix
    key
        Key to a categorical annotation from :attr:`~anndata.AnnData.obs`
        that will be used for batch effect removal.
    covariates
        Additional covariates besides the batch variable such as adjustment
        variables or biological condition. This parameter refers to the design
        matrix `X` in Equation 2.1 in [Johnson07]_ and to the `mod` argument in
        the original combat function in the sva R package.
        Note that not including covariates may introduce bias or lead to the
        removal of biological signal in unbalanced designs.
    inplace
        Whether to replace adata.X or to return the corrected data.

    Returns
    -------
    Depending on the value of `inplace`, either returns the corrected matrix
    or modifies `adata.X`.
    """
    # check the input
    if key not in adata.obs_keys():
        raise ValueError('Could not find the key {!r} in adata.obs'.format(key))
    if covariates is not None:
        cov_exist = np.isin(covariates, adata.obs_keys())
        if np.any(~cov_exist):
            missing_cov = np.array(covariates)[~cov_exist].tolist()
            raise ValueError(
                'Could not find the covariate(s) {!r} in adata.obs'.format(missing_cov)
            )
        if key in covariates:
            raise ValueError('Batch key and covariates cannot overlap')
        if len(covariates) != len(set(covariates)):
            raise ValueError('Covariates must be unique')

    # only works on dense matrices so far
    if issparse(adata.X):
        X = adata.X.A.T
    else:
        X = adata.X.T
    data = pd.DataFrame(data=X, index=adata.var_names, columns=adata.obs_names)

    sanitize_anndata(adata)

    # construct a pandas series of the batch annotation
    model = adata.obs[[key] + (covariates if covariates else [])]
    batch_info = model.groupby(key).indices.values()
    n_batch = len(batch_info)
    n_batches = np.array([len(v) for v in batch_info])
    n_array = float(sum(n_batches))

    # standardize across genes using a pooled variance estimator
    logg.info("Standardizing Data across genes.\n")
    s_data, design, var_pooled, stand_mean = _standardize_data(model, data, key)

    # fitting the parameters on the standardized data
    logg.info("Fitting L/S model and finding priors\n")
    batch_design = design[design.columns[:n_batch]]
    # first estimate of the additive batch effect
    gamma_hat = (
        la.inv(batch_design.T @ batch_design) @ batch_design.T @ s_data.T
    ).values
    delta_hat = []

    # first estimate for the multiplicative batch effect
    for i, batch_idxs in enumerate(batch_info):
        delta_hat.append(s_data.iloc[:, batch_idxs].var(axis=1))

    # empirically fix the prior hyperparameters
    gamma_bar = gamma_hat.mean(axis=1)
    t2 = gamma_hat.var(axis=1)
    # a_prior and b_prior are the priors on lambda and theta from Johnson and Li (2006)
    a_prior = list(map(_aprior, delta_hat))
    b_prior = list(map(_bprior, delta_hat))

    logg.info("Finding parametric adjustments\n")
    # gamma star and delta star will be our empirical bayes (EB) estimators
    # for the additive and multiplicative batch effect per batch and cell
    gamma_star, delta_star = [], []
    for i, batch_idxs in enumerate(batch_info):
        # _it_sol returns our estimates for the batch effect parameters:
        # gamma is the additive batch effect,
        # delta is the multiplicative batch effect
        gamma, delta = _it_sol(
            s_data.iloc[:, batch_idxs].values,
            gamma_hat[i],
            delta_hat[i].values,
            gamma_bar[i],
            t2[i],
            a_prior[i],
            b_prior[i],
        )
        gamma_star.append(gamma)
        delta_star.append(delta)

    logg.info("Adjusting data\n")
    bayesdata = s_data
    gamma_star = np.array(gamma_star)
    delta_star = np.array(delta_star)

    # we now apply the parametric adjustment to the standardized data from
    # above, looping over all batches in the data
    for j, batch_idxs in enumerate(batch_info):
        # we basically subtract the additive batch effect, rescale by the
        # ratio of multiplicative batch effect to pooled variance and add
        # the overall gene-wise mean
        dsq = np.sqrt(delta_star[j, :])
        dsq = dsq.reshape((len(dsq), 1))
        denom = np.dot(dsq, np.ones((1, n_batches[j])))
        numer = np.array(
            bayesdata.iloc[:, batch_idxs]
            - np.dot(batch_design.iloc[batch_idxs], gamma_star).T
        )
        bayesdata.iloc[:, batch_idxs] = numer / denom

    vpsq = np.sqrt(var_pooled).reshape((len(var_pooled), 1))
    bayesdata = bayesdata * np.dot(vpsq, np.ones((1, int(n_array)))) + stand_mean

    # put back into the adata object or return
    if inplace:
        adata.X = bayesdata.values.transpose()
    else:
        return bayesdata.values.transpose()
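
# --- Usage sketch (not part of the original module) ---
# combat needs a categorical batch annotation in .obs. This hypothetical
# example builds a small AnnData with two batches, plants an additive
# batch shift, and corrects adata.X in place; with inplace=False the
# corrected (cells x genes) matrix is returned instead.
import numpy as np
import pandas as pd
from anndata import AnnData

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 50))
X[10:] += 2.0  # simulate an additive batch effect on the second batch
adata = AnnData(X, obs=pd.DataFrame({'batch': ['a'] * 10 + ['b'] * 10}))
combat(adata, key='batch')  # corrects adata.X in place; pass inplace=False to get the matrix back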
def prep_pooling(
    adata: AnnData,
    dim_red_method_pooling: str = "pca",
    n_neighbors: int = 5,
    embed_n_comps: int = 20,
    filter_cells: bool = True,
    min_counts: int = 10000,
    max_counts: int = 40000,
    max_mt_ratio: int = 20,
    normalize_counts: bool = True,
    filter_var_genes: bool = True,
    n_top_genes: int = 10000,
    for_pooling: bool = True,
    log_transform: bool = True,
    division_factor: float = 1,
    verbose: bool = True,
):
    """Pre-processes AnnData with pooling.

    Parameters
    ----------
    adata: AnnData
        The AnnData object to be pre-processed. This should already have
        been processed to remove "bad cells" (high mitochondrial
        percentage, aberrant total counts).
    dim_red_method_pooling: str
        Method to use for the dimensionality reduction ahead of the
        pooling procedure. Default: 'pca'. TODO: support 'ica' and others.
    n_neighbors: int
        Number of nearest neighbors to use for pooling.
    embed_n_comps: int
        Number of components to use for the embedding to do the pooling.
    filter_cells: bool
        Set it to False if bad quality cells were already filtered.
    min_counts: int
        Minimum number of counts required for a cell to pass filtering.
    max_counts: int
        Maximum number of counts allowed for a cell to pass filtering.
    max_mt_ratio: int
        Maximum proportion of mitochondrial genes in a cell to pass
        filtering.
    normalize_counts: bool
        Set it to False if the library does not need normalization.
    filter_var_genes: bool
        If True, only `n_top_genes` highly variable genes are kept.
    n_top_genes: int
        Number of genes to keep after the highly variable filter. Used if
        `filter_var_genes` is True.
    for_pooling: bool
        Set to True if the function is called by another pooling function.
        Changes the return object parameters.
    log_transform: bool
        Set it to False if you do not want values to be log-transformed.
    division_factor: float
        Scaling factor; the counts matrix is divided by this value.
    verbose: bool
        If True, messages about function progress will be printed.

    Returns
    ----------
    None
    """
    if "scycle" in adata.uns:
        raise Exception("Data has already been pre-processed")
    if verbose:
        print("Preparing embedding...")
    assert division_factor != 0, "Null division factor. Terminating..."
    adata.X = adata.X / division_factor

    if filter_cells:
        quality_control(adata, min_counts, max_counts, max_mt_ratio, verbose)

    adata_simple = adata.copy()
    prep_simple(
        adata_simple,
        normalize_counts,
        filter_var_genes,
        n_top_genes,
        True,
        log_transform,
        1,
        False,
    )

    if verbose:
        print("Embedding for pooling...")
    X_embed = _embed_for_pooling(
        adata_simple, dim_red_method_pooling, n_comps=embed_n_comps
    )
    if verbose:
        print("Pooling", str(X_embed.shape[0]), "samples...")
    _smooth_adata_by_pooling(adata, X_embed, n_neighbours=n_neighbors)

    prep_simple(
        adata,
        normalize_counts,
        filter_var_genes,
        n_top_genes,
        False,
        log_transform,
        1,
        verbose,
    )

    adata.uns["scycle"] = {
        "preprocess": {
            "method": "pooling",
            "n_neighbors": n_neighbors,
            "min_counts": min_counts,
            "max_counts": max_counts,
            "max_mt_ratio": max_mt_ratio,
            "normalize_counts": normalize_counts,
            "filter_var_genes": filter_var_genes,
            "division_factor": division_factor,
            "log_transform": log_transform,
            "n_top_genes": n_top_genes,
            "embed_n_comps": embed_n_comps,
        }
    }
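
# --- Usage sketch (not part of the original module) ---
# prep_pooling chains a smoothing (pooling) step over a low-dimensional
# embedding with the prep_simple preprocessing above. This hypothetical
# call mirrors the prep_simple example: it assumes QC metrics have been
# computed and that cells were already filtered (filter_cells=False), so
# the internal quality_control helper is not exercised here.
import scanpy as sc

adata = sc.datasets.pbmc3k()
sc.pp.calculate_qc_metrics(adata, inplace=True)
adata.X = adata.X.toarray()  # downstream variance filtering expects dense data
prep_pooling(adata, n_neighbors=10, n_top_genes=2000, filter_cells=False)
assert adata.uns["scycle"]["preprocess"]["method"] == "pooling"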
def regress_out(
    adata: AnnData,
    keys: Union[str, Sequence[str]],
    n_jobs: Optional[int] = None,
    copy: bool = False,
) -> Optional[AnnData]:
    """\
    Regress out (mostly) unwanted sources of variation.

    Uses simple linear regression. This is inspired by Seurat's `regressOut`
    function in R [Satija15]_. Note that this function tends to overcorrect
    in certain circumstances as described in :issue:`526`.

    Parameters
    ----------
    adata
        The annotated data matrix.
    keys
        Keys for observation annotation on which to regress.
    n_jobs
        Number of jobs for parallel computation.
        `None` means using :attr:`scanpy._settings.ScanpyConfig.n_jobs`.
    copy
        Determines whether a copy of `adata` is returned.

    Returns
    -------
    Depending on `copy` returns or updates `adata` with the corrected data matrix.
    """
    start = logg.info(f'regressing out {keys}')
    if issparse(adata.X):
        logg.info('    sparse input is densified and may lead to high memory use')
    adata = adata.copy() if copy else adata

    sanitize_anndata(adata)

    # TODO: This should throw an implicit modification warning
    if adata.is_view:
        adata._init_as_actual(adata.copy())

    if isinstance(keys, str):
        keys = [keys]

    if issparse(adata.X):
        adata.X = adata.X.toarray()

    n_jobs = sett.n_jobs if n_jobs is None else n_jobs

    # regress on a single categorical variable
    variable_is_categorical = False
    if keys[0] in adata.obs_keys() and is_categorical_dtype(adata.obs[keys[0]]):
        if len(keys) > 1:
            raise ValueError(
                'If providing categorical variable, '
                'only a single one is allowed. For this one '
                'we regress on the mean for each category.'
            )
        logg.debug('... regressing on per-gene means within categories')
        regressors = np.zeros(adata.X.shape, dtype='float32')
        for category in adata.obs[keys[0]].cat.categories:
            mask = (category == adata.obs[keys[0]]).values
            for ix, x in enumerate(adata.X.T):
                regressors[mask, ix] = x[mask].mean()
        variable_is_categorical = True
    # regress on one or several ordinal variables
    else:
        # create data frame with selected keys (if given)
        if keys:
            regressors = adata.obs[keys]
        else:
            regressors = adata.obs.copy()

        # add column of ones at index 0 (first column)
        regressors.insert(0, 'ones', 1.0)

    len_chunk = np.ceil(min(1000, adata.X.shape[1]) / n_jobs).astype(int)
    n_chunks = np.ceil(adata.X.shape[1] / len_chunk).astype(int)

    tasks = []
    # split the adata.X matrix by columns in chunks of size len_chunk
    # (the last chunk could be of smaller size than the others)
    chunk_list = np.array_split(adata.X, n_chunks, axis=1)
    if variable_is_categorical:
        regressors_chunk = np.array_split(regressors, n_chunks, axis=1)
    for idx, data_chunk in enumerate(chunk_list):
        # each task is a tuple of a data_chunk eg. (adata.X[:, 0:100]) and
        # the regressors. This data will be passed to each of the jobs.
        if variable_is_categorical:
            regres = regressors_chunk[idx]
        else:
            regres = regressors
        tasks.append(tuple((data_chunk, regres, variable_is_categorical)))

    if n_jobs > 1 and n_chunks > 1:
        import multiprocessing

        pool = multiprocessing.Pool(n_jobs)
        res = pool.map_async(_regress_out_chunk, tasks).get(9999999)
        pool.close()
    else:
        res = list(map(_regress_out_chunk, tasks))

    # res is a list of vectors (each corresponding to a regressed gene column).
    # The transpose is needed to get the matrix in the shape needed
    adata.X = np.vstack(res).T.astype(adata.X.dtype)
    logg.info('    finished', time=start)
    return adata if copy else None
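
# --- Usage sketch (not part of the original module) ---
# regress_out is typically used to remove technical covariates such as
# total counts or mitochondrial fraction after normalization. The QC
# column names below follow scanpy's conventions and are assumptions
# about the upstream pipeline, not requirements of regress_out itself.
import scanpy as sc

adata = sc.datasets.pbmc3k()
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
regress_out(adata, ['total_counts', 'pct_counts_mt'], n_jobs=2)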
def hashsolo(
    cell_hashing_adata: anndata.AnnData,
    priors: list = [.01, .8, .19],
    pre_existing_clusters: str = None,
    clustering_data: anndata.AnnData = None,
    resolutions: list = [.1, .25, .5, .75, 1],
    number_of_noise_barcodes: int = None,
    inplace: bool = True,
):
    '''Demultiplex a cell hashing dataset using the HashSolo method.

    Parameters
    ----------
    cell_hashing_adata : anndata.AnnData
        AnnData object filled only with hashing counts
    priors : list
        A list of your priors for each hypothesis. The first element is
        your prior for the negative hypothesis, the second for the
        singlet hypothesis, and the third for the doublet hypothesis.
        We use [0.01, 0.8, 0.19] by default because we assume the
        barcodes in your cell hashing matrix are those cells which have
        passed QC in the transcriptome space, e.g. UMI counts, pct mito
        reads, etc.
    pre_existing_clusters : str
        Column in cell_hashing_adata.obs for how to break up demultiplexing
    clustering_data : anndata.AnnData
        Transcriptional data for clustering
    resolutions : list
        Clustering resolutions for leiden
    inplace : bool
        Whether to do the operation in place

    Returns
    -------
    cell_hashing_adata : AnnData
        If inplace is False, returns an AnnData with the demultiplexing
        results in the .obs attribute; otherwise modifies
        cell_hashing_adata in place.
    '''
    if issparse(cell_hashing_adata.X):
        cell_hashing_adata.X = np.array(cell_hashing_adata.X.todense())

    if clustering_data is not None:
        print(
            'This may take a while, we are running clustering at {} different resolutions'
            .format(len(resolutions))
        )
        if not all(clustering_data.obs_names == cell_hashing_adata.obs_names):
            raise ValueError(
                'clustering_data and cell_hashing_adata must have the same index'
            )
        cell_hashing_adata.obs['best_leiden'] = _get_clusters(
            clustering_data, resolutions
        )

    data = cell_hashing_adata.X
    num_of_cells = cell_hashing_adata.shape[0]
    results = pd.DataFrame(
        np.zeros((num_of_cells, 6)),
        columns=[
            'most_likely_hypothesis',
            'probs_hypotheses',
            'cluster_feature',
            'negative_hypothesis_probability',
            'singlet_hypothesis_probability',
            'doublet_hypothesis_probability',
        ],
        index=cell_hashing_adata.obs_names,
    )

    if clustering_data is not None or pre_existing_clusters is not None:
        cluster_features = (
            'best_leiden' if pre_existing_clusters is None else pre_existing_clusters
        )
        unique_cluster_features = np.unique(cell_hashing_adata.obs[cluster_features])
        for cluster_feature in unique_cluster_features:
            cluster_feature_bool_vector = (
                cell_hashing_adata.obs[cluster_features] == cluster_feature
            )
            posterior_dict = _calculate_bayes_rule(
                data[cluster_feature_bool_vector],
                priors,
                number_of_noise_barcodes,
            )
            results.loc[
                cluster_feature_bool_vector, 'most_likely_hypothesis'
            ] = posterior_dict['most_likely_hypothesis']
            results.loc[
                cluster_feature_bool_vector, 'cluster_feature'
            ] = cluster_feature
            results.loc[
                cluster_feature_bool_vector, 'negative_hypothesis_probability'
            ] = posterior_dict['probs_hypotheses'][:, 0]
            results.loc[
                cluster_feature_bool_vector, 'singlet_hypothesis_probability'
            ] = posterior_dict['probs_hypotheses'][:, 1]
            results.loc[
                cluster_feature_bool_vector, 'doublet_hypothesis_probability'
            ] = posterior_dict['probs_hypotheses'][:, 2]
    else:
        posterior_dict = _calculate_bayes_rule(
            data, priors, number_of_noise_barcodes
        )
        results.loc[:, 'most_likely_hypothesis'] = posterior_dict[
            'most_likely_hypothesis'
        ]
        results.loc[:, 'cluster_feature'] = 0
        results.loc[:, 'negative_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'
        ][:, 0]
        results.loc[:, 'singlet_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'
        ][:, 1]
        results.loc[:, 'doublet_hypothesis_probability'] = posterior_dict[
            'probs_hypotheses'
        ][:, 2]

    cell_hashing_adata.obs['most_likely_hypothesis'] = results.loc[
        cell_hashing_adata.obs_names, 'most_likely_hypothesis'
    ]
    cell_hashing_adata.obs['cluster_feature'] = results.loc[
        cell_hashing_adata.obs_names, 'cluster_feature'
    ]
    cell_hashing_adata.obs['negative_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'negative_hypothesis_probability'
    ]
    cell_hashing_adata.obs['singlet_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'singlet_hypothesis_probability'
    ]
    cell_hashing_adata.obs['doublet_hypothesis_probability'] = results.loc[
        cell_hashing_adata.obs_names, 'doublet_hypothesis_probability'
    ]

    cell_hashing_adata.obs['Classification'] = None
    cell_hashing_adata.obs.loc[
        cell_hashing_adata.obs['most_likely_hypothesis'] == 2, 'Classification'
    ] = 'Doublet'
    cell_hashing_adata.obs.loc[
        cell_hashing_adata.obs['most_likely_hypothesis'] == 0, 'Classification'
    ] = 'Negative'
    all_sings = cell_hashing_adata.obs['most_likely_hypothesis'] == 1
    singlet_sample_index = np.argmax(cell_hashing_adata.X[all_sings], axis=1)
    cell_hashing_adata.obs.loc[
        all_sings, 'Classification'
    ] = cell_hashing_adata.var_names[singlet_sample_index]
    return cell_hashing_adata if not inplace else None
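
# --- Usage sketch (not part of the original module) ---
# hashsolo expects an AnnData holding only hashing (HTO) counts. This
# hypothetical example fabricates counts for 3 barcodes so the call can
# run end to end with default priors and no clustering data; real input
# would come from a cell hashing experiment.
import numpy as np
import pandas as pd
import anndata

rng = np.random.default_rng(0)
hto_counts = rng.negative_binomial(2, 0.3, size=(100, 3)).astype(float)
htos = anndata.AnnData(
    hto_counts,
    var=pd.DataFrame(index=['HTO_1', 'HTO_2', 'HTO_3']),
)
hashsolo(htos)  # writes 'Classification' and the posteriors into htos.obs
print(htos.obs['Classification'].value_counts())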