def _log1p(X): if issparse(X): np.log1p(X.data, out=X.data) else: np.log1p(X, out=X) return X
import pywren import zappy.base as np import zappy.executor executor = zappy.executor.PywrenExecutor() a = zappy.executor.ones(executor, (10, 2), chunks=(2, 2), dtype="i4") import s3fs.mapping s3 = s3fs.S3FileSystem() path = "sc-tom-test-data/ones.zarr" output_zarr = s3fs.mapping.S3Map(path, s3=s3) out = np.log1p(a) out.to_zarr(output_zarr, a.chunks) s3.ls(path) s3.rm(path, recursive=True)
import s3fs.mapping import zappy.base as np import zappy.executor s3 = s3fs.S3FileSystem() if s3.exists('sc-tom-test-data/10x-log1p.zarr'): s3.rm('sc-tom-test-data/10x-log1p.zarr', recursive=True) input_zarr = s3fs.mapping.S3Map( 'sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr/X', s3=s3) output_zarr = s3fs.mapping.S3Map('sc-tom-test-data/10x-log1p.zarr', s3=s3) executor = zappy.executor.PywrenExecutor(live_viewer=True, exclude_modules=None, ignore_modules=[ 'dash', 'dash_html_components', 'dash_core_components', 'dask', 'google_auth_oauthlib', 'pandas', 'pytest' ]) x = zappy.executor.from_zarr(executor, input_zarr) out = np.log1p(x) out.to_zarr(output_zarr, x.chunks)
def test_log1p(self, x, xd): log1pnps = np.log1p(xd).asndarray() log1pnp = np.log1p(x) assert_allclose(log1pnps, log1pnp)
def filter_genes_dispersion(data, flavor='seurat', min_disp=None, max_disp=None, min_mean=None, max_mean=None, n_bins=20, n_top_genes=None, log=True, subset=True, copy=False): """Extract highly variable genes [Satija15]_ [Zheng17]_. If trying out parameters, pass the data matrix instead of AnnData. Depending on `flavor`, this reproduces the R-implementations of Seurat [Satija15]_ and Cell Ranger [Zheng17]_. The normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected. Use `flavor='cell_ranger'` with care and in the same way as in :func:`~scanpy.api.pp.recipe_zheng17`. Parameters ---------- data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse` The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. flavor : {'seurat', 'cell_ranger'}, optional (default: 'seurat') Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data - the logarithm of mean and dispersion is taken internally when `log` is at its default value `True`. For 'cell_ranger', this is usually called for logarithmized data - in this case you should set `log` to `False`. In their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes `n_top_genes`. min_mean=0.0125, max_mean=3, min_disp=0.5, max_disp=`None` : `float`, optional If `n_top_genes` unequals `None`, these cutoffs for the means and the normalized dispersions are ignored. n_bins : `int` (default: 20) Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You'll be informed about this if you set `settings.verbosity = 4`. n_top_genes : `int` or `None` (default: `None`) Number of highly-variable genes to keep. log : `bool`, optional (default: `True`) Use the logarithm of the mean to variance ratio. subset : `bool`, optional (default: `True`) Keep highly-variable genes only (if True) else write a bool array for h ighly-variable genes while keeping all genes copy : `bool`, optional (default: `False`) If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Returns ------- If an AnnData `adata` is passed, returns or updates `adata` depending on \ `copy`. It filters the `adata` and adds the annotations means : adata.var Means per gene. Logarithmized when `log` is `True`. dispersions : adata.var Dispersions per gene. Logarithmized when `log` is `True`. dispersions_norm : adata.var Normalized dispersions per gene. Logarithmized when `log` is `True`. If a data matrix `X` is passed, the annotation is returned as `np.recarray` \ with the same information stored in fields: `gene_subset`, `means`, `dispersions`, `dispersion_norm`. """ if n_top_genes is not None and not all([ min_disp is None, max_disp is None, min_mean is None, max_mean is None ]): logg.info('If you pass `n_top_genes`, all cutoffs are ignored.') if min_disp is None: min_disp = 0.5 if min_mean is None: min_mean = 0.0125 if max_mean is None: max_mean = 3 if isinstance(data, AnnData): adata = data.copy() if copy else data result = filter_genes_dispersion(adata.X, log=log, min_disp=min_disp, max_disp=max_disp, min_mean=min_mean, max_mean=max_mean, n_top_genes=n_top_genes, flavor=flavor) adata.var['means'] = result['means'] adata.var['dispersions'] = result['dispersions'] adata.var['dispersions_norm'] = result['dispersions_norm'] if subset: adata._inplace_subset_var(result['gene_subset']) else: adata.var['highly_variable'] = result['gene_subset'] return adata if copy else None logg.msg('extracting highly variable genes', r=True, v=4) X = data # no copy necessary, X remains unchanged in the following mean, var = materialize_as_ndarray(_get_mean_var(X)) # now actually compute the dispersion mean[mean == 0] = 1e-12 # set entries equal to zero to small value dispersion = var / mean if log: # logarithmized mean as in Seurat dispersion[dispersion == 0] = np.nan dispersion = np.log(dispersion) mean = np.log1p(mean) # all of the following quantities are "per-gene" here import pandas as pd df = pd.DataFrame() df['mean'] = mean df['dispersion'] = dispersion if flavor == 'seurat': df['mean_bin'] = pd.cut(df['mean'], bins=n_bins) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_mean_bin = disp_grouped.mean() disp_std_bin = disp_grouped.std(ddof=1) # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() gen_indices = np.where(one_gene_per_bin[df['mean_bin']])[0].tolist() if len(gen_indices) > 0: logg.msg( 'Gene indices {} fell into a single bin: their ' 'normalized dispersion was set to 1.\n ' 'Decreasing `n_bins` will likely avoid this effect.'.format( gen_indices), v=4) # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values disp_mean_bin[one_gene_per_bin] = 0 # actually do the normalization df['dispersion_norm'] = (df['dispersion'].values # use values here as index differs - disp_mean_bin[df['mean_bin']].values) \ / disp_std_bin[df['mean_bin']].values elif flavor == 'cell_ranger': from statsmodels import robust df['mean_bin'] = pd.cut( df['mean'], np.r_[-np.inf, np.percentile(df['mean'], np.arange(10, 105, 5)), np.inf]) disp_grouped = df.groupby('mean_bin')['dispersion'] disp_median_bin = disp_grouped.median() # the next line raises the warning: "Mean of empty slice" with warnings.catch_warnings(): warnings.simplefilter('ignore') disp_mad_bin = disp_grouped.apply(robust.mad) df['dispersion_norm'] = np.abs((df['dispersion'].values - disp_median_bin[df['mean_bin']].values)) \ / disp_mad_bin[df['mean_bin']].values else: raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"') dispersion_norm = df['dispersion_norm'].values.astype('float32') if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[::-1].sort( ) # interestingly, np.argpartition is slightly slower disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = df['dispersion_norm'].values >= disp_cut_off logg.msg( 'the {} top genes correspond to a normalized dispersion cutoff of'. format(n_top_genes, disp_cut_off), v=5) else: max_disp = np.inf if max_disp is None else max_disp dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce( (mean > min_mean, mean < max_mean, dispersion_norm > min_disp, dispersion_norm < max_disp)) logg.msg(' finished', time=True, v=4) return np.rec.fromarrays( (gene_subset, df['mean'].values, df['dispersion'].values, df['dispersion_norm'].values.astype('float32', copy=False)), dtype=[('gene_subset', bool), ('means', 'float32'), ('dispersions', 'float32'), ('dispersions_norm', 'float32')])
import pywren import zappy.base as np import zappy.executor executor = zappy.executor.PywrenExecutor() a = zappy.executor.ones(executor, (20000, 28000), chunks=(10000, 28000), dtype=float) import s3fs.mapping s3 = s3fs.S3FileSystem() path = "sc-tom-test-data/ones.zarr" output_zarr = s3fs.mapping.S3Map(path, s3=s3) np.log1p(a, out=a) a.to_zarr(output_zarr, a.chunks) s3.ls(path) s3.rm(path, recursive=True)