def per_cell_normalize(adata, results_folder):
    #get start time
    start = time()

    #normalize per cell
    #already normalize BEFORE saving "raw" - as recommended in the scanpy tutorial
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    #keep raw copy
    adata.raw = log1p(adata, copy=True)
    print('log1p values saved into adata.raw')

    #make log entries
    logging.info('Per cell normalization completed successfully.')
    logging.info("\tTime for per-cell normalization: " + str(round(time() - start, 3)) + 's')

    #export to file
    start = time()
    export_cp10k(adata, basepath=results_folder)
    logging.info('cp10k values exported to file.')
    logging.info("\tTime for cp10k export: " + str(round(time() - start, 3)) + 's')

    return adata
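A minimal sketch of the same normalize-then-stash-raw pattern using plain scanpy calls, assuming `sc` is `scanpy`; the `export_cp10k` helper above is project-specific and is not reproduced here:

import scanpy as sc

# normalize each cell to 10,000 counts, then keep log1p values in .raw
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata.raw = sc.pp.log1p(adata, copy=True)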
def filter_and_normalize(adata, min_counts=10, n_top_genes=None, log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=10)
        sc.pp.normalize_per_cell(adata)
        sc.pp.filter_genes_dispersion(adata, n_top_genes=10000)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 10)
        Minimum number of gene counts per cell.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = adata.copy() if copy else adata  # honor the documented `copy` semantics
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    filter_genes(adata, min_counts=min_counts)
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        normalize_per_cell(adata)
        filter_genes_dispersion(adata, n_top_genes=n_top_genes)
    normalize_per_cell(adata)
    if log:
        log1p(adata)
    return adata if copy else None
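A hedged usage sketch of the function above; the toy count matrix is illustrative only and not from the original source:

import numpy as np
import anndata as ad

# toy non-logarithmized counts: 50 cells x 200 genes
counts = np.random.poisson(1.0, size=(50, 200)).astype(np.float32)
adata = ad.AnnData(counts)

# filter lowly expressed genes, keep the 100 most variable, normalize and log-transform in place
filter_and_normalize(adata, min_counts=10, n_top_genes=100, log=True)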
def highly_variable_genes(adata):
    start = time()

    #take log1p
    log1p(adata)
    print('log1p taken of adata')

    filter_result = sc_highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=False)
    pl_highly_variable_genes(filter_result, save='.hvg.png', show=True)
    adata = adata[:, filter_result.highly_variable == True]

    #logging
    logging.info('After feature selection of highly variable genes: ' + str(adata.shape[0]) + ' cells, ' + str(adata.shape[1]) + ' genes')
    logging.info('\tTime for feature selection: ' + str(round(time() - start, 3)) + 's')

    return adata
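The `sc_highly_variable_genes` and `pl_highly_variable_genes` names are presumably import aliases; a sketch of bindings that would make the function above runnable, assuming they map onto scanpy's preprocessing and plotting functions:

from time import time
import logging

import scanpy as sc

log1p = sc.pp.log1p
sc_highly_variable_genes = sc.pp.highly_variable_genes
pl_highly_variable_genes = sc.pl.highly_variable_genes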
import anndata as ad
import s3fs.mapping
from scanpy.api.pp import log1p
import zappy.executor

executor = zappy.executor.PywrenExecutor(
    live_viewer=True,
    exclude_modules=None,
    ignore_modules=['dash', 'dash_html_components', 'dash_core_components', 'dask',
                    'google_auth_oauthlib', 'pandas', 'pytest'])

s3 = s3fs.S3FileSystem()
input_zarr = s3fs.mapping.S3Map('sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr', s3=s3)
input_zarr_X = s3fs.mapping.S3Map('sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr/X', s3=s3)
output_zarr = s3fs.mapping.S3Map('sc-tom-test-data/10x-log1p.zarr', s3=s3)

# regular anndata except for X
adata = ad.read_zarr(input_zarr)
adata.X = zappy.executor.from_zarr(executor, input_zarr_X)

log1p(adata)  # updates in place

adata.X.to_zarr(output_zarr, adata.X.chunks)
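For comparison, a sketch of the same read, log1p, write round trip done locally with anndata alone, without the Pywren executor; the local paths are hypothetical placeholders:

import anndata as ad
import scanpy as sc

adata = ad.read_zarr('10x.zarr')       # hypothetical local copy of the input store
sc.pp.log1p(adata)                     # transforms adata.X in place
adata.write_zarr('10x-log1p.zarr')     # hypothetical output path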
def Log1P(self):
    self.scRNAseq_HVGData_log1p = log1p(self.scRNAseq_HVGData, copy=True)
def filter_and_normalize(data, min_counts=3, min_counts_u=3, min_cells=None,
                         min_cells_u=None, n_top_genes=None, log=True,
                         plot=False, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=10)
        sc.pp.normalize_per_cell(adata)
        sc.pp.filter_genes_dispersion(adata, n_top_genes=10000)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 3)
        Minimum number of gene counts per cell.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p
    from scipy.sparse import issparse

    def filter_genes_u(adata, min_counts_u=None, min_cells_u=None):
        counts = adata.layers['unspliced'] if min_counts_u is not None \
            else adata.layers['unspliced'] > 0
        counts = counts.sum(0).A1 if issparse(counts) else counts.sum(0)
        adata._inplace_subset_var(
            counts >= (min_counts_u if min_counts_u is not None else min_cells_u))

    if min_counts is not None:
        filter_genes(adata, min_counts=min_counts)
    if min_cells is not None:
        filter_genes(adata, min_cells=min_cells)

    if 'unspliced' in adata.layers.keys():
        if min_counts_u is not None:
            filter_genes_u(adata, min_counts_u=min_counts_u)
        if min_cells_u is not None:
            filter_genes_u(adata, min_cells_u=min_cells_u)

    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        normalize_per_cell(adata)
        filter_result = filter_genes_dispersion(adata.X, n_top_genes=n_top_genes, log=False)
        if plot:
            from scanpy.plotting.preprocessing import filter_genes_dispersion as plot_filter_genes_dispersion
            plot_filter_genes_dispersion(filter_result, log=True)
        adata._inplace_subset_var(filter_result.gene_subset)
        #filter_genes_dispersion(adata, n_top_genes=n_top_genes)

    normalize_per_cell(adata)
    if log:
        log1p(adata)
    return adata if copy else None
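A hedged usage sketch with synthetic spliced/unspliced layers; the toy AnnData below is illustrative and not part of the original:

import numpy as np
import anndata as ad

spliced = np.random.poisson(1.0, size=(100, 500)).astype(np.float32)
unspliced = np.random.poisson(0.5, size=(100, 500)).astype(np.float32)
adata = ad.AnnData(spliced.copy(), layers={'spliced': spliced, 'unspliced': unspliced})

# filter genes on both layers, keep the 200 most dispersed genes, normalize and log-transform
filter_and_normalize(adata, min_counts=3, min_counts_u=3, n_top_genes=200, log=True)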
def recluster(adata, celltype, celltype_label='leiden', min_mean=0.0125, max_mean=4,
              min_disp=0.5, resolution=1.0, regress_out_key=None, random_seed=0,
              show_plot_filter=False, method='leiden'):
    """Perform subclustering on a specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new data subset.
    This data subset is initialized with the raw data contained in adata.raw. New highly
    variable genes are selected and a new clustering is performed. The function returns
    the adata subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and
    passing the clusters that are to be selected for reclustering as strings or a tuple
    of strings to the parameter celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more than one is
        to be selected please pass them as a tuple not as a list!
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out
        before performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable
        gene detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the data subset. Possible: louvain/leiden

    Returns
    -------
    AnnData object containing the subcluster annotated with PCA, nearest neighbors,
    louvain cluster, and UMAP coordinates.

    Examples
    --------
    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy.api as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution=1.3)
    >>> sc.pl.umap(adata_subset, color=['louvain', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ['leiden', 'louvain']:
        raise ValueError("method argument should be leiden or louvain")

    if type(celltype) == str:
        cluster_subset = _subset_adata(adata, adata.obs.get(celltype_label) == celltype)
    elif type(celltype) == tuple:
        filter = adata.obs.get(celltype_label) == 'NONE'
        for i in range(len(celltype)):
            filter = filter | (adata.obs.get(celltype_label) == celltype[i])
        cluster_subset = _subset_adata(adata, filter)
    else:
        sys.exit('specify cluster input as a string or tuple')

    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    filter_result = filter_genes_dispersion(cluster_subset.X, min_mean=min_mean,
                                            max_mean=max_mean, min_disp=min_disp)

    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.gene_subset)), 'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset, filter_result.gene_subset, axis=1, raw=False)

    #perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    #using svd_solver='arpack' ensures that the PCA leads to reproducible results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    if method == 'louvain':
        louvain(cluster_subset, resolution=resolution, random_state=random_seed)
    if method == 'leiden':
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset
def filter_and_normalize(data, min_counts=None, min_counts_u=None, min_cells=None,
                         min_cells_u=None, n_top_genes=None, flavor='seurat',
                         log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat',
        this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data

    if 'spliced' in adata.layers.keys() and 'unspliced' in adata.layers.keys():
        X_not_yet_processed = np.all(adata.X.data[:100] == adata.layers['spliced'].data[:100])
    else:
        raise ValueError('Could not find spliced / unspliced counts.')

    filter_genes(adata, min_counts=min_counts, min_counts_u=min_counts_u,
                 min_cells=min_cells, min_cells_u=min_cells_u)
    normalize_per_cell(adata)

    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log and X_not_yet_processed:
        log1p(adata)
        logg.info('Logarithmized X.')
    elif log:
        logg.info('Did not modify X as it looks preprocessed already.')
    elif X_not_yet_processed:
        logg.info('Consider logarithmizing adata.X with `scv.pp.log1p` for better results.')

    return adata if copy else None
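The `X_not_yet_processed` flag above is a heuristic: `adata.X` is treated as unprocessed when its first stored values still match the spliced counts. A small illustration of the idea with sparse toy data (not from the original source):

import numpy as np
from scipy.sparse import csr_matrix

spliced = csr_matrix(np.random.poisson(1.0, size=(50, 100)).astype(np.float32))
X = spliced.copy()

# X still equals the spliced counts -> looks unprocessed
print(np.all(X.data[:100] == spliced.data[:100]))   # True

# after a log1p transform the stored values no longer match
X.data = np.log1p(X.data)
print(np.all(X.data[:100] == spliced.data[:100]))   # False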
def filter_and_normalize(data, min_counts=None, min_counts_u=None, min_cells=None,
                         min_cells_u=None, n_top_genes=None, flavor='seurat',
                         log=True, copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat',
        this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data

    filter_genes(adata, min_counts=min_counts, min_counts_u=min_counts_u,
                 min_cells=min_cells, min_cells_u=min_cells_u)
    normalize_per_cell(adata)

    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        log1p(adata)

    return adata if copy else None