Example #1
0
def per_cell_normalize(adata, results_folder):
    """Normalize counts per cell, stash log1p values in ``adata.raw`` and export.

    Steps, in order:

    1. ``normalize_per_cell`` is called on ``adata`` with
       ``counts_per_cell_after=1e4`` (done BEFORE saving "raw", as recommended
       in the scanpy tutorial).
    2. The result of ``log1p(adata, copy=True)`` is assigned to ``adata.raw``.
    3. ``export_cp10k`` is called with ``basepath=results_folder``.

    Progress is printed to stdout and both timings are written to the
    root logger.

    Parameters
    ----------
    adata:
        Data object handed to ``normalize_per_cell`` / ``log1p`` /
        ``export_cp10k`` (presumably an AnnData object -- confirm with caller).
    results_folder:
        Passed through unchanged as ``basepath`` to ``export_cp10k``.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    normalization_started = time()

    # Normalize first so that the copy stored in .raw is already per-cell scaled.
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    # Keep a log1p-transformed copy around as the "raw" reference.
    log_transformed = log1p(adata, copy=True)
    adata.raw = log_transformed
    print('log1p values saved into adata.raw')

    logging.info('Per cell normalization completed successfully.')
    normalization_seconds = str(round(time() - normalization_started, 3))
    logging.info("\tTime for per-cell normalization: " +
                 normalization_seconds + 's')

    # Export step is timed separately from the normalization above.
    export_started = time()
    export_cp10k(adata, basepath=results_folder)

    logging.info('cp10k values exported to file.')
    export_seconds = str(round(time() - export_started, 3))
    logging.info("\tTime for cp10k export: " + export_seconds + 's')

    return adata
Example #2
0
def filter_and_normalize(adata,
                         min_counts=10,
                         n_top_genes=None,
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            sc.pp.filter_genes_dispersion(adata, n_top_genes=n_top_genes)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)


    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 10)
        Forwarded as `min_counts` to `sc.pp.filter_genes`.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep. The dispersion filter is skipped when this is
        `None` or not smaller than the current number of genes.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns a processed copy if `copy=True`; otherwise updates `adata` in
    place and returns `None`.
    """
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    # Honour `copy`: previously the caller's object was always modified in
    # place and merely handed back when copy=True.
    adata = adata.copy() if copy else adata

    filter_genes(adata, min_counts=min_counts)
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        # Dispersion-based selection runs on normalized values; the data is
        # normalized once more below after the gene subset has changed.
        normalize_per_cell(adata)
        filter_genes_dispersion(adata, n_top_genes=n_top_genes)
    normalize_per_cell(adata)
    if log:
        log1p(adata)
    return adata if copy else None
Example #3
0
def highly_variable_genes(adata):
    """Log-transform ``adata`` and keep only its highly variable genes.

    ``log1p`` is applied to the passed object first. Highly variable genes are
    then determined by ``sc_highly_variable_genes`` with ``min_mean=0.0125``,
    ``max_mean=3``, ``min_disp=0.5`` and ``inplace=False``, the result is
    plotted (figure suffix ``'.hvg.png'``), and ``adata`` is subset to the
    flagged genes.

    Parameters
    ----------
    adata:
        Data object to process (presumably an AnnData object -- confirm with
        caller).

    Returns
    -------
    ``adata`` indexed down to the columns flagged as highly variable.
    """
    t_begin = time()

    # take log1p
    log1p(adata)
    print('log1p taken of adata')

    hvg_stats = sc_highly_variable_genes(
        adata,
        min_mean=0.0125,
        max_mean=3,
        min_disp=0.5,
        inplace=False,
    )
    pl_highly_variable_genes(hvg_stats, save='.hvg.png', show=True)

    # Column mask of the genes flagged as highly variable.
    keep_gene = hvg_stats.highly_variable == True
    adata = adata[:, keep_gene]

    # logging
    n_cells = str(adata.shape[0])
    n_genes = str(adata.shape[1])
    logging.info('After feature selection of highly variable genes: ' +
                 n_cells + ' cells, ' + n_genes + ' genes')
    elapsed = str(round(time() - t_begin, 3))
    logging.info('\tTime for feature selection: ' + elapsed + 's')

    return adata
import anndata as ad
import s3fs.mapping
from scanpy.api.pp import log1p
import zappy.executor

# Executor used to build the zappy-backed X matrix below.
executor = zappy.executor.PywrenExecutor(
    live_viewer=True,
    exclude_modules=None,
    ignore_modules=[
        'dash',
        'dash_html_components',
        'dash_core_components',
        'dask',
        'google_auth_oauthlib',
        'pandas',
        'pytest',
    ],
)

# S3-backed stores: the full input AnnData, its X array, and the output.
s3 = s3fs.S3FileSystem()
input_zarr = s3fs.mapping.S3Map(
    'sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr', s3=s3)
input_zarr_X = s3fs.mapping.S3Map(
    'sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr/X', s3=s3)
output_zarr = s3fs.mapping.S3Map(
    'sc-tom-test-data/10x-log1p.zarr', s3=s3)

# regular anndata except for X, which is replaced by a zappy array
adata = ad.read_zarr(input_zarr)
adata.X = zappy.executor.from_zarr(executor, input_zarr_X)

# updates adata in place
log1p(adata)

# Write the transformed X to the output store, reusing X's own chunks.
adata.X.to_zarr(output_zarr, adata.X.chunks)
Example #5
0
 def Log1P(self):
     """Store ``log1p(self.scRNAseq_HVGData, copy=True)`` on the instance.

     The result is written to ``self.scRNAseq_HVGData_log1p``; because
     ``copy=True`` is requested, ``self.scRNAseq_HVGData`` is presumably left
     untouched (depends on the ``log1p`` in scope -- confirm).
     """
     hvg_data = self.scRNAseq_HVGData
     self.scRNAseq_HVGData_log1p = log1p(hvg_data, copy=True)
Example #6
0
def filter_and_normalize(data,
                         min_counts=3,
                         min_counts_u=3,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         log=True,
                         plot=False,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        # same two thresholds applied to adata.layers['unspliced'], if present
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            # keep genes selected by sc.pp.filter_genes_dispersion(adata.X, ...)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 3)
        Forwarded as `min_counts` to `sc.pp.filter_genes`; skipped if `None`.
    min_counts_u: `int` (default: 3)
        Minimum summed count per gene in the `'unspliced'` layer; skipped if
        `None` or if the layer is absent.
    min_cells: `int` (default: `None`)
        Forwarded as `min_cells` to `sc.pp.filter_genes`; skipped if `None`.
    min_cells_u: `int` (default: `None`)
        Minimum number of entries > 0 per gene in the `'unspliced'` layer;
        skipped if `None` or if the layer is absent.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    plot: `bool` (default: `False`)
        Plot the dispersion filter result (only when that filter runs).
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `data` depending on `copy`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    def _subset_by_unspliced(threshold, count_cells):
        # Per-gene statistic on the unspliced layer: either the summed counts
        # or, when count_cells is set, the number of entries that are > 0.
        unspliced = adata.layers['unspliced']
        measure = unspliced > 0 if count_cells else unspliced
        if issparse(measure):
            per_gene = measure.sum(0).A1
        else:
            per_gene = measure.sum(0)
        adata._inplace_subset_var(per_gene >= threshold)

    # Gene filters on X.
    if min_counts is not None:
        filter_genes(adata, min_counts=min_counts)
    if min_cells is not None:
        filter_genes(adata, min_cells=min_cells)

    # Matching gene filters on the unspliced layer, when it exists.
    if 'unspliced' in adata.layers.keys():
        if min_counts_u is not None:
            _subset_by_unspliced(min_counts_u, count_cells=False)
        if min_cells_u is not None:
            _subset_by_unspliced(min_cells_u, count_cells=True)

    select_top_genes = (n_top_genes is not None
                        and n_top_genes < adata.shape[1])
    if select_top_genes:
        normalize_per_cell(adata)

        dispersion = filter_genes_dispersion(adata.X,
                                             n_top_genes=n_top_genes,
                                             log=False)
        if plot:
            from scanpy.plotting.preprocessing import filter_genes_dispersion as plot_filter_genes_dispersion
            plot_filter_genes_dispersion(dispersion, log=True)
        adata._inplace_subset_var(dispersion.gene_subset)

    normalize_per_cell(adata)
    if log:
        log1p(adata)

    if copy:
        return adata
    return None
Example #7
0
def recluster(adata,
              celltype,
              celltype_label='leiden',
              min_mean=0.0125,
              max_mean=4,
              min_disp=0.5,
              resolution=1.0,
              regress_out_key=None,
              random_seed=0,
              show_plot_filter=False,
              method='leiden'):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset (built by `_subset_adata`; per the original documentation it is
    initialized with the raw data contained in adata.raw). New highly
    variable genes are selected and a new clustering is performed. The function
    returns the adata subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the
    clusters that are to be selected for reclustering as a string or a tuple of strings to the parameter
    celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out; to select more than one,
        pass them as a tuple (a list is accepted as well).
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    resolution: `float` | default = 1.0
        resolution passed on to the chosen clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out before
        performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable gene
        detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset. Possible:louvain/leiden

    Returns
    -------

    AnnData object containing the subcluster annotated with PCA, nearest neighbors, the
    louvain or leiden clustering, and UMAP coordinates.

    Raises
    ------
    ValueError
        if `method` is neither 'leiden' nor 'louvain'.
    KeyError
        if `celltype_label` is not a column of adata.obs.
    SystemExit
        if `celltype` is not a string, tuple or list.

    Examples
    --------

    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy.api as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    labels = adata.obs.get(celltype_label)
    if labels is None:
        # Fail early with a clear message instead of silently building a
        # scalar `False` mask from `None == celltype`.
        raise KeyError(
            "column '" + str(celltype_label) + "' not found in adata.obs")

    if isinstance(celltype, str):
        selected_cells = labels == celltype
    elif isinstance(celltype, (tuple, list)):
        # `isin` selects exactly the requested labels. The previous seed mask
        # (`labels == 'NONE'`) also pulled in any cell literally labelled
        # 'NONE', even when that label was not requested.
        selected_cells = labels.isin(celltype)
    else:
        # NOTE(review): kept as sys.exit so the failure mode seen by existing
        # callers is unchanged; raising TypeError would be more conventional.
        sys.exit('specify cluster input as a string or tuple')
    cluster_subset = _subset_adata(adata, selected_cells)

    # Keep the unfiltered subset accessible through .raw.
    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    filter_result = filter_genes_dispersion(cluster_subset.X,
                                            min_mean=min_mean,
                                            max_mean=max_mean,
                                            min_disp=min_disp)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.gene_subset)),
          'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   filter_result.gene_subset,
                                   axis=1,
                                   raw=False)

    #perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    sc_pca(
        cluster_subset, random_state=random_seed, svd_solver='arpack'
    )  #using `svd_solver='arpack' ensures that the PCA leads to reproducible results
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    # `method` was validated above, so it is exactly one of the two options.
    if method == 'louvain':
        louvain(cluster_subset,
                resolution=resolution,
                random_state=random_seed)
    else:
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset
Example #8
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    The log transform is only applied when `adata.X` still looks unprocessed,
    i.e. when its first 100 stored values equal those of
    `adata.layers['spliced']` before any filtering took place.


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.

    Raises
    ------
    ValueError
        If the `'spliced'` or `'unspliced'` layer is missing.
    """
    adata = data.copy() if copy else data

    has_spliced = 'spliced' in adata.layers.keys()
    has_unspliced = 'unspliced' in adata.layers.keys()
    if not (has_spliced and has_unspliced):
        raise ValueError('Could not find spliced / unspliced counts.')

    # Heuristic taken before any filtering: X counts as unprocessed when its
    # leading stored values are identical to the spliced layer's.
    # NOTE(review): `.data[:100]` looks like it assumes scipy sparse matrices
    # for both X and the layer -- confirm dense input is never passed here.
    x_head = adata.X.data[:100]
    spliced_head = adata.layers['spliced'].data[:100]
    X_not_yet_processed = np.all(x_head == spliced_head)

    filter_genes(adata,
                 min_counts=min_counts,
                 min_counts_u=min_counts_u,
                 min_cells=min_cells,
                 min_cells_u=min_cells_u)
    normalize_per_cell(adata)
    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        if X_not_yet_processed:
            log1p(adata)
            logg.info('Logarithmized X.')
        else:
            logg.info('Did not modify X as it looks preprocessed already.')
    elif X_not_yet_processed:
        logg.info(
            'Consider logarithmizing adata.X with `scv.pp.log1p` for better results.'
        )

    if copy:
        return adata
    return None
Example #9
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    # Operate on a private copy only when the caller asked for one.
    if copy:
        adata = data.copy()
    else:
        adata = data

    gene_thresholds = dict(min_counts=min_counts,
                           min_counts_u=min_counts_u,
                           min_cells=min_cells,
                           min_cells_u=min_cells_u)
    filter_genes(adata, **gene_thresholds)
    normalize_per_cell(adata)

    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        log1p(adata)

    if not copy:
        return None
    return adata