Example #1
0
def filter_and_normalize(adata,
                         min_counts=10,
                         n_top_genes=None,
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            sc.pp.filter_genes_dispersion(adata, n_top_genes=n_top_genes)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)


    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 10)
        Threshold forwarded to `sc.pp.filter_genes` as `min_counts`.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep. The dispersion filter is skipped when this is
        `None` or not smaller than the number of genes left after
        `filter_genes`.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    # Honour `copy`: without this the caller's object was always modified in
    # place, and `copy=True` merely returned that same (mutated) object.
    adata = adata.copy() if copy else adata

    filter_genes(adata, min_counts=min_counts)
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        # Dispersion is computed on per-cell normalized values.
        normalize_per_cell(adata)
        filter_genes_dispersion(adata, n_top_genes=n_top_genes)
    # Normalize (again) after any gene filtering above.
    normalize_per_cell(adata)
    if log:
        log1p(adata)
    return adata if copy else None
Example #2
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         min_shared_counts=None,
                         min_shared_cells=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filter genes, normalize per cell and (optionally) log-transform `X`.

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    The following steps are executed

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    The log transform is only applied when `X` still looks like the spliced
    counts: the sum over the first 10 rows of `X` is compared with the sum over
    the first 10 rows of `layers['spliced']`. Otherwise `X` is left untouched
    and a warning is logged.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix; must contain the layers 'spliced' and 'unspliced'.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    min_shared_counts: `int`, optional (default: `None`)
        Minimum number of counts (in cells expressed simultaneously in unspliced and spliced) required for a gene.
    min_shared_cells: `int`, optional (default: `None`)
        Minimum number of cells required for a gene to be expressed simultaneously in unspliced and spliced.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `data` depending on `copy`.

    Raises
    ------
    ValueError
        If the 'spliced' or the 'unspliced' layer is missing.
    """
    adata = data.copy() if copy else data

    has_both_layers = ('spliced' in adata.layers.keys()
                       and 'unspliced' in adata.layers.keys())
    if not has_both_layers:
        raise ValueError('Could not find spliced / unspliced counts.')

    gene_thresholds = dict(
        min_counts=min_counts,
        min_counts_u=min_counts_u,
        min_cells=min_cells,
        min_cells_u=min_cells_u,
        min_shared_counts=min_shared_counts,
        min_shared_cells=min_shared_cells,
    )
    filter_genes(adata, **gene_thresholds)
    normalize_per_cell(adata)
    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    # X is considered "not yet logarithmized" when its leading rows still sum
    # to the same value as the corresponding rows of the spliced layer.
    head_sum_x = adata.X[:10].sum()
    head_sum_spliced = adata.layers['spliced'][:10].sum()
    log_advised = np.allclose(head_sum_x, head_sum_spliced)

    if log_advised:
        if log:
            log1p(adata)
            logg.info('Logarithmized X.')
        else:
            logg.warn(
                'Consider logarithmizing X with `scv.pp.log1p` for better results.'
            )
    elif log:
        logg.warn('Did not modify X as it looks preprocessed already.')

    if copy:
        return adata
    return None
Example #3
0
def filter_genes_dispersion(data,
                            flavor='seurat',
                            min_disp=None,
                            max_disp=None,
                            min_mean=None,
                            max_mean=None,
                            n_bins=20,
                            n_top_genes=None,
                            log=True,
                            copy=False):
    """Extract highly variable genes.

    For the flavors 'seurat' and 'cell_ranger' the work is delegated to
    `scanpy.api.pp.filter_genes_dispersion`: the normalized dispersion is
    obtained by scaling with the mean and standard deviation of the dispersions
    for genes falling into a given bin for mean expression of genes. This means
    that for each bin of mean expression, highly variable genes are selected.

    For the flavor 'svr' a support vector regression of log2(CV) on log2(mean)
    is fitted and the genes with the largest positive residuals are kept.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    flavor : {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data - the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized data
        - in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    min_disp, max_disp, min_mean, max_mean : `float`, optional (default: `None`)
        Cutoffs for the means and the normalized dispersions. They are passed
        through unchanged to scanpy and are not used by the 'svr' flavor.
        If `n_top_genes` unequals `None`, these cutoffs are ignored.
    n_bins : `int` (default: 20)
        Number of bins for binning the mean gene expression (passed to scanpy;
        not used by the 'svr' flavor).
    n_top_genes : `int` or `None` (default: `None`)
        Number of highly-variable genes to keep. Required (and >= 1) for the
        'svr' flavor.
    log : `bool`, optional (default: `True`)
        Use the logarithm of the mean to variance ratio (passed to scanpy; not
        used by the 'svr' flavor).
    copy : `bool`, optional (default: `False`)
        Determines whether a filtered copy is returned instead of filtering
        `data` in place.

    Returns
    -------
    The filtered copy if `copy` is `True`, otherwise `None` (`data` is then
    filtered in place).

    Raises
    ------
    ValueError
        If `flavor='svr'` is requested without a positive `n_top_genes`.
    """
    adata = data.copy() if copy else data
    set_initial_size(adata)

    if n_top_genes is not None and adata.n_vars < n_top_genes:
        logg.info(
            'Skip filtering by dispersion since number of variables are less than `n_top_genes`'
        )
    # String *equality*, not identity: `is 'svr'` only worked by accident of
    # string interning and silently fell through to scanpy otherwise.
    elif flavor == 'svr':
        if n_top_genes is None or n_top_genes < 1:
            raise ValueError(
                "flavor='svr' requires `n_top_genes` to be a positive integer.")

        if issparse(adata.X):
            mu = adata.X.mean(0).A1
            # std via E[x^2] - E[x]^2, avoids densifying the sparse matrix
            sigma = np.sqrt(adata.X.multiply(adata.X).mean(0).A1 - mu**2)
        else:
            mu = adata.X.mean(0)
            sigma = adata.X.std(0)
        log_mu = np.log2(mu)
        log_cv = np.log2(sigma / mu)

        from sklearn.svm import SVR
        clf = SVR(gamma=150. / len(mu))
        clf.fit(log_mu[:, None], log_cv)
        # Residual of the observed log-CV against the fitted mean trend.
        score = log_cv - clf.predict(log_mu[:, None])

        # The n-th largest score lives at index n - 1 of the descending sort.
        # Indexing with `n_top_genes` kept one gene too many and raised an
        # IndexError when `n_top_genes == n_vars`. Ties at the threshold may
        # still keep more than `n_top_genes` genes.
        nth_score = np.sort(score)[::-1][n_top_genes - 1]
        adata._inplace_subset_var(score >= nth_score)
    else:
        # Aliased so the scanpy function is not confused with this wrapper.
        from scanpy.api.pp import filter_genes_dispersion as _sc_filter_genes_dispersion
        _sc_filter_genes_dispersion(adata,
                                    flavor=flavor,
                                    min_disp=min_disp,
                                    max_disp=max_disp,
                                    min_mean=min_mean,
                                    max_mean=max_mean,
                                    n_bins=n_bins,
                                    n_top_genes=n_top_genes,
                                    log=log)
    return adata if copy else None
Example #4
0
 def FindHVG(self):
     """Run `filter_genes_dispersion` on the processed data and keep the result.

     Reads `self.scRNAseq_Propcessed`, requests a copy (`copy=True`) so the
     processed data object itself is not the one being filtered, and stores
     the returned object in `self.scRNAseq_HVGData`.
     """
     processed = self.scRNAseq_Propcessed
     hvg_data = filter_genes_dispersion(processed, copy=True)
     self.scRNAseq_HVGData = hvg_data
Example #5
0
def filter_and_normalize(data,
                         min_counts=3,
                         min_counts_u=3,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         log=True,
                         plot=False,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        # same two thresholds on the 'unspliced' layer, if present
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            sc.pp.filter_genes_dispersion(adata.X, n_top_genes=n_top_genes, log=False)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)

    Every threshold that is `None` skips the corresponding filter.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 3)
        Passed to `sc.pp.filter_genes` as `min_counts`.
    min_counts_u: `int` (default: 3)
        Minimum summed value in `layers['unspliced']` a gene needs to be kept.
    min_cells: `int` (default: `None`)
        Passed to `sc.pp.filter_genes` as `min_cells`.
    min_cells_u: `int` (default: `None`)
        Minimum number of cells with a positive `layers['unspliced']` value a
        gene needs to be kept.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    plot: `bool` (default: `False`)
        Plot the dispersion filter result with scanpy.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `data` depending on `copy`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    def _keep_genes_reaching(values, threshold):
        # Column-wise totals of `values`; genes below `threshold` are dropped.
        totals = values.sum(0)
        if issparse(values):
            totals = totals.A1
        adata._inplace_subset_var(totals >= threshold)

    # --- spliced (X) based gene filters ---
    if min_counts is not None:
        filter_genes(adata, min_counts=min_counts)
    if min_cells is not None:
        filter_genes(adata, min_cells=min_cells)

    # --- unspliced based gene filters ---
    if 'unspliced' in adata.layers.keys():
        if min_counts_u is not None:
            _keep_genes_reaching(adata.layers['unspliced'], min_counts_u)
        if min_cells_u is not None:
            # Booleans sum to the number of cells expressing the gene.
            _keep_genes_reaching(adata.layers['unspliced'] > 0, min_cells_u)

    # --- highly variable gene selection ---
    wants_top_genes = n_top_genes is not None and n_top_genes < adata.shape[1]
    if wants_top_genes:
        normalize_per_cell(adata)

        dispersion_result = filter_genes_dispersion(adata.X,
                                                    n_top_genes=n_top_genes,
                                                    log=False)
        if plot:
            from scanpy.plotting.preprocessing import filter_genes_dispersion as plot_filter_genes_dispersion
            plot_filter_genes_dispersion(dispersion_result, log=True)
        adata._inplace_subset_var(dispersion_result.gene_subset)

    normalize_per_cell(adata)
    if log:
        log1p(adata)

    if copy:
        return adata
    return None
Example #6
0
def recluster(adata,
              celltype,
              celltype_label='leiden',
              min_mean=0.0125,
              max_mean=4,
              min_disp=0.5,
              resolution=1.0,
              regress_out_key=None,
              random_seed=0,
              show_plot_filter=False,
              method='leiden'):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset (via `_subset_adata`; according to the original documentation
    this subset is initialized with the raw data contained in adata.raw). New
    highly variable genes are selected and a new clustering is performed. The
    function returns the adata subset with the new clustering annotation.

    This can be performed on leiden clusters by setting celltype_label = 'leiden' and passing the
    clusters that are to be selected for reclustering as a string or a tuple of strings to the
    parameter celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out; if more than one is to be
        selected pass them as a tuple (a list is accepted as well).
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly variable
    resolution: `float` | default = 1.0
        resolution passed to the chosen clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be regressed out before
        performing clustering. If None then no regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for highly variable gene
        detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset. Possible: louvain/leiden

    Returns
    -------

    AnnData object containing the subcluster annotated with PCA, nearest neighbors, louvain or
    leiden clusters (depending on `method`), and UMAP coordinates.

    Raises
    ------
    ValueError
        If `method` is neither 'leiden' nor 'louvain', or if `celltype_label` is not a column of
        adata.obs.
    SystemExit
        If `celltype` is neither a string nor a tuple/list.

    Examples
    --------

    For a more detailed example of the entire reclustering process please refer to the code examples.

    >>> import besca as bc
    >>> import scanpy.api as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])

    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    # NOTE(review): exiting the interpreter from library code is unusual; kept
    # as-is so callers relying on the SystemExit behaviour are not broken.
    if not isinstance(celltype, (str, tuple, list)):
        sys.exit('specify cluster input as a string or tuple')

    labels = adata.obs.get(celltype_label)
    if labels is None:
        # Previously a missing column silently produced a scalar `False` mask.
        raise ValueError(
            "column {!r} not found in adata.obs".format(celltype_label))

    if isinstance(celltype, str):
        selected_cells = labels == celltype
    else:
        # Plain membership test. The former implementation seeded the mask with
        # `labels == 'NONE'`, which wrongly pulled in any cell that happened to
        # be labelled 'NONE'.
        selected_cells = labels.isin(list(celltype))
    cluster_subset = _subset_adata(adata, selected_cells)

    cluster_subset.raw = cluster_subset

    #identify highly variable genes
    filter_result = filter_genes_dispersion(cluster_subset.X,
                                            min_mean=min_mean,
                                            max_mean=max_mean,
                                            min_disp=min_disp)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.gene_subset)),
          'highly variable genes selected within cluster')

    #apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   filter_result.gene_subset,
                                   axis=1,
                                   raw=False)

    #perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    # using `svd_solver='arpack'` ensures that the PCA leads to reproducible results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)
    if method == 'louvain':
        louvain(cluster_subset,
                resolution=resolution,
                random_state=random_seed)
    else:  # method == 'leiden', guaranteed by the validation above
        leiden(cluster_subset, resolution=resolution, random_state=random_seed)

    return cluster_subset
Example #7
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filter genes, normalize per cell and (optionally) log-transform.

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    The following steps are executed

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep; the dispersion filter is skipped when `None`.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `data` depending on `copy`.
    """
    adata = data.copy() if copy else data

    gene_thresholds = {
        'min_counts': min_counts,
        'min_counts_u': min_counts_u,
        'min_cells': min_cells,
        'min_cells_u': min_cells_u,
    }
    filter_genes(adata, **gene_thresholds)
    normalize_per_cell(adata)

    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        log1p(adata)

    if copy:
        return adata
    return None