Beispiel #1
0
def per_cell_normalize(adata, results_folder):
    """Normalize `adata` per cell, keep a log1p copy as raw, and export it.

    Steps, in order:

    1. `normalize_per_cell` is applied to `adata` with
       `counts_per_cell_after=1e4`.
    2. `log1p(adata, copy=True)` is assigned to `adata.raw`.
    3. `export_cp10k` is called with `basepath=results_folder`.

    Progress is printed to stdout and the elapsed time of the normalization
    and of the export is written to the log.

    Parameters
    ----------
    adata
        Data object handed to `normalize_per_cell`, `log1p` and
        `export_cp10k`.
    results_folder
        Passed on to `export_cp10k` as `basepath`.

    Returns
    -------
    The same `adata` object that was passed in.
    """
    # --- normalization -----------------------------------------------------
    normalization_began = time()

    # Normalize BEFORE saving "raw" - as recommended in the scanpy tutorial.
    normalize_per_cell(adata, counts_per_cell_after=1e4)
    print('adata normalized per cell')

    # Keep a log-transformed copy around as the raw reference.
    adata.raw = log1p(adata, copy=True)
    print('log1p values saved into adata.raw')

    logging.info('Per cell normalization completed successfully.')
    normalization_seconds = round(time() - normalization_began, 3)
    logging.info("\tTime for per-cell normalization: " +
                 str(normalization_seconds) + 's')

    # --- export ------------------------------------------------------------
    export_began = time()
    export_cp10k(adata, basepath=results_folder)

    logging.info('cp10k values exported to file.')
    export_seconds = round(time() - export_began, 3)
    logging.info("\tTime for cp10k export: " + str(export_seconds) + 's')

    return adata
Beispiel #2
0
def normalize_layers(data,
                     layers=['spliced', 'unspliced'],
                     counts_per_cell_after=None,
                     max_proportion_per_cell=None,
                     by_total_size=None,
                     enforce=False,
                     copy=False):
    """Normalize the given layers of `data` per cell.

    A layer is processed when `not_yet_normalized` reports it as
    un-normalized, or unconditionally when `enforce` is true. The per-cell
    sizes come from `get_initial_size`; zero sizes are bumped to one before
    they are handed to scanpy's `normalize_per_cell`.

    Parameters
    ----------
    data
        Annotated data object exposing `.layers`, `.X` and `.copy()`.
    layers
        Keys of `data.layers` to normalize.
    counts_per_cell_after
        Forwarded to scanpy's `normalize_per_cell` as the target size.
    max_proportion_per_cell
        If strictly between 0 and 1, the sizes are recomputed with
        `counts_per_cell_quantile` (evaluated on `adata.X`).
    by_total_size
        Forwarded to `get_initial_size`.
    enforce
        Normalize even if a layer already looks normalized.
    copy
        Work on (and return) a copy instead of modifying `data` in place.

    Returns
    -------
    The normalized copy if `copy` is true, otherwise `None`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import normalize_per_cell

    for key in layers:
        needs_normalization = not_yet_normalized(adata.layers[key]) or enforce
        if not needs_normalization:
            continue

        cell_sizes = get_initial_size(adata, key, by_total_size)

        proportion_given = max_proportion_per_cell is not None
        if proportion_given and 0 < max_proportion_per_cell < 1:
            cell_sizes = counts_per_cell_quantile(adata.X,
                                                  max_proportion_per_cell,
                                                  cell_sizes)

        # Replace zero sizes by one (in place) so that no cell is divided by 0.
        cell_sizes += cell_sizes == 0

        adata.layers[key] = normalize_per_cell(adata.layers[key],
                                               counts_per_cell_after,
                                               cell_sizes,
                                               copy=True)

    if copy:
        return adata
    return None
Beispiel #3
0
def normalize_layers(data,
                     layers={'spliced', 'unspliced'},
                     by_total_size=None,
                     max_proportion_per_cell=None,
                     copy=False):
    """Normalize the given layers of `data` per cell.

    Only layers whose sampled entries still look like integer counts are
    normalized; all others are left untouched.

    Parameters
    ----------
    data
        Annotated data object exposing `.layers`, `.X` and `.copy()`.
    layers
        Keys of `data.layers` to normalize.
    by_total_size
        Forwarded to `get_initial_size`.
    max_proportion_per_cell
        If strictly between 0 and 1, the sizes are recomputed with
        `counts_per_cell_quantile` (evaluated on `adata.X`).
    copy
        Work on (and return) a copy instead of modifying `data` in place.

    Returns
    -------
    The normalized copy if `copy` is true, otherwise `None`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import normalize_per_cell

    def looks_like_raw_counts(adata, layer):
        # Sample the first ten stored values of a sparse matrix, or the first
        # row of a dense one, and check that `value % 1` is within 1e-3 of 0.
        matrix = adata.layers[layer]
        if issparse(matrix):
            sample = matrix.data[:10]
        else:
            sample = matrix[0]
        return np.allclose(sample % 1, 0, atol=1e-3)

    for key in layers:
        if not looks_like_raw_counts(adata, key):
            continue

        cell_sizes = get_initial_size(adata, key, by_total_size)

        proportion_given = max_proportion_per_cell is not None
        if proportion_given and 0 < max_proportion_per_cell < 1:
            cell_sizes = counts_per_cell_quantile(adata.X,
                                                  max_proportion_per_cell,
                                                  cell_sizes)

        adata.layers[key] = normalize_per_cell(adata.layers[key],
                                               None,
                                               cell_sizes,
                                               copy=True)

    if copy:
        return adata
    return None
Beispiel #4
0
def normalize_layers(adata, layers={'spliced', 'unspliced'}, copy=False):
    """Normalize each requested layer by its total counts per cell.

    For every key in `layers`, the per-cell counts of that layer are obtained
    from scanpy's `filter_cells(..., min_counts=1)` and passed to scanpy's
    `normalize_per_cell` with `counts_per_cell_after=None`.

    Parameters
    ----------
    adata
        Annotated data object exposing `.layers` and `.copy()`.
    layers
        Keys of `adata.layers` to normalize.
    copy
        If true, the input is left untouched and a normalized copy is
        returned; otherwise `adata` is modified in place.

    Returns
    -------
    The normalized copy if `copy` is true, otherwise `None`.
    """
    from scanpy.api.pp import normalize_per_cell, filter_cells

    # Honour `copy`: without this, `copy=True` would still modify the caller's
    # object and merely hand the same object back.
    adata = adata.copy() if copy else adata

    for layer in layers:
        # Only the second element (used as counts per cell) is needed; the
        # first element returned by `filter_cells` is discarded.
        _, counts = filter_cells(adata.layers[layer], min_counts=1)
        adata.layers[layer] = normalize_per_cell(adata.layers[layer],
                                                 None,
                                                 counts,
                                                 copy=True)
    return adata if copy else None
Beispiel #5
0
def filter_and_normalize(adata,
                         min_counts=10,
                         n_top_genes=None,
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        sc.pp.filter_genes(adata, min_counts=min_counts)
        if n_top_genes is not None and n_top_genes < adata.shape[1]:
            sc.pp.normalize_per_cell(adata)
            sc.pp.filter_genes_dispersion(adata, n_top_genes=n_top_genes)
        sc.pp.normalize_per_cell(adata)
        if log: sc.pp.log1p(adata)


    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 10)
        Passed to `sc.pp.filter_genes` as `min_counts`.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep. If `None`, or not smaller than the number of
        genes left after filtering, the dispersion-based selection is skipped.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    # Honour `copy`: without this, `copy=True` would still modify the caller's
    # object and merely hand the same object back.
    adata = adata.copy() if copy else adata

    filter_genes(adata, min_counts=min_counts)
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        # Normalize first, then select genes by dispersion.
        normalize_per_cell(adata)
        filter_genes_dispersion(adata, n_top_genes=n_top_genes)
    normalize_per_cell(adata)
    if log:
        log1p(adata)
    return adata if copy else None
Beispiel #6
0
def normalize_per_cell(data,
                       counts_per_cell_after=None,
                       counts_per_cell=None,
                       key_n_counts=None,
                       max_proportion_per_cell=None,
                       layers={'spliced', 'unspliced'},
                       copy=False):
    """Normalize `data` per cell, then normalize the given layers as well.

    This wraps scanpy's `normalize_per_cell` for the main data object and
    afterwards calls `normalize_layers` for `layers`.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    counts_per_cell_after : optional (default: `None`)
        Forwarded to scanpy's `normalize_per_cell`.
    counts_per_cell : optional (default: `None`)
        Precomputed counts per cell, forwarded to scanpy's
        `normalize_per_cell`. Overwritten when `max_proportion_per_cell` is
        strictly between 0 and 1.
    key_n_counts : optional (default: `None`)
        Forwarded to scanpy's `normalize_per_cell`.
    max_proportion_per_cell : optional (default: `None`)
        If strictly between 0 and 1, counts per cell are computed with
        `counts_per_cell_quantile(adata.X, max_proportion_per_cell)`, e.g. 0.05.
    layers : optional (default: `{'spliced', 'unspliced'}`)
        Layer keys handed to `normalize_layers`.
    copy : `bool`, optional (default: `False`)
        Work on (and return) a copy instead of modifying `data` in place.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data
    # Imported under an alias so it does not shadow this wrapper's own name.
    from scanpy.api.pp import normalize_per_cell as scanpy_normalize_per_cell

    proportion_given = max_proportion_per_cell is not None
    if proportion_given and 0 < max_proportion_per_cell < 1:
        counts_per_cell = counts_per_cell_quantile(adata.X,
                                                   max_proportion_per_cell)

    scanpy_normalize_per_cell(adata, counts_per_cell_after, counts_per_cell,
                              key_n_counts)
    normalize_layers(adata, layers, max_proportion_per_cell)

    if copy:
        return adata
    return None
Beispiel #7
0
 def Normalized_per_Cell(self):
     """Store the per-cell normalized copy of the counts on the instance.

     Calls `normalize_per_cell` on `self.scRNAseq_Counts` with `copy=True`
     and keeps the result in `self.scRNAseq_Propcessed`.
     """
     normalized_counts = normalize_per_cell(self.scRNAseq_Counts, copy=True)
     self.scRNAseq_Propcessed = normalized_counts
Beispiel #8
0
def filter_and_normalize(data,
                         min_counts=3,
                         min_counts_u=3,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         log=True,
                         plot=False,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    1. Filter genes with scanpy's `filter_genes` by `min_counts` and by
       `min_cells` (each only if not `None`).
    2. If an `'unspliced'` layer exists, filter genes on that layer by
       `min_counts_u` and by `min_cells_u` (each only if not `None`).
    3. If `n_top_genes` is given and smaller than the number of remaining
       genes: normalize per cell, run scanpy's `filter_genes_dispersion` on
       `adata.X` with `log=False`, optionally plot the result, and keep only
       the selected genes.
    4. Normalize per cell.
    5. If `log`, apply `log1p`.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: 3)
        Passed to `filter_genes` as `min_counts`.
    min_counts_u: `int` (default: 3)
        Minimum summed value per gene in the `'unspliced'` layer.
    min_cells: `int` (default: `None`)
        Passed to `filter_genes` as `min_cells`.
    min_cells_u: `int` (default: `None`)
        Minimum number of positive entries per gene in the `'unspliced'` layer.
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    log: `bool` (default: `True`)
        Take logarithm.
    plot: `bool` (default: `False`)
        Plot the dispersion filter result with scanpy.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data
    from scanpy.api.pp import filter_genes, filter_genes_dispersion, normalize_per_cell, log1p

    def filter_genes_u(adata, min_counts_u=None, min_cells_u=None):
        # With a count threshold the raw layer values are summed per gene;
        # otherwise the number of positive entries per gene is counted.
        unspliced = adata.layers['unspliced']
        if min_counts_u is not None:
            per_gene_source = unspliced
            threshold = min_counts_u
        else:
            per_gene_source = unspliced > 0
            threshold = min_cells_u

        if issparse(per_gene_source):
            per_gene = per_gene_source.sum(0).A1
        else:
            per_gene = per_gene_source.sum(0)
        adata._inplace_subset_var(per_gene >= threshold)

    # Gene filtering on the main matrix.
    if min_counts is not None:
        filter_genes(adata, min_counts=min_counts)
    if min_cells is not None:
        filter_genes(adata, min_cells=min_cells)

    # Gene filtering on the unspliced layer, when present.
    if 'unspliced' in adata.layers.keys():
        if min_counts_u is not None:
            filter_genes_u(adata, min_counts_u=min_counts_u)
        if min_cells_u is not None:
            filter_genes_u(adata, min_cells_u=min_cells_u)

    # Dispersion-based gene selection.
    if n_top_genes is not None and n_top_genes < adata.shape[1]:
        normalize_per_cell(adata)

        dispersion_result = filter_genes_dispersion(adata.X,
                                                    n_top_genes=n_top_genes,
                                                    log=False)
        if plot:
            from scanpy.plotting.preprocessing import filter_genes_dispersion as plot_filter_genes_dispersion
            plot_filter_genes_dispersion(dispersion_result, log=True)
        adata._inplace_subset_var(dispersion_result.gene_subset)

    normalize_per_cell(adata)
    if log:
        log1p(adata)

    if copy:
        return adata
    return None
Beispiel #9
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    The log transform is only applied when `adata.X` still looks unprocessed,
    which is decided up front by comparing the first 100 stored values of
    `adata.X` with those of the `'spliced'` layer.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.

    Raises
    ------
    ValueError
        If the `'spliced'` or the `'unspliced'` layer is missing.
    """
    adata = data.copy() if copy else data

    has_both_layers = ('spliced' in adata.layers.keys()
                       and 'unspliced' in adata.layers.keys())
    if not has_both_layers:
        raise ValueError('Could not find spliced / unspliced counts.')

    # X counts as unprocessed while its leading stored values still equal
    # those of the spliced layer.
    x_head = adata.X.data[:100]
    spliced_head = adata.layers['spliced'].data[:100]
    X_not_yet_processed = np.all(x_head == spliced_head)

    gene_thresholds = dict(min_counts=min_counts,
                           min_counts_u=min_counts_u,
                           min_cells=min_cells,
                           min_cells_u=min_cells_u)
    filter_genes(adata, **gene_thresholds)
    normalize_per_cell(adata)
    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    if log:
        if X_not_yet_processed:
            log1p(adata)
            logg.info('Logarithmized X.')
        else:
            logg.info('Did not modify X as it looks preprocessed already.')
    elif X_not_yet_processed:
        logg.info(
            'Consider logarithmizing adata.X with `scv.pp.log1p` for better results.'
        )

    if copy:
        return adata
    return None
Beispiel #10
0
def normalize_per_cell(data,
                       counts_per_cell_after=None,
                       counts_per_cell=None,
                       key_n_counts=None,
                       max_proportion_per_cell=None,
                       layers=['spliced', 'unspliced'],
                       enforce=False,
                       copy=False):
    """Normalize each cell by total counts over all genes.

    `adata.X` is normalized with scanpy's `normalize_per_cell` and the given
    layers with `normalize_layers`. Data that already looks normalized
    (according to `not_yet_normalized`) is skipped unless `enforce` is set.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell. Overwritten when
        `max_proportion_per_cell` is strictly between 0 and 1.
    key_n_counts : `str`, optional (default: `None`)
        Forwarded to scanpy's `normalize_per_cell`: name of the field in
        `adata.obs` where the total counts per cell are stored.
    max_proportion_per_cell : `float` (default: `None`)
        Exclude genes counts that account for more than a specific proportion of cell size, e.g. 0.05.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization. When a list
        is given, keys missing from `adata.layers` are ignored.
    enforce : `bool`, optional (default: `False`)
        Normalize even if the data looks normalized already.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original `adata.X`, depending on `copy`.
    """
    adata = data.copy() if copy else data
    # Deliberately shadows this wrapper's name with the scanpy routine.
    from scanpy.api.pp import normalize_per_cell

    if max_proportion_per_cell is not None and (0 < max_proportion_per_cell <
                                                1):
        counts_per_cell = counts_per_cell_quantile(adata.X,
                                                   max_proportion_per_cell)

    if not_yet_normalized(adata.X) or enforce:
        normalize_per_cell(adata, counts_per_cell_after, counts_per_cell,
                           key_n_counts)
        add_msg_str = 'X and '
    else:
        add_msg_str = ''

    layers = [layers] if isinstance(layers, str) else [
        layer for layer in layers if layer in adata.layers.keys()
    ]
    if all(not_yet_normalized(adata.layers[layer])
           for layer in layers) or enforce:
        # NOTE(review): `enforce` is not forwarded to `normalize_layers` here;
        # confirm whether the callee needs it to actually re-normalize layers.
        normalize_layers(adata, layers, counts_per_cell_after,
                         max_proportion_per_cell)
        logg.info('Normalized ' + add_msg_str +
                  'spliced/unspliced count data.')
    else:
        # The two literals are concatenated, so the first one must end with a
        # space to keep the sentences apart in the emitted message.
        logg.info(
            'Looks like it\'s already normalized. '
            'If you want to (re-)normalize your data, use `scv.pp.normalize_per_cell(adata, enforce=True)`.'
        )

    return adata if copy else None
Beispiel #11
0
def filter_and_normalize(data,
                         min_counts=None,
                         min_counts_u=None,
                         min_cells=None,
                         min_cells_u=None,
                         n_top_genes=None,
                         flavor='seurat',
                         log=True,
                         copy=False):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering (unspliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    copy: `bool` (default: `False`)
        Return a copy of `data` instead of updating it.

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data

    # Step 1: gene filtering with all four thresholds forwarded as given.
    gene_thresholds = dict(min_counts=min_counts,
                           min_counts_u=min_counts_u,
                           min_cells=min_cells,
                           min_cells_u=min_cells_u)
    filter_genes(adata, **gene_thresholds)

    # Step 2: per-cell normalization.
    normalize_per_cell(adata)

    # Step 3: optional dispersion-based gene selection.
    if n_top_genes is not None:
        filter_genes_dispersion(adata, n_top_genes=n_top_genes, flavor=flavor)

    # Step 4: optional log transform.
    if log:
        log1p(adata)

    if copy:
        return adata
    return None