Example #1
def normalize_per_cell(data, counts_per_cell_after=None, counts_per_cell=None, key_n_counts=None,
                       max_proportion_per_cell=None, use_initial_size=True, layers=['spliced', 'unspliced'],
                       enforce=False, copy=False):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    max_proportion_per_cell : `float` or `None` (default: `None`)
        Exclude gene counts that account for more than a specific proportion of a cell's total counts, e.g. 0.05.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization.
    enforce : `bool` (default: `False`)
        Whether to normalize layers even if they already appear to be normalized.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original `adata.X`, depending on `copy`.
    """
    adata = data.copy() if copy else data
    layers = adata.layers.keys() if layers == 'all' else [layers] if isinstance(layers, str) \
        else [layer for layer in layers if layer in adata.layers.keys()]
    layers = ['X'] + layers
    modified_layers = []

    for layer in layers:
        X = adata.X if layer == 'X' else adata.layers[layer]
        if not_yet_normalized(X) or enforce:
            counts = counts_per_cell if counts_per_cell is not None \
                else get_initial_size(adata, layer) if use_initial_size else get_size(adata, layer)
            if max_proportion_per_cell is not None and (0 < max_proportion_per_cell < 1):
                counts = counts_per_cell_quantile(X, max_proportion_per_cell, counts)
            # equivalent to scanpy.pp.normalize_per_cell(X, counts_per_cell_after, counts)
            counts_after = np.median(counts) if counts_per_cell_after is None else counts_per_cell_after
            counts /= counts_after + (counts_after == 0)
            counts += counts == 0  # to avoid division by zero
            if issparse(X):
                sparsefuncs.inplace_row_scale(X, 1 / counts)
            else:
                X /= np.array(counts[:, None])
            modified_layers.append(layer)

    adata.obs['n_counts' if key_n_counts is None else key_n_counts] = get_size(adata)
    if len(modified_layers) > 0:
        logg.info('Normalized count data:', ', '.join(modified_layers) + '.')

    return adata if copy else None
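The scVelo example above boils down to one step: compute per-cell totals, divide by their median (or a given target), and row-scale the matrix in place. The snippet below is a minimal, self-contained sketch of that step on a toy sparse matrix; the toy data and variable names are illustrative, not part of the original function.

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

X = sp.csr_matrix(np.array([[1., 0., 2.],
                            [3., 0., 0.],
                            [5., 6., 0.]]))

counts = np.ravel(X.sum(axis=1))       # total counts per cell: [3, 3, 11]
after = np.median(counts[counts > 0])  # target total per cell (median = 3)
scale = counts / after
scale[scale == 0] = 1                  # guard empty cells against division by zero
sparsefuncs.inplace_row_scale(X, 1 / scale)

print(np.ravel(X.sum(axis=1)))         # every cell now sums to ~3.0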
Example #2
def normalize_knn_graph(knn):
    """normalize the knn graph so that each row will be sum up to 1."""
    knn.setdiag(1)
    knn = knn.astype("float32")
    sparsefuncs.inplace_row_scale(knn, 1 / knn.sum(axis=1).A1)

    return knn
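The same inplace_row_scale call can make any sparse graph row-stochastic. A small self-contained sketch with a toy adjacency matrix (illustrative data only):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import inplace_row_scale

knn = sp.csr_matrix(np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]], dtype=np.float32))
knn.setdiag(1)                    # include self-connections, as in the example above
inplace_row_scale(knn, 1 / knn.sum(axis=1).A1)  # .A1 flattens the matrix returned by .sum
print(knn.toarray().sum(axis=1))  # ~[1. 1. 1.]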
Example #3
def normalize_by_idf(matrix):
    numbcs_per_feature = matrix.get_numbcs_per_feature()
    scaling_factors_row = np.log(matrix.bcs_dim + 1) - np.log(1 + numbcs_per_feature) 

    m = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_row_scale(m, scaling_factors_row)

    return m
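Since get_numbcs_per_feature() and bcs_dim come from Cell Ranger's feature-barcode matrix class (not shown here), the sketch below recomputes the same IDF weights directly on a plain scipy matrix; rows are features, columns are barcodes, and the toy values are purely illustrative.

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

m = sp.csr_matrix(np.array([[1., 0., 3., 0.],
                            [2., 2., 2., 2.],
                            [0., 0., 0., 5.]]))

numbcs_per_feature = m.getnnz(axis=1)                          # barcodes in which each feature is detected
idf = np.log(m.shape[1] + 1) - np.log(1 + numbcs_per_feature)

m = m.astype(np.float64)
sparsefuncs.inplace_row_scale(m, idf)
# note: a feature detected in every barcode gets weight 0; Example #12 below guards
# against the barcodes this can zero out entirely.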
Example #4
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    after = np.median(counts[counts > 0]) if after is None else after
    counts += (counts == 0)
    counts /= after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    else:
        X /= counts[:, None]
    return X if copy else None
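A hypothetical usage of the _normalize_data helper above, assuming its own imports (numpy as np, issparse, sparsefuncs) are in scope; note that the helper mutates counts, so pass a copy if the raw totals are still needed.

import numpy as np
import scipy.sparse as sp

X_dense = np.array([[1., 0.], [3., 0.], [5., 6.]])
counts = X_dense.sum(axis=1)                          # [1, 3, 11]
X_norm = _normalize_data(X_dense, counts.copy(), copy=True)
print(X_norm.sum(axis=1))                             # every cell now totals the median, 3.0

X_sparse = sp.csr_matrix([[1., 0.], [3., 0.], [5., 6.]])
_normalize_data(X_sparse, X_sparse.sum(axis=1).A1, after=1e4)  # in-place, target 10,000 counts per cell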
Example #5
def normalize_per_cell(data, counts_per_cell_after=None, copy=False,
                       counts_per_cell=None, field_name_counts=None):
    """Normalize each cell.

    Normalize each cell by UMI count, so that every cell has the same total
    count.

    Similar functions are used, for example, by Cell Ranger [Zheng17], Seurat
    [Satija15], or SPRING [Weinreb17].

    Parameters
    ----------
    data : array_like, sparse or AnnData
        Data matrix. Rows correspond to cells and columns to genes.
    counts_per_cell_after : float or None (default: None)
        If None, after normalization, each cell has a total count equal
        to the median of the counts_per_cell before normalization.
    counts_per_cell : array (default: None)
        Precomputed counts per cell.
    copy : bool (default: False)
        Determines whether function operates inplace (default) or a copy is
        returned.

    Returns
    -------
    Returns or updates ``adata`` with normalized version of the original ``adata.X``,
    depending on `copy`.
    """
    if field_name_counts is None: field_name_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.m('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = filter_cells(adata.X, min_counts=1)
        adata.smp[field_name_counts] = counts_per_cell
        adata.inplace_subset_smp(cell_subset)
        normalize_per_cell(adata.X, counts_per_cell_after, copy,
                           counts_per_cell=counts_per_cell[cell_subset])
        logg.m('    finished', t=True, end=' ')
        logg.m('normalized adata.X and added', no_indent=True)
        logg.m('    "{}", counts per cell before normalization (adata.smp)'
               .format(field_name_counts),
               no_indent=True)
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        cell_subset, counts_per_cell = filter_cells(X, min_counts=1)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    counts_per_cell /= counts_per_cell_after
    if not issparse(X): X /= counts_per_cell[:, np.newaxis]
    else: sparsefuncs.inplace_row_scale(X, 1/counts_per_cell)
    return X if copy else None
Example #6
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    if issubclass(X.dtype.type, (int, np.integer)):
        X = X.astype(np.float32)  # TODO: Check if float64 should be used
    counts = np.asarray(counts)  # dask doesn't do medians
    after = np.median(counts[counts > 0], axis=0) if after is None else after
    counts += (counts == 0)
    counts = counts / after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    else:
        np.divide(X, counts[:, None], out=X)
    return X
Example #7
def test_inplace_row_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(100)
    XA *= scale.reshape(-1, 1)

    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale.reshape(-1, 1)
    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
Example #8
def ClusterSpecificGenes(
    adata, genes, obs
):  #Use obs = 'Type_num' for atlas and obs = 'Type_iGB' for all other time points
    all_var_genes = genes
    percentages = np.zeros(
        (len(all_var_genes), len(adata.obs[obs].values.categories)))
    all_var_genes_index = []
    for i in all_var_genes:
        all_var_genes_index.append(np.where(adata.var.index.values == i)[0][0])
    clusters = list(adata.obs[obs].values.categories)
    for index, value in enumerate(clusters):
        cells_in_clust = adata.obs.index[adata.obs[obs].values == value]
        cells_in_clust_index = []
        for i in cells_in_clust:
            cells_in_clust_index.append(
                np.where(adata.obs.index.values == i)[0][0])
        percentages[:, index] = adata.layers['raw'][
            cells_in_clust_index, :][:, all_var_genes_index].getnnz(
                axis=0) / len(cells_in_clust_index)
    var_genes = []
    var_genes_index = []
    for i in range(len(all_var_genes_index)):
        if any(p > 0.3 for p in percentages[i, :]):
            var_genes.append(all_var_genes[i])
            var_genes_index.append(all_var_genes_index[i])
    X = adata.layers['raw'].copy()
    counts_per_cell = X.sum(1)
    counts_per_cell = np.ravel(counts_per_cell)
    counts = np.asarray(counts_per_cell)
    after = np.median(counts[counts > 0], axis=0)
    counts += (counts == 0)
    counts = counts / after
    sparsefuncs.inplace_row_scale(X, 1 / counts)
    E = np.zeros((len(var_genes), len(adata.obs[obs].values.categories)))
    for index, value in enumerate(clusters):
        cells_in_clust = adata.obs.index[adata.obs[obs].values == value]
        cells_in_clust_index = []
        for i in cells_in_clust:
            cells_in_clust_index.append(
                np.where(adata.obs.index.values == i)[0][0])
        E[:,
          index] = np.log(X[cells_in_clust_index, :][:, var_genes_index].mean(
              axis=0) + 1)
    a = np.zeros(len(var_genes))
    for i in range(len(a)):
        ranking_E = np.sort(E[i, :])
        a[i] = np.mean(ranking_E[-7:]) / np.mean(ranking_E[:7])
    to_return = list(np.array(var_genes)[a > 8])

    return to_return
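The key sparse trick in the function above is getnnz(axis=0) on a row-subset: it counts, per gene, how many cells of a cluster have a nonzero entry. A small illustrative sketch (toy counts and cluster labels):

import numpy as np
import scipy.sparse as sp

raw = sp.csr_matrix(np.array([[1., 0., 2.],
                              [0., 0., 3.],
                              [4., 5., 0.],
                              [1., 0., 0.]]))
labels = np.array(["A", "A", "B", "B"])

for cluster in np.unique(labels):
    idx = np.where(labels == cluster)[0]
    frac = raw[idx, :].getnnz(axis=0) / len(idx)  # fraction of the cluster's cells expressing each gene
    print(cluster, frac)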
Example #9
def test_inplace_row_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(100)
    XA *= scale.reshape(-1, 1)

    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale.reshape(-1, 1)
    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    assert_raises(TypeError, inplace_column_scale, X.tolil(), scale)
Example #10
def normalize_and_log_the_raw_matrix( adata, counts_per_cell_after = 1e4 ):
    '''Normalize each cell's gene-expression counts to counts_per_cell_after total counts, then log1p-transform the raw matrix.'''
    if check_if_raw_matrix_is_logged( adata ):
        print('normalize_and_log_the_raw_matrix:: matrix is already logged')
        return adata

    print('Normalizing and logging matrix...')

    ft_varname = pmhc_scoring.get_feature_types_varname(adata)
    if ft_varname:
        ngenes = sum( adata.raw.var[ft_varname] != 'Antibody Capture' )
    else:
        ngenes = adata.raw.shape[1]

    X_gex = adata.raw.X[:,:ngenes]
    X_ab  = adata.raw.X[:,ngenes:]

    counts_per_cell = np.sum( X_gex, axis=1 ).A1 # A1 since X_gex is sparse
    assert np.min( counts_per_cell ) > 0
    if np.median( counts_per_cell ) < 100:
        print('WARNING normalize_and_log_the_raw_matrix: low median counts_per_cell.', np.median(counts_per_cell),'\n',
              'has the matrix already been log1p-ed???')
        exit()

    counts_per_cell /= counts_per_cell_after

    sparsefuncs.inplace_row_scale(X_gex, 1/counts_per_cell)

    new_counts_per_cell = np.sum( X_gex, axis=1 ).A1 # A1 since X_gex is sparse
    assert min(new_counts_per_cell) > counts_per_cell_after-1 and max(new_counts_per_cell) < counts_per_cell_after+1

    new_X = scipy.sparse.hstack( [X_gex, X_ab], format="csr" )
    np.log1p( new_X.data, out = new_X.data )

    adata_new = AnnData( X = new_X, obs = adata.obs, var = adata.raw.var )

    adata.raw = adata_new

    set_raw_matrix_is_logged_to_true( adata )

    #print(adata)
    return adata
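The log1p step at the end relies on log1p(0) == 0, so it can be applied to the stored .data of a sparse matrix without densifying. A tiny sketch with toy values:

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[0., 1., 9.],
                            [3., 0., 0.]]))
np.log1p(X.data, out=X.data)  # in-place log1p on the nonzero entries only
print(X.toarray())            # zeros untouched, nonzeros become log(1 + x)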
Example #11
def _normalize_data(X, counts, after=None, copy=False):
    X = X.copy() if copy else X
    if issubclass(X.dtype.type, (int, np.integer)):
        X = X.astype(np.float32)  # TODO: Check if float64 should be used
    if isinstance(counts, DaskArray):
        counts_greater_than_zero = counts[counts > 0].compute_chunk_sizes()
    else:
        counts_greater_than_zero = counts[counts > 0]

    after = np.median(counts_greater_than_zero,
                      axis=0) if after is None else after
    counts += counts == 0
    counts = counts / after
    if issparse(X):
        sparsefuncs.inplace_row_scale(X, 1 / counts)
    elif isinstance(counts, np.ndarray):
        np.divide(X, counts[:, None], out=X)
    else:
        X = np.divide(X, counts[:, None])  # dask does not support kwarg "out"
    return X
Example #12
def normalize_by_idf(matrix):
    numbcs_per_feature = matrix.get_numbcs_per_feature()
    scaling_factors_row = np.log(matrix.bcs_dim +
                                 1) - np.log(1 + numbcs_per_feature)

    m = matrix.m.copy().astype(np.float64)
    sparsefuncs.inplace_row_scale(m, scaling_factors_row)

    # Extremely Rare Case (1 out of 1000s of samples tested):
    # Either the scaling or the count may be zero for all features for some barcode
    # This would lead to zero-ing out entire barcode upon normalization, which leads to a null projection as well.
    # This is harmful to analysis code that depends on at least a non-zero norm for each barcode (e.g. spherical clustering and normalized tsne)
    # We sprinkle in a small value that ensures an nnz for the all-zero barcode, after finding such barcodes.

    # find zeroed barcodes and assign nnz to first feature (these barcodes are indistinguishable anyway)
    # We run the very small risk of making it similar to another barcode that is also nnz in the first feature only
    zeroed = np.where(np.squeeze(np.asarray(m.sum(axis=0))) == 0)
    for bc_ix in zeroed:
        m[0, bc_ix] = 1e-15

    return m
Example #13
    def fit(self, urm):
        """
        Train the recommender with a list of known interactions playlist - track
        :param urm: the user rating matrix
        """
        print('Training Top Pop Followers...')
        self.urm = urm

        # Remove duplicates
        self.urm.data = np.ones(len(self.urm.data))

        # Normalize
        self.followers = normalize(self.followers.reshape(
            self.followers.shape[0], -1),
                                   norm='l2',
                                   axis=0).reshape(self.followers.shape)

        # Scale urm according to followers
        inplace_row_scale(urm, self.followers)

        self.track_weighted = urm.sum(axis=0)
        self.track_weighted = np.squeeze(np.asarray(self.track_weighted))

        self.popular_tracks = np.argsort(self.track_weighted)[::-1][:10000]
Example #14
 def __apply_tf__(icm):
     if tf_type == 'none' or tf_type =='raw' or tf_type=='':
         pass
     elif tf_type == 'binary':
         icm.data = ones(len(icm.data))
     elif tf_type=='tf_normal':
         skfun.inplace_row_scale(icm, 1/nt_dupli)
     elif tf_type=='tf_duplicates':
         skfun.inplace_row_scale(icm, 1/nt)
     elif tf_type=='tf_elduplicates':
         icm.data = ones(len(icm.data))
         skfun.inplace_row_scale(icm, 1/nt)
     elif tf_type== 'log':
         icm.data += ones(len(icm.data))
         icm.data = log(icm.data)
     elif tf_type =='double_k':
         max_per_playlist = np.maximum.reduceat(icm.data, icm.indptr[:-1])
         max_per_playlist[np.diff(icm.indptr) == 0] = 0
         skfun.inplace_row_scale(icm, k / max_per_playlist)
         icm.data = k + icm.data
     else:
         raise AttributeError("unknown tf_type: [" + tf_type + "]")
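The 'double_k' branch uses np.maximum.reduceat over the CSR indptr to get each row's maximum stored value. A self-contained sketch of that trick on a toy matrix (k and the data are illustrative; the extra zero-guard only silences the divide-by-zero warning, since empty rows have no stored entries to scale anyway):

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

icm = sp.csr_matrix(np.array([[2., 0., 4.],
                              [0., 0., 0.],
                              [1., 5., 0.]]))
k = 0.5

max_per_row = np.maximum.reduceat(icm.data, icm.indptr[:-1])
max_per_row[np.diff(icm.indptr) == 0] = 0              # empty rows have no meaningful max
safe_max = np.where(max_per_row == 0, 1, max_per_row)  # avoid a divide-by-zero warning
sparsefuncs.inplace_row_scale(icm, k / safe_max)
icm.data = k + icm.data                                # shift stored entries, as in the example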
Example #15
def kernels_from_velocyto_scvelo(
    X,
    X_embedding,
    V,
    indices,
    neg_cells_trick,
    xy_grid_nums,
    kernel="pearson",
    n_recurse_neighbors=2,
    max_neighs=None,
    transform="sqrt",
    use_neg_vals=True,
    correct_density=True,
):
    """utility function for calculating the transition matrix and low dimensional velocity embedding via the original
    pearson correlation kernel (La Manno et al., 2018) or the cosine kernel from scVelo (Bergen et al., 2019)."""
    n = X.shape[0]
    if indices is not None:
        rows = []
        cols = []
        vals = []

    delta_X = np.zeros((n, X_embedding.shape[1]))
    for i in LoggerManager.progress_logger(
        range(n),
        progress_name=f"calculating transition matrix via {kernel} kernel with {transform} transform.",
    ):
        velocity = V[i, :]  # project V to pca space

        if velocity.sum() != 0:
            i_vals = get_iterative_indices(indices, i, n_recurse_neighbors, max_neighs)  # np.zeros((knn, 1))
            diff = X[i_vals, :] - X[i, :]

            if transform == "log":
                diff_velocity = np.sign(velocity) * np.log1p(np.abs(velocity))
                diff_rho = np.sign(diff) * np.log1p(np.abs(diff))
            elif transform == "logratio":
                hi_dim, hi_dim_t = X[i, :], X[i, :] + velocity
                log2hidim = np.log1p(np.abs(hi_dim))
                diff_velocity = np.log1p(np.abs(hi_dim_t)) - log2hidim
                diff_rho = np.log1p(np.abs(X[i_vals, :])) - np.log1p(np.abs(hi_dim))
            elif transform == "linear":
                diff_velocity = velocity
                diff_rho = diff
            elif transform == "sqrt":
                diff_velocity = np.sign(velocity) * np.sqrt(np.abs(velocity))
                diff_rho = np.sign(diff) * np.sqrt(np.abs(diff))

            if kernel == "pearson":
                vals_ = einsum_correlation(diff_rho, diff_velocity, type="pearson")
            elif kernel == "cosine":
                vals_ = einsum_correlation(diff_rho, diff_velocity, type="cosine")

            rows.extend([i] * len(i_vals))
            cols.extend(i_vals)
            vals.extend(vals_)
    vals = np.hstack(vals)
    vals[np.isnan(vals)] = 0
    G = sp.csr_matrix((vals, (rows, cols)), shape=(X_embedding.shape[0], X_embedding.shape[0]))
    G = split_velocity_graph(G, neg_cells_trick)

    if neg_cells_trick:
        G, G_ = G

    confidence, ub_confidence = G.max(1).A.flatten(), np.percentile(G.max(1).A.flatten(), 98)
    dig_p = np.clip(ub_confidence - confidence, 0, 1)
    G.setdiag(dig_p)

    T = np.expm1(G / 0.1)

    if neg_cells_trick:
        if use_neg_vals:
            T -= np.expm1(-G_ / 0.1)
        else:
            T += np.expm1(G_ / 0.1)
            T.data = T.data + 1

    # T = w * (~ direct_neighs).multiply(T) + (1 - w) * direct_neighs.multiply(T)

    # normalize so that each row sums to 1
    sparsefuncs.inplace_row_scale(T, 1 / np.abs(T).sum(axis=1).A1)

    T.setdiag(0)
    T.eliminate_zeros()

    delta_X = projection_with_transition_matrix(n, T, X_embedding, correct_density)

    X_grid, V_grid, D = velocity_on_grid(
        X_embedding[:, :2],
        (X_embedding + delta_X)[:, :2],
        xy_grid_nums=xy_grid_nums,
    )

    return T, delta_X, X_grid, V_grid, D
Example #16
def normalize_per_cell(
    data: Union[AnnData, np.ndarray, spmatrix],
    counts_per_cell_after: Optional[float] = None,
    counts_per_cell: Optional[np.ndarray] = None,
    key_n_counts: str = 'n_counts',
    copy: bool = False,
    layers: Union[Literal['all'], Iterable[str]] = (),
    use_rep: Optional[Literal['after', 'X']] = None,
    min_counts: int = 1,
) -> Optional[AnnData]:
    """\
    Normalize total counts per cell.

    .. warning::
        .. deprecated:: 1.3.7
            Use :func:`~scanpy.pp.normalize_total` instead.
            The new function is equivalent to the present
            function, except that

            * the new function doesn't filter cells based on `min_counts`,
              use :func:`~scanpy.pp.filter_cells` if filtering is needed.
            * some arguments were renamed
            * `copy` is replaced by `inplace`

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell
        Precomputed counts per cell.
    key_n_counts
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.
    min_counts
        Cells with counts less than `min_counts` are filtered out during
        normalization.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original
    `adata.X`, depending on `copy`.

    Examples
    --------
    >>> import scanpy as sc
    >>> adata = AnnData(np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [  1.   3.  11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [ 3.  3.  3.]
    >>> sc.pp.normalize_per_cell(
    >>>     adata, counts_per_cell_after=1,
    >>>     key_n_counts='n_counts2',
    >>> )
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [ 1.  1.  1.]
    """
    if isinstance(data, AnnData):
        start = logg.info('normalizing by total count per cell')
        adata = data.copy() if copy else data
        if counts_per_cell is None:
            cell_subset, counts_per_cell = materialize_as_ndarray(
                filter_cells(adata.X, min_counts=min_counts))
            adata.obs[key_n_counts] = counts_per_cell
            adata._inplace_subset_obs(cell_subset)
            counts_per_cell = counts_per_cell[cell_subset]
        normalize_per_cell(adata.X, counts_per_cell_after, counts_per_cell)

        layers = adata.layers.keys() if layers == 'all' else layers
        if use_rep == 'after':
            after = counts_per_cell_after
        elif use_rep == 'X':
            after = np.median(counts_per_cell[cell_subset])
        elif use_rep is None:
            after = None
        else:
            raise ValueError('use_rep should be "after", "X" or None')
        for layer in layers:
            subset, counts = filter_cells(adata.layers[layer],
                                          min_counts=min_counts)
            temp = normalize_per_cell(adata.layers[layer],
                                      after,
                                      counts,
                                      copy=True)
            adata.layers[layer] = temp

        logg.info(
            '    finished ({time_passed}): normalized adata.X and added'
            f'    {key_n_counts!r}, counts per cell before normalization (adata.obs)',
            time=start,
        )
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        if copy == False:
            raise ValueError('Can only be run with copy=True')
        cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        counts_per_cell += counts_per_cell == 0
        counts_per_cell /= counts_per_cell_after
        if not issparse(X):
            X /= materialize_as_ndarray(counts_per_cell[:, np.newaxis])
        else:
            sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None
Example #17
def lambda_correction(
    adata: anndata.AnnData,
    lambda_key: str = "lambda",
    inplace: bool = True,
    copy: bool = False,
) -> Union[anndata.AnnData, None]:
    """Use lambda (cell-wise detection rate) to estimate the labelled RNA.

    Parameters
    ----------
        adata:
            adata object generated from dynast.
        lambda_key:
            The key to the cell-wise detection rate.
        inplace:
            Whether to update the layers in place. If False, new layers with '_corrected' appended to the existing
            names will be used to store the updated data.
        copy:
            Whether to copy the adata object or update adata object inplace.

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            A new or updated AnnData object, depending on the `copy` parameter, with detection-rate-corrected
            layers.
    """

    logger = LoggerManager.gen_logger("dynamo-lambda_correction")
    logger.log_time()

    adata = copy_adata(adata) if copy else adata

    logger.info("apply detection rate correction to adata...", indent_level=1)

    if lambda_key not in adata.obs.keys():
        raise ValueError(
            f"the lambda_key {lambda_key} is not included in adata.obs! Please ensure you have calculated "
            "per-cell detection rate!"
        )

    logger.info("retrieving the cell-wise detection rate..", indent_level=1)
    detection_rate = adata.obs[lambda_key].values[:, None]

    logger.info("identify the data type..", indent_level=1)
    all_layers = adata.layers.keys()

    has_ul = np.any(["ul_" in i for i in all_layers])
    has_un = np.any(["un_" in i for i in all_layers])
    has_sl = np.any(["sl_" in i for i in all_layers])
    has_sn = np.any(["sn_" in i for i in all_layers])

    has_l = np.any(["_l_" in i for i in all_layers])
    has_n = np.any(["_n_" in i for i in all_layers])

    if sum([has_ul, has_un, has_sl, has_sn]) == 4:
        datatype = "splicing_labeling"
    elif sum([has_l, has_n]):
        datatype = "labeling"

    logger.info(f"the data type identified is {datatype}", indent_level=2)

    logger.info("retrieve relevant layers for detection rate correction", indent_level=1)
    if datatype == "splicing_labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "ul_" in layer:
                layers += [layer]
                match_tot_layer += ["unspliced"]
            elif "un_" in layer:
                layers += [layer]
                match_tot_layer += ["unspliced"]
            elif "sl_" in layer:
                layers += [layer]
                match_tot_layer += ["spliced"]
            elif "sn_" in layer:
                layers += [layer]
                match_tot_layer += ["spliced"]
            elif "spliced" in layer:
                layers += [layer]
            elif "unspliced" in layer:
                layers += [layer]

        if len(layers) != 6:
            raise ValueError(
                "the adata object has to include ul, un, sl, sn, unspliced, spliced, "
                "six relevant layers for splicing and labeling quantified datasets."
            )
    elif datatype == "labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "_l_" in layer:
                layers += [layer]
                match_tot_layer += ["total"]
            elif "_n_" in layer:
                layers += [layer]
                match_tot_layer += ["total"]
            elif "total" in layer:
                layers += [layer]

        if len(layers) != 3:
            raise ValueError(
                "the adata object has to include labeled, unlabeled and total, three relevant layers for "
                "labeling quantified datasets."
            )

    logger.info("detection rate correction starts", indent_level=1)
    for i, layer in enumerate(main_tqdm(layers, desc="iterating all relevant layers")):
        if i < len(match_tot_layer):
            cur_layer = adata.layers[layer] if inplace else adata.layers[layer].copy()
            cur_total = adata.layers[match_tot_layer[i]]

            # even indices correspond to labeled RNA layers, odd indices to unlabeled RNA layers
            if i % 2 == 0:
                # formula: min(L / lambda, (L + U)) from scNT-seq
                if issparse(cur_layer):
                    sparsefuncs.inplace_row_scale(cur_layer, 1 / detection_rate)
                else:
                    cur_layer /= detection_rate
                if inplace:
                    adata.layers[layer] = sparse_mimmax(cur_layer, cur_total)
                else:
                    adata.layers[layer + "_corrected"] = sparse_mimmax(cur_layer, cur_total)

            else:
                if inplace:
                    adata.layers[layer] = cur_total - adata.layers[layers[i - 1]]
                else:
                    adata.layers[layer + "_corrected"] = cur_total - adata.layers[layers[i - 1]]

    logger.finish_progress(progress_name="lambda_correction")

    if copy:
        return adata
    return None
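The correction step inside the loop above divides each cell's labeled counts by that cell's detection rate and caps the result at the total counts (min(L / lambda, L + U)). A hedged, dense-for-brevity sketch with toy values (layer names and numbers are illustrative):

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

labeled = sp.csr_matrix(np.array([[2., 0., 1.],
                                  [0., 4., 0.]]))
total = np.array([[5., 1., 3.],
                  [2., 6., 1.]])                    # total = labeled + unlabeled
detection_rate = np.array([0.5, 0.8])               # per-cell lambda

corrected = labeled.copy()
sparsefuncs.inplace_row_scale(corrected, 1.0 / detection_rate)
corrected = np.minimum(corrected.toarray(), total)  # cap at total counts: min(L / lambda, L + U)
print(corrected)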
Example #18
 def inplace_row_scale(self, scale):
     sparsefuncs.inplace_row_scale(self.value, scale)
     return self
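The method above presumably lives on a thin wrapper that stores a sparse matrix in a .value attribute; a minimal hypothetical version, just for context:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import sparsefuncs

class SparseWrapper:
    def __init__(self, value):
        self.value = value  # a scipy CSR/CSC matrix

    def inplace_row_scale(self, scale):
        sparsefuncs.inplace_row_scale(self.value, scale)
        return self         # returning self allows chaining

w = SparseWrapper(sp.csr_matrix(np.eye(3)))
w.inplace_row_scale(np.array([1.0, 2.0, 3.0]))
print(w.value.toarray().diagonal())  # [1. 2. 3.]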
Example #19
def normalize_per_cell(data,
                       counts_per_cell_after=None,
                       counts_per_cell=None,
                       key_n_counts=None,
                       copy=False):
    """Normalize total counts per cell.

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data : :class:`~scanpy.api.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    copy : `bool`, optional (default: `False`)
        If an :class:`~scanpy.api.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized version of the original
    `adata.X`, depending on `copy`.

    Examples
    --------
    >>> adata = AnnData(
    >>>     data=np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [  1.   3.  11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [ 3.  3.  3.]
    >>> sc.pp.normalize_per_cell(adata, counts_per_cell_after=1,
    >>>                          key_n_counts='n_counts2')
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [ 1.  1.  1.]
    """
    if key_n_counts is None: key_n_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.msg('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = filter_cells(adata.X, min_counts=1)
        adata.obs[key_n_counts] = counts_per_cell
        adata._inplace_subset_obs(cell_subset)
        normalize_per_cell(adata.X,
                           counts_per_cell_after,
                           counts_per_cell=counts_per_cell[cell_subset])
        logg.msg('    finished', t=True, end=': ')
        logg.msg('normalized adata.X and added', no_indent=True)
        logg.msg(
            '    \'{}\', counts per cell before normalization (adata.obs)'.
            format(key_n_counts))
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        cell_subset, counts_per_cell = filter_cells(X, min_counts=1)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    counts_per_cell /= counts_per_cell_after
    if not issparse(X): X /= counts_per_cell[:, np.newaxis]
    else: sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None
Example #20
def SVRs(adata,
         filter_bool=None,
         layers='X',
         total_szfactor=None,
         min_expr_cells=2,
         min_expr_avg=0,
         max_expr_avg=20,
         svr_gamma=None,
         winsorize=False,
         winsor_perc=(1, 99.5),
         sort_inverse=False):
    """This function is modified from https://github.com/velocyto-team/velocyto.py/blob/master/velocyto/analysis.py

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            AnnData object.
        filter_bool: :class:`~numpy.ndarray` (default: None)
            A boolean array from the user to select cells for downstream analysis.
        layers: `str` (default: 'X')
            The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Default is X if there are no spliced layers.
        total_szfactor: `str` (default: `None`)
            The column name in the .obs attribute that corresponds to the size factor for the total mRNA.
        min_expr_cells: `int` (default: `2`)
            minimum number of cells that express that gene for it to be considered in the fit.
        min_expr_avg: `int` (default: `0`)
            The minimum average expression of a gene across cells required for it to be considered in the fit.
        max_expr_avg: `float` (default: `20`)
            The maximum average expression of a gene across cells; genes above this are treated as house-keeping/outliers and excluded.
        svr_gamma: `float` or None (default: `None`)
            the gamma hyper-parameter of the SVR.
        winsorize: `bool` (default: `False`)
            Whether to winsorize the data for the cv vs mean model.
        winsor_perc: `tuple` (default: `(1, 99.5)`)
            the upper and lower bounds of the winsorization.
        sort_inverse: `bool` (default: `False`)
            if True it sorts genes from less noisy to more noisy (to use for size estimation not for feature selection).

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            An updated AnnData object with `log_m`, `log_cv`, `score` added to .var columns and `SVR` added to the uns attribute
            as a new key.
    """
    from sklearn.svm import SVR

    layers = get_layer_keys(adata, layers)

    for layer in layers:
        if layer == 'raw':
            CM = adata.X.copy() if adata.raw is None else adata.raw
            szfactors = adata.obs[
                layer +
                '_Size_Factor'][:,
                                None] if adata.raw is not None else adata.obs[
                                    'Size_Factor'][:, None]
        elif layer == 'X':
            CM = adata.X.copy()
            szfactors = adata.obs['Size_Factor'][:, None]
        elif layer == 'protein':
            if 'protein' in adata.obsm_keys():
                CM = adata.obsm['protein'].copy()
                szfactors = adata.obs[layer + '_Size_Factor'][:, None]
            else:
                continue
        else:
            CM = adata.layers[layer].copy()
            szfactors = adata.obs[layer + '_Size_Factor'][:, None]

        if total_szfactor is not None and total_szfactor in adata.obs.keys():
            szfactors = adata.obs[total_szfactor][:, None]
        if issparse(CM):
            sparsefuncs.inplace_row_scale(CM, 1 / szfactors)
        else:
            CM /= szfactors

        if winsorize:
            if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01):
                min_expr_cells = int(
                    np.ceil((100 - winsor_perc[1]) * CM.shape[1] * 0.01)) + 2

        detected_bool = np.array(((CM > 0).sum(0) > min_expr_cells)
                                 & (CM.mean(0) < max_expr_avg)
                                 & (CM.mean(0) > min_expr_avg)).flatten()

        if filter_bool is not None:
            detected_bool = filter_bool & detected_bool

        valid_CM = CM[:, detected_bool]
        if winsorize:
            down, up = np.percentile(valid_CM.A, winsor_perc, 0) if issparse(
                valid_CM) else np.percentile(valid_CM, winsor_perc, 0)
            Sfw = np.clip(valid_CM.A, down[None, :], up[None, :]) if issparse(
                valid_CM) else np.clip(valid_CM, down[None, :], up[None, :])
            mu = Sfw.mean(0)
            sigma = Sfw.std(0, ddof=1)
        else:
            mu = np.array(valid_CM.mean(0)).flatten()
            sigma = np.array(
                np.sqrt(valid_CM.multiply(valid_CM).mean(0).A1 -
                        mu**2)).flatten() if issparse(
                            valid_CM) else valid_CM.std(0, ddof=1)

        cv = sigma / mu
        log_m = np.array(np.log2(mu)).flatten()
        log_cv = np.array(np.log2(cv)).flatten()

        if svr_gamma is None:
            svr_gamma = 150. / len(mu)
        # Fit the Support Vector Regression
        clf = SVR(gamma=svr_gamma)
        clf.fit(log_m[:, None], log_cv)
        fitted_fun = clf.predict
        ff = fitted_fun(log_m[:, None])
        score = log_cv - ff
        if sort_inverse:
            score = -score

        adata.var['log_m'], adata.var['log_cv'], adata.var[
            'score'] = np.nan, np.nan, -np.inf
        adata.var.loc[detected_bool, 'log_m'], adata.var.loc[
            detected_bool, 'log_cv'], adata.var.loc[
                detected_bool, 'score'] = np.array(log_m).flatten(), np.array(
                    log_cv).flatten(), np.array(score).flatten()

        key = "velocyto_SVR" if layer is 'raw' or layer is 'X' else layer + "_velocyto_SVR"
        adata.uns[key] = {"SVR": fitted_fun, "detected_bool": detected_bool}

    return adata
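The core of the SVR step above is a regression of log2 CV on log2 mean, with genes scored by their residual above the fitted trend. A condensed sketch on synthetic data (the gamma heuristic 150/len(mu) follows the function above; everything else is made up for illustration):

import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
mu = rng.gamma(2.0, 2.0, size=500) + 0.1                 # per-gene mean expression
cv = np.exp(rng.normal(0, 0.2, size=500)) / np.sqrt(mu)  # noisy mean-CV trend
log_m, log_cv = np.log2(mu), np.log2(cv)

clf = SVR(gamma=150.0 / len(mu))
clf.fit(log_m[:, None], log_cv)
score = log_cv - clf.predict(log_m[:, None])             # residual above the trend
print(np.argsort(score)[::-1][:10])                      # most over-dispersed genes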
Example #21
def normalize_per_cell(
    data,
    counts_per_cell_after=None,
    counts_per_cell=None,
    key_n_counts=None,
    max_proportion_per_cell=None,
    use_initial_size=True,
    layers=None,
    enforce=None,
    copy=False,
):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    max_proportion_per_cell : `float` or `None` (default: `None`)
        Exclude gene counts that account for more than
        a specific proportion of cell size, e.g. 0.05.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized counts.
    """

    adata = data.copy() if copy else data
    if layers is None:
        layers = ["spliced", "unspliced"]
    elif layers == "all":
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        layers = [layers]
    layers = ["X"
              ] + [layer for layer in layers if layer in adata.layers.keys()]
    modified_layers = []

    if isinstance(counts_per_cell, str):
        if counts_per_cell not in adata.obs.keys():
            _set_initial_size(adata, layers)
        counts_per_cell = (adata.obs[counts_per_cell].values
                           if counts_per_cell in adata.obs.keys() else None)

    for layer in layers:
        check_if_valid_dtype(adata, layer)
        X = adata.X if layer == "X" else adata.layers[layer]

        if not_yet_normalized(X) or enforce:
            counts = (counts_per_cell if counts_per_cell is not None else
                      _get_initial_size(adata, layer)
                      if use_initial_size else _get_size(adata, layer))
            if max_proportion_per_cell is not None and (
                    0 < max_proportion_per_cell < 1):
                counts = counts_per_cell_quantile(X, max_proportion_per_cell,
                                                  counts)
            # equivalent to sc.pp.normalize_per_cell(X, counts_per_cell_after, counts)
            counts_after = (np.median(counts) if counts_per_cell_after is None
                            else counts_per_cell_after)

            counts_after += counts_after == 0
            counts = counts / counts_after
            counts += counts == 0  # to avoid division by zero

            if issparse(X):
                sparsefuncs.inplace_row_scale(X, 1 / counts)
            else:
                X /= np.array(counts[:, None])
            modified_layers.append(layer)
            if (layer == "X" and "gene_count_corr" not in adata.var.keys()
                    and X.shape[-1] > 3e3):
                try:
                    adata.var["gene_count_corr"] = np.round(
                        csr_vcorrcoef(X.T, np.ravel((X > 0).sum(1))), 4)
                except Exception:
                    pass
        else:
            logg.warn(
                f"Did not normalize {layer} as it looks processed already. "
                "To enforce normalization, set `enforce=True`.")

    adata.obs["n_counts"
              if key_n_counts is None else key_n_counts] = _get_size(adata)
    if len(modified_layers) > 0:
        logg.info("Normalized count data:", f"{', '.join(modified_layers)}.")

    return adata if copy else None
Example #22
def normalize_per_cell(data,
                       counts_per_cell_after=None,
                       counts_per_cell=None,
                       key_n_counts=None,
                       copy=False,
                       layers=[],
                       use_rep=None,
                       min_counts=1):
    """Normalize total counts per cell.

    Normalize each cell by total counts over all genes, so that every cell has
    the same total count after normalization.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.
    min_counts : `int`, optional (default: 1)
        Cells with counts less than `min_counts` are filtered out during
        normalization.

    Returns
    -------
    AnnData, `None`
        Returns or updates `adata` with normalized version of the original
        `adata.X`, depending on `copy`.

    Examples
    --------
    >>> adata = AnnData(
    >>>     data=np.array([[1, 0], [3, 0], [5, 6]]))
    >>> print(adata.X.sum(axis=1))
    [  1.   3.  11.]
    >>> sc.pp.normalize_per_cell(adata)
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts
    0       1.0
    1       3.0
    2      11.0
    [ 3.  3.  3.]
    >>> sc.pp.normalize_per_cell(adata, counts_per_cell_after=1,
    >>>                          key_n_counts='n_counts2')
    >>> print(adata.obs)
    >>> print(adata.X.sum(axis=1))
       n_counts  n_counts2
    0       1.0        3.0
    1       3.0        3.0
    2      11.0        3.0
    [ 1.  1.  1.]
    """
    if key_n_counts is None: key_n_counts = 'n_counts'
    if isinstance(data, AnnData):
        logg.msg('normalizing by total count per cell', r=True)
        adata = data.copy() if copy else data
        cell_subset, counts_per_cell = materialize_as_ndarray(
            filter_cells(adata.X, min_counts=min_counts))
        adata.obs[key_n_counts] = counts_per_cell
        adata._inplace_subset_obs(cell_subset)
        normalize_per_cell(adata.X,
                           counts_per_cell_after,
                           counts_per_cell=counts_per_cell[cell_subset])

        layers = adata.layers.keys() if layers == 'all' else layers
        if use_rep == 'after':
            after = counts_per_cell_after
        elif use_rep == 'X':
            after = np.median(counts_per_cell[cell_subset])
        elif use_rep is None:
            after = None
        else:
            raise ValueError('use_rep should be "after", "X" or None')
        for layer in layers:
            subset, counts = filter_cells(adata.layers[layer],
                                          min_counts=min_counts)
            temp = normalize_per_cell(adata.layers[layer],
                                      after,
                                      counts,
                                      copy=True)
            adata.layers[layer] = temp

        logg.msg('    finished', t=True, end=': ')
        logg.msg('normalized adata.X and added', no_indent=True)
        logg.msg(
            '    \'{}\', counts per cell before normalization (adata.obs)'.
            format(key_n_counts))
        return adata if copy else None
    # proceed with data matrix
    X = data.copy() if copy else data
    if counts_per_cell is None:
        if copy == False:
            raise ValueError('Can only be run with copy=True')
        cell_subset, counts_per_cell = filter_cells(X, min_counts=min_counts)
        X = X[cell_subset]
        counts_per_cell = counts_per_cell[cell_subset]
    if counts_per_cell_after is None:
        counts_per_cell_after = np.median(counts_per_cell)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        counts_per_cell += counts_per_cell == 0
        counts_per_cell /= counts_per_cell_after
        if not issparse(X):
            X /= materialize_as_ndarray(counts_per_cell[:, np.newaxis])
        else:
            sparsefuncs.inplace_row_scale(X, 1 / counts_per_cell)
    return X if copy else None