Esempio n. 1
0
 def test_filter_genes(self, adata, adata_dist):
     filter_genes(adata_dist, min_cells=2)
     result = materialize_as_ndarray(adata_dist.X)
     filter_genes(adata, min_cells=2)
     assert result.shape == adata.shape
     assert result.shape == (adata.n_obs, adata.n_vars)
     npt.assert_allclose(result, adata.X)
Esempio n. 2
0
 def test_normalize_per_cell(self, adata, adata_dist):
     normalize_per_cell(adata_dist)
     result = materialize_as_ndarray(adata_dist.X)
     normalize_per_cell(adata)
     assert result.shape == adata.shape
     assert result.shape == (adata.n_obs, adata.n_vars)
     npt.assert_allclose(result, adata.X)
Esempio n. 3
0
 def test_log1p(self, adata, adata_dist):
     log1p(adata_dist)
     result = materialize_as_ndarray(adata_dist.X)
     log1p(adata)
     assert result.shape == adata.shape
     assert result.shape == (adata.n_obs, adata.n_vars)
     npt.assert_allclose(result, adata.X)
Esempio n. 4
0
 def test_scale(self, adata, adata_dist):
     if isinstance(adata_dist.X, da.Array):
         return  # fails for dask
     scale(adata_dist)
     result = materialize_as_ndarray(adata_dist.X)
     scale(adata)
     assert result.shape == adata.shape
     assert result.shape == (adata.n_obs, adata.n_vars)
     npt.assert_allclose(result, adata.X)
Esempio n. 5
0
 def test_recipe_zheng17(self, adata, adata_dist):
     recipe_zheng17(adata_dist, n_top_genes=100)
     result = materialize_as_ndarray(adata_dist.X)
     recipe_zheng17(adata, n_top_genes=100)
     assert result.shape == adata.shape
     assert result.shape == (adata.n_obs, adata.n_vars)
     # Note the low tolerance required to get this to pass.
     # Not sure why results diverge so much. (Seems to be scaling again.)
     # Find the element that differs the most with
     # import numpy
     # am = (numpy.absolute(result - adata.X)/ numpy.absolute(adata.X)).argmax()
     # ind = numpy.unravel_index(am, result.shape)
     # print(result[ind], adata.X[ind])
     npt.assert_allclose(result, adata.X, 1e-1)
def _highly_variable_pearson_residuals(
    adata: AnnData,
    theta: float = 100,
    clip: Optional[float] = None,
    n_top_genes: int = 1000,
    batch_key: Optional[str] = None,
    chunksize: int = 1000,
    check_values: bool = True,
    layer: Optional[str] = None,
    subset: bool = False,
    inplace: bool = True,
) -> Optional[pd.DataFrame]:
    """\
    See `scanpy.experimental.pp.highly_variable_genes`.

    Returns
    -------
    If `inplace=True`, `adata.var` is updated with the following fields. Otherwise,
    returns the same fields as :class:`~pandas.DataFrame`.

    highly_variable : bool
        boolean indicator of highly-variable genes
    means : float
        means per gene
    variances : float
        variance per gene
    residual_variances : float
        Residual variance per gene. Averaged in the case of multiple batches.
    highly_variable_rank : float
        Rank of the gene according to residual variance, median rank in the case of multiple batches
    highly_variable_nbatches : int
        If `batch_key` given, denotes in how many batches genes are detected as HVG
    highly_variable_intersection : bool
        If `batch_key` given, denotes the genes that are highly variable in all batches
    """

    view_to_actual(adata)
    X = _get_obs_rep(adata, layer=layer)
    computed_on = layer if layer else 'adata.X'

    # Check for raw counts
    if check_values and (check_nonnegative_integers(X) is False):
        warnings.warn(
            "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
            UserWarning,
        )
    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # prepare clipping

    if batch_key is None:
        batch_info = np.zeros(adata.shape[0], dtype=int)
    else:
        batch_info = adata.obs[batch_key].values
    n_batches = len(np.unique(batch_info))

    # Get pearson residuals for each batch separately
    residual_gene_vars = []
    for batch in np.unique(batch_info):

        adata_subset_prefilter = adata[batch_info == batch]
        X_batch_prefilter = _get_obs_rep(adata_subset_prefilter, layer=layer)

        # Filter out zero genes
        with settings.verbosity.override(Verbosity.error):
            nonzero_genes = np.ravel(X_batch_prefilter.sum(axis=0)) != 0
        adata_subset = adata_subset_prefilter[:, nonzero_genes]
        X_batch = _get_obs_rep(adata_subset, layer=layer)

        # Prepare clipping
        if clip is None:
            n = X_batch.shape[0]
            clip = np.sqrt(n)
        if clip < 0:
            raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

        if sp_sparse.issparse(X_batch):
            sums_genes = np.sum(X_batch, axis=0)
            sums_cells = np.sum(X_batch, axis=1)
            sum_total = np.sum(sums_genes).squeeze()
        else:
            sums_genes = np.sum(X_batch, axis=0, keepdims=True)
            sums_cells = np.sum(X_batch, axis=1, keepdims=True)
            sum_total = np.sum(sums_genes)

        # Compute pearson residuals in chunks
        residual_gene_var = np.empty((X_batch.shape[1]))
        for start in np.arange(0, X_batch.shape[1], chunksize):
            stop = start + chunksize
            mu = np.array(sums_cells @ sums_genes[:, start:stop] / sum_total)
            X_dense = X_batch[:, start:stop].toarray()
            residuals = (X_dense - mu) / np.sqrt(mu + mu**2 / theta)
            residuals = np.clip(residuals, a_min=-clip, a_max=clip)
            residual_gene_var[start:stop] = np.var(residuals, axis=0)

        # Add 0 values for genes that were filtered out
        unmasked_residual_gene_var = np.zeros(len(nonzero_genes))
        unmasked_residual_gene_var[nonzero_genes] = residual_gene_var
        residual_gene_vars.append(unmasked_residual_gene_var.reshape(1, -1))

    residual_gene_vars = np.concatenate(residual_gene_vars, axis=0)

    # Get rank per gene within each batch
    # argsort twice gives ranks, small rank means most variable
    ranks_residual_var = np.argsort(np.argsort(-residual_gene_vars, axis=1), axis=1)
    ranks_residual_var = ranks_residual_var.astype(np.float32)
    # count in how many batches a genes was among the n_top_genes
    highly_variable_nbatches = np.sum(
        (ranks_residual_var < n_top_genes).astype(int), axis=0
    )
    # set non-top genes within each batch to nan
    ranks_residual_var[ranks_residual_var >= n_top_genes] = np.nan
    ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
    # Median rank across batches, ignoring batches in which gene was not selected
    medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)

    means, variances = materialize_as_ndarray(_get_mean_var(X))
    df = pd.DataFrame.from_dict(
        dict(
            means=means,
            variances=variances,
            residual_variances=np.mean(residual_gene_vars, axis=0),
            highly_variable_rank=medianrank_residual_var,
            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
            highly_variable_intersection=highly_variable_nbatches == n_batches,
        )
    )
    df = df.set_index(adata.var_names)

    # Sort genes by how often they selected as hvg within each batch and
    # break ties with median rank of residual variance across batches
    df.sort_values(
        ['highly_variable_nbatches', 'highly_variable_rank'],
        ascending=[False, True],
        na_position='last',
        inplace=True,
    )

    high_var = np.zeros(df.shape[0], dtype=bool)
    high_var[:n_top_genes] = True
    df['highly_variable'] = high_var
    df = df.loc[adata.var_names, :]

    if inplace:
        adata.uns['hvg'] = {'flavor': 'pearson_residuals', 'computed_on': computed_on}
        logg.hint(
            'added\n'
            '    \'highly_variable\', boolean vector (adata.var)\n'
            '    \'highly_variable_rank\', float vector (adata.var)\n'
            '    \'highly_variable_nbatches\', int vector (adata.var)\n'
            '    \'highly_variable_intersection\', boolean vector (adata.var)\n'
            '    \'means\', float vector (adata.var)\n'
            '    \'variances\', float vector (adata.var)\n'
            '    \'residual_variances\', float vector (adata.var)'
        )
        adata.var['means'] = df['means'].values
        adata.var['variances'] = df['variances'].values
        adata.var['residual_variances'] = df['residual_variances']
        adata.var['highly_variable_rank'] = df['highly_variable_rank'].values
        if batch_key is not None:
            adata.var['highly_variable_nbatches'] = df[
                'highly_variable_nbatches'
            ].values
            adata.var['highly_variable_intersection'] = df[
                'highly_variable_intersection'
            ].values
        adata.var['highly_variable'] = df['highly_variable'].values

        if subset:
            adata._inplace_subset_var(df['highly_variable'].values)

    else:
        if batch_key is None:
            df = df.drop(
                ['highly_variable_nbatches', 'highly_variable_intersection'], axis=1
            )
        if subset:
            df = df.iloc[df.highly_variable.values, :]

        return df