Ejemplo n.º 1
0
def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    The ``weights`` of the projection where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas cell for projecting the metacell, such that the sum of weights in each row
    is one.

    **Returns**

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``projection``
            The number of UMIs of each gene in the projected image of the query to the metacell, if the total number of
            UMIs in the projection is equal to the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas and the query based on the total UMIs, unless ``atlas_total_umis``
       and/or ``query_total_umis`` are specified.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMIs count based on the total UMIs of each metacell. Note that if overriding the total
       atlas or query UMIs, this means that the result need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
Ejemplo n.º 2
0
def downsample_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    downsample_min_cell_quantile: float = pr.downsample_min_cell_quantile,
    downsample_min_samples: float = pr.downsample_min_samples,
    downsample_max_cell_quantile: float = pr.downsample_max_cell_quantile,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Downsample the values of ``what`` (default: {what}) data.

    Downsampling is an effective way to get the same number of samples in multiple cells
    (that is, the same number of total UMIs in multiple cells), and serves as an alternative to
    normalization (e.g., working with UMI fractions instead of raw UMI counts).

    Downsampling is especially important when computing correlations between cells. When there is
    high variance between the total UMI count in different cells, then normalization will return
    higher correlation values between cells with a higher total UMI count, which will result in an
    inflated estimation of their similarity to other cells. Downsampling avoids this effect.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable-Observation (Gene-Cell) Annotations
        ``downsampled``
            The downsampled data where the total number of samples in each cell is at most
            ``samples``.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas data frame (indexed by the cell and gene
    names).

    **Computation Parameters**

    1. Compute the total samples in each cell.

    2. Decide on the value to downsample to. We would like all cells to end up with at least some
       reasonable number of samples (total UMIs) ``downsample_min_samples`` (default:
       {downsample_min_samples}). We'd also like all (most) cells to end up with the highest
       reasonable downsampled total number of samples, so if possible we increase the number of
       samples, as long as at most ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}) cells will have lower number of samples. We'd also like all
       (most) cells to end up with the same downsampled total number of samples, so if we have to we
       decrease the number of samples to ensure at most ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) cells will have a lower number of samples.

    3. Downsample each cell so that it has at most the selected number of samples. Use the
       ``random_seed`` to allow making this replicable.
    """
    total_per_cell = ut.get_o_numpy(adata, what, sum=True)

    samples = int(
        round(
            min(
                max(downsample_min_samples,
                    np.quantile(total_per_cell, downsample_min_cell_quantile)),
                np.quantile(total_per_cell, downsample_max_cell_quantile),
            )))

    ut.log_calc("samples", samples)

    data = ut.get_vo_proper(adata, what, layout="row_major")
    assert ut.shaped_dtype(data) == "float32"
    downsampled = ut.downsample_matrix(data,
                                       per="row",
                                       samples=samples,
                                       random_seed=random_seed)
    if inplace:
        ut.set_vo_data(adata, "downsampled", downsampled)
        return None

    return ut.to_pandas_frame(downsampled,
                              index=adata.obs_names,
                              columns=adata.var_names)
Ejemplo n.º 3
0
def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes.

    This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressive filtered a
    significant amount of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``X`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional
    variable (gene) called ``ATLASNORM`` to the query data, such that the total number of UMIs for each query metacells
    is as expected given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the
    query and the atlas have exactly the same list of genes, or if if the query already contains a high number of genes
    missing from the atlas so that the total number of UMIs for the query metacells is already at least the expected
    based on the common genes.

    **Computation Parameters**

    1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs)
       would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene)
       observation, add the value specified in ``var_annotations``, whose list of keys must cover the set of
       per-variable annotations in the query data. For each per-observation-per-variable layer, add the value specified
       in ``layers``, whose list of keys must cover the existing layers. For each per-variable-per-variable annotation,
       add the value specified in ``varp_annotations``.
    """
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations.keys():
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers.keys():
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations.keys():
            raise RuntimeError(f"missing default value for variable-variable {name}")

    if list(qdata.var_names) == list(adata.var_names):
        return None

    query_genes_list = list(qdata.var_names)
    atlas_genes_list = list(adata.var_names)
    common_genes_list = list(sorted(set(qdata.var_names) & set(adata.var_names)))
    query_gene_indices = np.array([query_genes_list.index(gene) for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_genes_list.index(gene) for gene in common_genes_list])
    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")

    assert list(common_qdata.var_names) == list(common_adata.var_names)

    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)

    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)

    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vo_data(ndata, name, added)

    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)
        values = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, values[:, np.newaxis]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vv_data(ndata, name, added)

    return ndata
Ejemplo n.º 4
0
def compute_distinct_folds(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    normalization: float = 0,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Compute for each observation (cell) and each variable (gene) how much is the ``what`` (default:
    {what}) value different from the overall population.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Per-Observation-Per-Variable (Cell-Gene) Annotations:
        ``distinct_ratio``
            For each gene in each cell, the log based 2 of the ratio between the fraction of the
            gene in the cell and the fraction of the gene in the overall population (sum of cells).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas frame (indexed by the observation and
    distinct gene rank).

    **Computation Parameters**

    1. Compute, for each gene, the fraction of the gene's values out of the total sum of the values
       (that is, the mean fraction of the gene's expression in the population).

    2. Compute, for each cell, for each gene, the fraction of the gene's value out of the sum
       of the values in the cell (that is, the fraction of the gene's expression in the cell).

    3. Divide the two to the distinct ratio (that is, how much the gene's expression in the cell is
       different from the overall population), first adding the ``normalization`` (default:
       {normalization}) to both.

    4. Compute the log (base 2) of the result and use it as the fold factor.
    """
    columns_data = ut.get_vo_proper(adata, what, layout="column_major")
    fractions_of_genes_in_data = ut.fraction_per(columns_data, per="column")
    fractions_of_genes_in_data += normalization

    total_umis_of_cells = ut.get_o_numpy(adata, what, sum=True)
    total_umis_of_cells[total_umis_of_cells == 0] = 1

    rows_data = ut.get_vo_proper(adata, what, layout="row_major")
    fraction_of_genes_in_cells = ut.to_numpy_matrix(
        rows_data) / total_umis_of_cells[:, np.newaxis]
    fraction_of_genes_in_cells += normalization

    zeros_mask = fractions_of_genes_in_data <= 0
    fractions_of_genes_in_data[zeros_mask] = -1
    fraction_of_genes_in_cells[:, zeros_mask] = -1

    ratio_of_genes_in_cells = fraction_of_genes_in_cells
    ratio_of_genes_in_cells /= fractions_of_genes_in_data
    assert np.min(np.min(ratio_of_genes_in_cells)) > 0

    fold_of_genes_in_cells = np.log2(ratio_of_genes_in_cells,
                                     out=ratio_of_genes_in_cells)

    if inplace:
        ut.set_vo_data(adata, "distinct_fold", fold_of_genes_in_cells)
        return None

    return ut.to_pandas_frame(fold_of_genes_in_cells,
                              index=adata.obs_names,
                              columns=adata.var_names)
Ejemplo n.º 5
0
def compute_deviant_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: Union[str, ut.Vector] = "similar",
    significant_gene_fold_factor: float = pr.significant_gene_fold_factor,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells) or, if an outlier, to the most
    similar groups, compute for each observation and gene the fold factor relative to its group
    for the purpose of detecting deviant cells.

    Ideally, all grouped cells would have no genes with high enough fold factors to be considered deviants, and all
    outlier cells would. In practice grouped cells might have a (few) such genes to the restriction on the fraction
    of deviants.

    It is important not to read too much into the results for a single cell, but looking at which genes appear for cell
    populations (e.g., cells with specific metadata such as batch identification) might be instructive.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``deviant_fold``
            The fold factor between the cell's UMIs and the expected number of UMIs for the purpose of computing
            deviant cells.

    **Computation Parameters**

    1. For each cell, compute the expected UMIs for each gene given the fraction of the gene in the metacells associated
       with the cell (the one it is belongs to, or the most similar one for outliers). If this is less than
       ``significant_gene_fold_factor`` (default: {significant_gene_fold_factor}), set it to zero so the result will be
       sparse.
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    similar_of_cells = ut.get_o_numpy(adata,
                                      similar,
                                      formatter=ut.groups_description)

    @ut.timed_call("compute_cell_deviant_certificates")
    def _compute_cell_deviant_certificates(
            cell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _compute_cell_certificates(
            cell_index=cell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            similar_of_cells=similar_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
            significant_gene_fold_factor=significant_gene_fold_factor,
        )

    results = list(
        ut.parallel_map(_compute_cell_deviant_certificates, adata.n_obs))

    cell_indices = np.concatenate([
        np.full(len(result[0]), cell_index, dtype="int32")
        for cell_index, result in enumerate(results)
    ])
    gene_indices = np.concatenate([result[0] for result in results])
    fold_factors = np.concatenate([result[1] for result in results])

    deviant_folds = sparse.csr_matrix(
        (fold_factors, (cell_indices, gene_indices)), shape=adata.shape)
    ut.set_vo_data(adata, "deviant_folds", deviant_folds)
Ejemplo n.º 6
0
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    This computes, for each metacell of the query, the fold factors between the actual query UMIs and the UMIs of the
    projection of the metacell onto the atlas (see :py:func:`metacells.tools.project.project_query_onto_atlas`). The
    result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, more genes need to
    be ignored by the projection, or somehow corrected for batch effects prior to computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, the ``projected`` UMIs of each query metacells onto the atlas.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and its projection (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor
       log2((actual UMIs + ``fold_normalization``) / (expected UMIs + ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default ``fold_normalization`` is
       {fold_normalization}).

    2. Set the fold factor to zero for every case where the total UMIs in the query metacell and the projected image is
       not at least ``min_significant_gene_value`` (default: {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_fold_factor`` (default:
       {min_gene_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_fold_factor`` (default:
       {min_entry_fold_factor}), set the fold factor to zero (too low to be of interest). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    metacells_data = ut.get_vo_proper(adata, what, layout="row_major")
    projected_data = ut.get_vo_proper(adata, projected, layout="row_major")

    metacells_fractions = ut.fraction_by(metacells_data,
                                         by="row",
                                         sums=total_umis)
    projected_fractions = ut.fraction_by(projected_data,
                                         by="row",
                                         sums=total_umis)

    metacells_fractions += fold_normalization  # type: ignore
    projected_fractions += fold_normalization  # type: ignore

    dense_folds = metacells_fractions / projected_fractions  # type: ignore
    dense_folds = np.log2(dense_folds, out=dense_folds)

    total_umis = ut.to_numpy_matrix(metacells_data +
                                    projected_data)  # type: ignore
    insignificant_folds_mask = total_umis < min_significant_gene_value
    ut.log_calc("insignificant entries", insignificant_folds_mask)
    dense_folds[insignificant_folds_mask] = 0.0

    significant_folds = significant_folds_matrix(dense_folds,
                                                 min_gene_fold_factor,
                                                 min_entry_fold_factor,
                                                 abs_folds)
    ut.set_vo_data(adata, "projected_fold", significant_folds)
Ejemplo n.º 7
0
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: float = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within in each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contains "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless the
            value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor`` (default:
       {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(
            metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    results = list(
        ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row,
                                               "column_major")
    if inner_abs_folds:
        comparable_dense_inner_folds_by_column = np.abs(
            dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column
    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column,
                                   per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column <
                                min_entry_inner_fold_factor] = 0
    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column,
                                            layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
Ejemplo n.º 8
0
def compute_inner_normalized_variance(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    compatible_size: Optional[str] = None,
    downsample_min_samples: int = pr.downsample_min_samples,
    downsample_min_cell_quantile: float = pr.downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.downsample_max_cell_quantile,
    min_gene_total: int = pr.quality_min_gene_total,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    random_seed: int = pr.random_seed,
) -> None:
    """
    Compute the inner normalized variance (variance / mean) for each gene in each group.

    This is also known as the "index of dispersion" and can serve as a quality measure for
    the groups. An ideal group would contain only cells with "the same" biological state
    and all remaining inner variance would be due to technical sampling noise.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_variance``
            For each gene and group, the variance of the gene in the group.

        ``inner_normalized_variance``
            For each gene and group, the normalized variance (variance over mean) of the
            gene in the group.

    **Computation Parameters**

    For each group (metacell):

    1. If ``compatible_size`` (default: {compatible_size}) is specified, it should be an
       integer per-observation annotation of the groups, whose value is at most the
       number of grouped cells in the group. Pick a random subset of the cells of
       this size. If ``compatible_size`` is ``None``, use all the cells of the group.

    2. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the surviving
       cells to the same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    3. Compute the normalized variance of each gene based on the downsampled data. Set the
       result to ``nan`` for genes with less than ``min_gene_total`` (default: {min_gene_total}).
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")

    if compatible_size is not None:
        compatible_size_of_groups: Optional[ut.NumpyVector] = ut.get_o_numpy(
            gdata, compatible_size, formatter=ut.sizes_description)
    else:
        compatible_size_of_groups = None

    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)

    groups_count = np.max(group_of_cells) + 1
    assert groups_count > 0

    assert gdata.n_obs == groups_count
    variance_per_gene_per_group = np.full(gdata.shape, None, dtype="float32")
    normalized_variance_per_gene_per_group = np.full(gdata.shape,
                                                     None,
                                                     dtype="float32")

    for group_index in range(groups_count):
        with ut.log_step(
                "- group",
                group_index,
                formatter=lambda group_index: ut.progress_description(
                    groups_count, group_index, "group"),
        ):
            if compatible_size_of_groups is not None:
                compatible_size_of_group = compatible_size_of_groups[
                    group_index]
            else:
                compatible_size_of_group = None

            _collect_group_data(
                group_index,
                group_of_cells=group_of_cells,
                cells_data=cells_data,
                compatible_size=compatible_size_of_group,
                downsample_min_samples=downsample_min_samples,
                downsample_min_cell_quantile=downsample_min_cell_quantile,
                downsample_max_cell_quantile=downsample_max_cell_quantile,
                min_gene_total=min_gene_total,
                random_seed=random_seed,
                variance_per_gene_per_group=variance_per_gene_per_group,
                normalized_variance_per_gene_per_group=
                normalized_variance_per_gene_per_group,
            )

    ut.set_vo_data(gdata, "inner_variance", variance_per_gene_per_group)
    ut.set_vo_data(gdata, "inner_normalized_variance",
                   normalized_variance_per_gene_per_group)