Ejemplo n.º 1
0
def _project_single_metacell(
    *,
    query_metacell_index: int,
    atlas_umis: ut.Matrix,
    query_atlas_corr: ut.NumpyMatrix,
    atlas_project_data: ut.NumpyMatrix,
    query_project_data: ut.NumpyMatrix,
    atlas_log_fractions: ut.NumpyMatrix,
    candidates_count: int,
    min_significant_gene_value: float,
    min_usage_weight: float,
    max_consistency_fold_factor: float,
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    """
    Project a single query metacell onto a few mutually consistent atlas metacells.

    The atlas metacell with the highest ``query_atlas_corr`` value for the query metacell is the
    "anchor". The other atlas metacells are scanned in decreasing correlation order; one is kept as
    a candidate if the maximal absolute difference between its ``atlas_log_fractions`` and the
    anchor's, over the genes whose combined ``atlas_umis`` (candidate plus anchor) is at least
    ``min_significant_gene_value``, is at most half of ``max_consistency_fold_factor``. The scan
    stops once ``candidates_count`` candidates (including the anchor) were collected or the atlas is
    exhausted.

    The query metacell's row of ``query_project_data`` is then represented using the candidates'
    rows of ``atlas_project_data`` (via ``ut.represent``). Weights below ``min_usage_weight`` are
    zeroed and the remaining weights are renormalized to sum to one.

    Returns a tuple of the (``int32``) indices of the atlas metacells that were given a positive
    weight, and their matching (``float32``) weights.
    """
    query_metacell_project_data = query_project_data[query_metacell_index, :]
    query_metacell_atlas_correlations = query_atlas_corr[query_metacell_index, :]
    # Atlas metacell indices, from the most to the least correlated with the query metacell.
    query_metacell_atlas_order = np.argsort(-query_metacell_atlas_correlations)

    atlas_anchor_index = query_metacell_atlas_order[0]
    ut.log_calc("atlas_anchor_index", atlas_anchor_index)
    atlas_anchor_log_fractions = atlas_log_fractions[atlas_anchor_index, :]
    atlas_anchor_umis = ut.to_numpy_vector(atlas_umis[atlas_anchor_index, :])

    atlas_candidate_indices_list = [atlas_anchor_index]
    position = 1
    while len(atlas_candidate_indices_list) < candidates_count and position < len(query_metacell_atlas_order):
        atlas_metacell_index = query_metacell_atlas_order[position]
        position += 1
        atlas_metacell_log_fractions = atlas_log_fractions[atlas_metacell_index, :]
        atlas_metacell_consistency_fold_factors = np.abs(atlas_metacell_log_fractions - atlas_anchor_log_fractions)
        atlas_metacell_umis = ut.to_numpy_vector(atlas_umis[atlas_metacell_index, :])
        # Only genes with enough combined UMIs are trusted when judging consistency.
        atlas_metacell_significant_genes_mask = atlas_metacell_umis + atlas_anchor_umis >= min_significant_gene_value
        atlas_metacell_consistency = np.max(
            atlas_metacell_consistency_fold_factors[atlas_metacell_significant_genes_mask]
        )
        # Each candidate is within half the factor of the anchor, so (by the triangle inequality)
        # any two candidates are within the full factor of each other.
        if atlas_metacell_consistency <= max_consistency_fold_factor / 2.0:
            atlas_candidate_indices_list.append(atlas_metacell_index)

    atlas_candidate_indices = np.array(sorted(atlas_candidate_indices_list))
    atlas_candidates_project_data = atlas_project_data[atlas_candidate_indices, :]

    represent_result = ut.represent(query_metacell_project_data, atlas_candidates_project_data)
    assert represent_result is not None
    atlas_candidate_weights = represent_result[1]
    atlas_candidate_weights[atlas_candidate_weights < min_usage_weight] = 0
    # Renormalize *all* the weights (not just the entries zeroed above, which would be a no-op),
    # so that the surviving weights sum to one again.
    total_weight = np.sum(atlas_candidate_weights)
    if total_weight > 0:
        atlas_candidate_weights /= total_weight

    atlas_used_mask = atlas_candidate_weights > 0

    atlas_used_indices = atlas_candidate_indices[atlas_used_mask].astype("int32")
    ut.log_return("atlas_used_indices", atlas_used_indices)

    atlas_used_weights = atlas_candidate_weights[atlas_used_mask]
    atlas_used_weights = atlas_used_weights.astype("float32")
    ut.log_return("atlas_used_weights", atlas_used_weights)

    return (atlas_used_indices, atlas_used_weights)
Ejemplo n.º 2
0
def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high relative variance of ``what`` (default: {what}) data.

    The relative variance compares the variance / mean of each gene to that of the other genes
    with a similar level of expression. See
    :py:func:`metacells.utilities.computation.relative_variance_per` for details.

    Genes with a high relative variance are good candidates for being selected as "feature genes",
    that is, for being used to compute the similarity between cells. Using the relative variance
    compensates for the bias towards selecting higher-expression genes, whose normalized variance
    can be larger due to random noise alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per` with the
       ``window_size`` (default: {window_size}) to get the relative variance of each gene.

    2. Select the genes whose relative variance is at least
       ``min_gene_relative_variance`` (default: {min_gene_relative_variance}).
    """
    per_gene_relative_variance = ut.relative_variance_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
        window_size=window_size,
    )
    is_high_relative_variance = per_gene_relative_variance >= min_gene_relative_variance

    if not inplace:
        ut.log_return("high_relative_variance_genes", is_high_relative_variance)
        return ut.to_pandas_series(is_high_relative_variance, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", is_high_relative_variance)
    return None
Ejemplo n.º 3
0
def find_top_feature_genes(
    adata: AnnData,
    *,
    max_genes: int = pr.max_top_feature_genes,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high ``feature_gene`` value.

    This is applied after computing metacells to pick the "strongest" feature genes. If using the
    direct algorithm (:py:func:`metacells.pipeline.direct.compute_direct_metacells`) then all
    feature genes are equally "strong"; however, if using the divide-and-conquer algorithm
    (:py:func:`metacells.pipeline.divide_and_conquer.divide_and_conquer_pipeline`,
    :py:func:`metacells.pipeline.divide_and_conquer.compute_divide_and_conquer_metacells`) then this
    will pick the genes which were most commonly used as features across all the piles.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``feature_gene`` is a per-variable (gene) annotation counting how many times each gene was used
    as a feature. At least one gene must have a positive count.

    **Returns**

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask indicating whether each gene was found to be a top feature gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Starting from one, raise the ``feature_gene`` threshold one step at a time until at most
       ``max_genes`` (default: {max_genes}) genes reach it, or until the threshold reaches the
       maximal ``feature_gene`` value. Note we may therefore still pick more than ``max_genes``;
       for example, when using the direct algorithm we always return all the feature genes, as
       there is no way to distinguish between them using the ``feature_gene`` data.
    """
    feature_of_gene = ut.get_v_numpy(adata, "feature_gene", formatter=ut.mask_description)
    highest_count = np.max(feature_of_gene)
    assert highest_count > 0

    candidate_threshold = 0
    # Start above the limit so that the first threshold (one) is always evaluated.
    picked_count = max_genes + 1
    while picked_count > max_genes and candidate_threshold < highest_count:
        candidate_threshold += 1
        top_mask = feature_of_gene >= candidate_threshold
        picked_count = np.sum(top_mask)
        ut.log_calc(f"threshold: {candidate_threshold} selected: {picked_count}")

    if not inplace:
        ut.log_return("top_feature_gene", top_mask)
        return ut.to_pandas_series(top_mask, index=adata.var_names)

    ut.set_v_data(adata, "top_feature_gene", top_mask)
    return None
Ejemplo n.º 4
0
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high normalized variance of ``what`` (default: {what}) data.

    The normalized variance is the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "noisy", that is, have a significantly different
    expression level in different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the normalized
       variance of each gene.

    2. Select the genes whose normalized variance is at least
       ``min_gene_normalized_variance`` (default: {min_gene_normalized_variance}).
    """
    per_gene_normalized_variance = ut.normalized_variance_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
    )
    is_high_normalized_variance = per_gene_normalized_variance >= min_gene_normalized_variance

    if not inplace:
        ut.log_return("high_normalized_variance_genes", is_high_normalized_variance)
        return ut.to_pandas_series(is_high_normalized_variance, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", is_high_normalized_variance)
    return None
Ejemplo n.º 5
0
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,  # pylint: disable=invalid-name
    min_gene_topN: int,  # pylint: disable=invalid-name
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a high top-Nth value of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.rank_per` to get the top-Nth value of each
       gene. The rank requested is the number of cells minus ``topN`` minus one, but no less
       than one.

    2. Select the genes whose top-Nth value is at least ``min_gene_topN``.
    """
    column_major_data = ut.get_vo_proper(adata, what, layout="column_major")

    # Never ask for a rank below one, even if ``topN`` is close to (or above) the number of cells.
    requested_rank = adata.n_obs - topN - 1
    if requested_rank < 1:
        requested_rank = 1

    top_value_of_genes = ut.rank_per(column_major_data, per="column", rank=requested_rank)
    is_high_top = top_value_of_genes >= min_gene_topN

    if not inplace:
        ut.log_return(f"high_top{topN}_genes", is_high_top)
        return ut.to_pandas_series(is_high_top, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", is_high_top)
    return None
Ejemplo n.º 6
0
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies
    greatly between cells. This is exactly the information we are trying to analyze. We often would
    like to work on genes that have a sufficient level of expression for meaningful analysis.
    Specifically, it doesn't make sense to analyze genes that have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default:
       {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)

    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask has one entry per gene, so it must be indexed by the variable (gene) names rather
    # than by the observation (cell) names.
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
Ejemplo n.º 7
0
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high fraction of the total ``what`` (default: {what}) data of the cells.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high fraction of the
            total data.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    per_gene_fraction = ut.fraction_per(
        ut.get_vo_proper(adata, what, layout="column_major"),
        per="column",
    )
    is_high_fraction = per_gene_fraction >= min_gene_fraction

    if not inplace:
        ut.log_return("high_fraction_genes", is_high_fraction)
        return ut.to_pandas_series(is_high_fraction, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", is_high_fraction)
    return None
Ejemplo n.º 8
0
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high total number of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high total.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Fetch the per-gene total of the ``what`` data (summed when fetching the data).

    2. Select the genes whose total is at least ``min_gene_total``.
    """
    is_high_total = ut.get_v_numpy(adata, what, sum=True) >= min_gene_total

    if not inplace:
        ut.log_return("high_total_genes", is_high_total)
        return ut.to_pandas_series(is_high_total, index=adata.var_names)

    ut.set_v_data(adata, "high_total_gene", is_high_total)
    return None
Ejemplo n.º 9
0
def find_named_genes(
    adata: AnnData,
    *,
    names: Optional[Collection[str]] = None,
    patterns: Optional[Collection[Union[str, Pattern]]] = None,
    to: Optional[str] = None,
    invert: bool = False,
) -> Optional[ut.PandasSeries]:
    """
    Find genes by their (case-insensitive) name.

    This creates a mask of all the genes whose name appears in ``names`` or matches any of the
    ``patterns``. If ``invert`` (default: {invert}), invert the resulting mask.

    If ``to`` (default: {to}) is specified, this is stored as a per-variable (gene) annotation with
    that name, and returns ``None``. This is useful to fill gene masks such as ``excluded_genes``
    (genes which should be excluded from the rest of the processing) and ``forbidden_genes`` (genes
    which must not be chosen as feature genes).

    Otherwise, it returns it as a pandas series (indexed by the variable, that is gene, names).
    """
    if names is None:
        names_mask = np.zeros(adata.n_vars, dtype="bool")
    else:
        lower_names_set = {name.lower() for name in names}
        # Force a boolean dtype: an empty list would otherwise become a float array, which can not
        # be combined with the patterns mask using ``|``.
        names_mask = np.array([name.lower() in lower_names_set for name in adata.var_names], dtype="bool")

    if patterns is None:
        patterns_mask = np.zeros(adata.n_vars, dtype="bool")
    else:
        patterns_mask = ut.patterns_matches(patterns, adata.var_names)

    genes_mask = names_mask | patterns_mask

    if invert:
        genes_mask = ~genes_mask

    if to is not None:
        ut.set_v_data(adata, to, genes_mask)
        return None

    ut.log_return("named_genes", genes_mask)
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
Ejemplo n.º 10
0
def _results(
    *,
    adata: AnnData,
    rare_module_of_cells: ut.NumpyVector,
    list_of_rare_gene_indices_of_modules: List[List[int]],
    inplace: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Deliver the rare gene modules results, either into ``adata`` or as data frames.

    For each module in ``list_of_rare_gene_indices_of_modules`` this produces a per-gene boolean
    mask named ``rare_gene_module_<index>``, as well as a ``rare_gene`` mask which is the union of
    all the modules. Per cell, this produces ``cells_rare_gene_module`` (the
    ``rare_module_of_cells`` vector as given) and a boolean ``rare_cell`` mask (the module index
    is non-negative).

    If ``inplace``, everything is written as annotations of ``adata`` and ``None`` is returned.
    Otherwise, nothing is written, and the result is a tuple of two data frames: the per-cell
    frame (indexed by the observation names) and the per-gene frame (indexed by the variable
    names).
    """
    # The highest module index assigned to a cell must be the last module in the list.
    assert np.max(
        rare_module_of_cells) == len(list_of_rare_gene_indices_of_modules) - 1

    if not inplace:
        var_metrics = ut.to_pandas_frame(index=adata.var_names)

    rare_gene_mask = np.zeros(adata.n_vars, dtype="bool")
    for module_index, rare_gene_indices_of_module in enumerate(
            list_of_rare_gene_indices_of_modules):
        rare_module_gene_mask = np.zeros(adata.n_vars, dtype="bool")
        rare_module_gene_mask[rare_gene_indices_of_module] = True
        property_name = f"rare_gene_module_{module_index}"
        if inplace:
            ut.set_v_data(adata, property_name, rare_module_gene_mask)
        else:
            var_metrics[property_name] = rare_module_gene_mask
            ut.log_return(property_name, rare_module_gene_mask)
        rare_gene_mask |= rare_module_gene_mask

    if inplace:
        ut.set_v_data(adata, "rare_gene", rare_gene_mask)
    else:
        var_metrics["rare_gene"] = rare_gene_mask
        ut.log_return("rare_gene", rare_gene_mask)

    rare_cell_mask = rare_module_of_cells >= 0

    if inplace:
        ut.set_o_data(adata,
                      "cells_rare_gene_module",
                      rare_module_of_cells,
                      formatter=ut.groups_description)
        ut.set_o_data(adata, "rare_cell", rare_cell_mask)
        return None

    obs_metrics = ut.to_pandas_frame(index=adata.obs_names)

    # Actually store the per-cell results in the returned frame (mirroring what is done for the
    # per-gene frame above); logging them alone would return an empty frame to the caller.
    obs_metrics["cells_rare_gene_module"] = rare_module_of_cells
    ut.log_return("cells_rare_gene_module",
                  rare_module_of_cells,
                  formatter=ut.groups_description)

    obs_metrics["rare_cell"] = rare_cell_mask
    ut.log_return("rare_cell", rare_cell_mask)

    return obs_metrics, var_metrics
Ejemplo n.º 11
0
def compute_candidate_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "obs_outgoing_weights",
    *,
    target_metacell_size: float,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.candidates_cell_sizes,
    cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    cooldown_phase: float = pr.cooldown_phase,
    min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    min_metacell_cells: Optional[int] = pr.candidates_min_metacell_cells,
    max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    random_seed: int = 0,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Assign observations (cells) to (raw, candidate) metacells based on ``what`` data (a weighted
    directed graph).

    These candidate metacells typically go through additional vetting (e.g. deviant detection and
    dissolving too-small metacells) to obtain the final metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-observation-per-observation matrix where each row is the outgoing weights from
    each observation to the rest, or just the name of a per-observation-per-observation annotation
    containing such a matrix. Typically this matrix will be sparse for efficient processing.

    **Returns**

    Observation (Cell) Annotations
        ``candidate``
            The integer index of the (raw, candidate) metacell each cell belongs to. The metacells
            are in no particular order. Cells which were not assigned to any candidate metacell are
            given the index ``-1``.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    Regardless of ``inplace``, the initial seeds are always written to the data as the ``seed``
    per-observation annotation.

    **Computation Parameters**

    1. We are trying to build metacells of ``target_metacell_size``, using the ``cell_sizes``
       (default: {cell_sizes}) to assign a size for each node (cell). This can be a string name of a
       per-observation annotation or a vector of values.

    2. We start with an assignment of cells to ``cell_seeds`` (default: {cell_seeds}). If no
       seeds are provided, we use :py:func:`choose_seeds` using ``min_seed_size_quantile`` (default:
       {min_seed_size_quantile}) and ``max_seed_size_quantile`` (default: {max_seed_size_quantile})
       to compute them, picking a number of seeds such that the average metacell size would match
       the target.

    3. We optimize the seeds using :py:func:`optimize_partitions` to obtain initial communities by
       maximizing the "stability" of the solution (probability of starting at a random node and
       moving either forward or backward in the graph and staying within the same metacell, divided
       by the probability of staying in the metacell if the edges connected random nodes). We pass
       it the ``cooldown_pass`` (default: {cooldown_pass}) and ``cooldown_node`` (default:
       {cooldown_node}).

    4. If ``min_split_size_factor`` (default: {min_split_size_factor}) is specified, randomly split
       in two each community whose size is at least
       ``target_metacell_size * min_split_size_factor`` and re-optimize the solution (resulting in
       one additional metacell). Every time we re-optimize, we multiply 1 - ``cooldown_pass`` by
       1 - ``cooldown_phase`` (default: {cooldown_phase}).

    5. If ``max_split_min_cut_strength`` (default: {max_split_min_cut_strength}) is specified, and
       the minimal cut of a candidate is lower, split it into two. If one of the partitions is
       smaller than ``min_cut_seed_cells`` (default: {min_cut_seed_cells}), then mark the cells in
       it as outliers, or if ``must_complete_cover`` (default: {must_complete_cover}) is ``True``,
       skip the cut altogether.

    6. If ``max_merge_size_factor`` (default: {max_merge_size_factor}) or ``min_metacell_cells``
       (default: {min_metacell_cells}) are specified, make outliers of cells of a community whose
       size is at most ``target_metacell_size * max_merge_size_factor`` or contains less cells and
       re-optimize, which will assign these cells to other metacells (resulting in one less
       metacell). We again apply the ``cooldown_phase`` every time we re-optimize.

    7. Repeat the above steps until all metacells candidates are in the acceptable size range.
    """
    edge_weights = ut.get_oo_proper(adata, what, layout="row_major")
    assert edge_weights.shape[0] == edge_weights.shape[1]
    assert 0.0 < cooldown_pass < 1.0
    assert 0.0 <= cooldown_node <= 1.0
    assert 0.0 < cooldown_phase <= 1.0

    size = edge_weights.shape[0]

    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)

    assert ut.is_layout(outgoing_edge_weights, "row_major")
    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    # The helpers below are given the raw compressed arrays, which must use these exact types.
    assert outgoing_edge_weights.data.dtype == "float32"
    assert outgoing_edge_weights.indices.dtype == "int32"
    assert outgoing_edge_weights.indptr.dtype == "int32"
    assert incoming_edge_weights.data.dtype == "float32"
    assert incoming_edge_weights.indices.dtype == "int32"
    assert incoming_edge_weights.indptr.dtype == "int32"

    node_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)
    if node_sizes is None:
        # Without explicit sizes, every cell counts as a single unit.
        node_sizes = np.full(size, 1.0, dtype="float32")
    else:
        node_sizes = node_sizes.astype("float32")
    ut.log_calc("node_sizes", node_sizes, formatter=ut.sizes_description)

    assert target_metacell_size > 0
    max_metacell_size = None
    min_metacell_size = None

    if min_split_size_factor is not None:
        assert min_split_size_factor > 0
        # The largest size which is still strictly below the split threshold.
        max_metacell_size = ceil(target_metacell_size * min_split_size_factor) - 1
    ut.log_calc("max_metacell_size", max_metacell_size)

    if max_merge_size_factor is not None:
        assert max_merge_size_factor > 0
        # The smallest size which is still strictly above the merge threshold.
        min_metacell_size = floor(target_metacell_size * max_merge_size_factor) + 1
    ut.log_calc("min_metacell_size", min_metacell_size)

    target_metacell_cells = max(
        1.0 if min_metacell_cells is None else float(min_metacell_cells),
        float(target_metacell_size / np.mean(node_sizes)),
    )
    ut.log_calc("target_metacell_cells", target_metacell_cells)

    if min_split_size_factor is not None and max_merge_size_factor is not None:
        assert max_merge_size_factor < min_split_size_factor
        assert min_metacell_size is not None
        assert max_metacell_size is not None
        assert min_metacell_size <= max_metacell_size

    community_of_nodes = ut.maybe_o_numpy(adata, cell_seeds, formatter=ut.groups_description)

    if community_of_nodes is not None:
        assert community_of_nodes.dtype == "int32"
    else:
        target_seeds_count = ceil(size / target_metacell_cells)
        ut.log_calc("target_seeds_count", target_seeds_count)

        community_of_nodes = np.full(size, -1, dtype="int32")
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=target_seeds_count,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    ut.set_o_data(adata, "seed", community_of_nodes, formatter=ut.groups_description)
    # Work on a private copy so the stored seeds (or the caller's vector) are not modified below.
    community_of_nodes = community_of_nodes.copy()

    np.random.seed(random_seed)

    cold_temperature = 1 - cooldown_pass

    old_score = 1e9
    old_communities = community_of_nodes
    old_small_nodes_count = len(community_of_nodes)
    atomic_candidates: Set[Tuple[int, ...]] = set()
    kept_communities_count = 0

    while True:
        cold_temperature, score = _optimize_split_communities(  #
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            target_metacell_size=target_metacell_size,
            max_metacell_size=max_metacell_size,
            max_split_min_cut_strength=max_split_min_cut_strength,
            min_cut_seed_cells=min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
            cooldown_pass=cooldown_pass,
            cooldown_node=cooldown_node,
            cooldown_phase=cooldown_phase,
            kept_communities_count=kept_communities_count,
            cold_temperature=cold_temperature,
            atomic_candidates=atomic_candidates,
        )

        small_communities, small_nodes_count = _find_small_communities(
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            min_metacell_size=min_metacell_size,
            min_metacell_cells=min_metacell_cells,
        )

        small_communities_count = len(small_communities)
        if small_communities_count < 2:
            break

        # Compare (number of cells in too-small communities, score) lexicographically; if this
        # round did not improve on the previous one, go back to the previous solution and stop.
        if (old_small_nodes_count, old_score) <= (small_nodes_count, score):
            ut.logger().debug("is not better, revert")
            community_of_nodes = old_communities
            score = old_score
            ut.log_calc("communities", community_of_nodes, formatter=ut.groups_description)
            ut.log_calc("score", score)
            break

        old_score = score
        old_communities = community_of_nodes.copy()
        old_small_nodes_count = small_nodes_count

        kept_communities_count = _cancel_communities(
            community_of_nodes=community_of_nodes, cancelled_communities=small_communities
        )

        # Re-seed the cancelled cells using one seed less than the number of cancelled communities.
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=kept_communities_count + small_communities_count - 1,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    # Finalize the result *before* deciding how to deliver it, so that the annotation stored when
    # ``inplace`` is identical to the series returned otherwise.
    if must_complete_cover:
        assert np.min(community_of_nodes) == 0
    else:
        # Collapse every negative marker into the single "no candidate" value.
        community_of_nodes[community_of_nodes < 0] = -1

    if inplace:
        ut.set_o_data(adata, "candidate", community_of_nodes, formatter=ut.groups_description)
        return None

    ut.log_return("candidate", community_of_nodes, formatter=ut.groups_description)
    return ut.to_pandas_series(community_of_nodes, index=adata.obs_names)
Ejemplo n.º 12
0
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which carry a significant signal in metacells data. This computation is too unreliable to be applied to
    single cells.

    A gene is considered significant if it is highly expressed in at least one metacell and, in addition, its
    expression spans a wide range across the metacells. Such genes are good candidates for serving as marker genes
    and/or for computing distances between metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level (fraction) of each gene.

    2. Select the genes whose fold factor (log2 of the maximal over the minimal value, after adding the
       ``normalization`` (default: {normalization}) to both) is at least ``min_gene_range_fold`` (default:
       {min_gene_range_fold}).

    3. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default: {min_gene_fraction}).
    """
    assert normalization >= 0

    row_major_data = ut.get_vo_proper(adata, what, layout="row_major")
    row_fractions = ut.fraction_by(row_major_data, by="row")
    gene_fractions = ut.to_layout(row_fractions, layout="column_major")

    lowest_of_genes = ut.min_per(gene_fractions, per="column")
    highest_of_genes = ut.max_per(gene_fractions, per="column")

    # This test must use the raw maximal fractions, so it is done before they are overwritten below.
    is_high_fraction_gene = highest_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", is_high_fraction_gene)

    # From here on the per-gene vectors are updated in place; ``highest_of_genes`` ends up holding the fold factors.
    lowest_of_genes += normalization
    highest_of_genes += normalization
    np.divide(highest_of_genes, lowest_of_genes, out=highest_of_genes)
    fold_of_genes = np.log2(highest_of_genes, out=highest_of_genes)

    is_high_range_gene = fold_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", is_high_range_gene)

    is_significant_gene = is_high_fraction_gene & is_high_range_gene

    if not inplace:
        ut.log_return("significant_genes", is_significant_gene)
        return ut.to_pandas_series(is_significant_gene, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", is_significant_gene)
    return None
Ejemplo n.º 13
0
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they
    belong to, based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as two pandas series: first the per-cell votes (indexed by
    the observation names), then the per-gene votes (indexed by the variable names).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared to
    the mean expression in the candidate metacells. We then mark as deviants some fraction of the
    cells whose expression of these genes was least predictable compared to the mean expression in
    the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene.
       Scale this by each cell's total UMIs to compute the expected number of UMIs for each cell.
       Compute the fold factor log2((actual UMIs + 1) / (expected UMIs + 1)) for each gene for each
       cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default: {min_gene_fold_factor}). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors. Count the number of genes which have a
       fold factor above this minimum in at least one cell. If the fraction of such genes is above ``max_gene_fraction``
       (default: {max_gene_fraction}), then raise the minimal gene fold factor such that at most this fraction of genes
       remain. A ``max_gene_fraction`` of ``None`` is treated as 1.

    3. For each remaining gene, rank all the cells where it is expressed above the min fold
       factor. Give an artificial maximum rank to all cells with fold factor 0, that is, below the
       minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell has
       a rank of 1, it means that it has at least one gene whose expression fold factor is the worst
       (highest) across all cells (and is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that
       is, which contain at least one gene whose expression fold factor is high relative to the rest
       of the cells. If the fraction of such cells is higher than ``max_cell_fraction`` (default:
       {max_cell_fraction}), reduce the maximal rank such that at most this fraction of cells are
       selected as deviants. A ``max_cell_fraction`` of ``None`` is treated as 1.
    """
    # ``None`` is replaced by a fraction of 1, that is, the whole population of genes / cells.
    if max_gene_fraction is None:
        max_gene_fraction = 1

    if max_cell_fraction is None:
        max_cell_fraction = 1

    assert min_gene_fold_factor > 0
    # The upper bound must be inclusive: with a strict ``< 1`` the value 1 substituted above for
    # ``None`` would always fail the assertion, making ``None`` unusable.
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        # No fold factors were constructed, so nothing votes for any cell or gene.
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    # Note the order: per-cell votes first, per-gene votes second.
    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
Ejemplo n.º 14
0
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    # NOTE(review): the default of the *min* quantile reads the ``..._max_cell_quantile`` parameter and the
    # default of the *max* quantile reads the ``..._min_cell_quantile`` one. This looks swapped - confirm
    # against the parameters module before relying on the defaults.
    downsample_min_cell_quantile: float = pr.
    noisy_lonely_downsample_max_cell_quantile,
    downsample_max_cell_quantile: float = pr.
    noisy_lonely_downsample_min_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.
    noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Compute a mask of the genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_gene``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Keep only the genes which have a total number of downsampled UMIs of at least
       ``min_gene_total`` (default: {min_gene_total}).

    5. Cross-correlate these high-total genes.

    6. Of the high-total genes, find the "noisy" genes which have a normalized variance of at least
       ``min_gene_normalized_variance`` (default: {min_gene_normalized_variance}).

    7. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other high-total genes.
    """
    # Work either on a random sample of the cells or on a copy of all of them; the intermediate
    # annotations computed below are written to these derived objects.
    if max_sampled_cells < adata.n_obs:
        # NOTE: this re-seeds numpy's global random state.
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    # Name of the per-gene annotation passed as ``track_var`` to the first ``filter_data`` call made
    # below. It is read back later (as "sampled_gene_index") and used to index a vector of
    # ``adata.n_vars`` entries, so it presumably holds each gene's index in the unfiltered data -
    # TODO confirm against ``filter_data``.
    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        # Tracking was done by this filter, so the next filter is not asked to track again
        # (presumably so the recorded indices keep referring to the unfiltered genes).
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    # Stores its result in ``i_data``; the code below reads it under the name "downsampled".
    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    # The result mask covers all the genes of the original ``adata``; by default no gene is marked.
    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    # ``ht_data`` may be ``None`` (presumably when no gene passed the filter), in which case the
    # all-``False`` mask is the result.
    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        # Overwrite each gene's similarity to itself with -1 so that it does not count towards the
        # gene's maximal similarity computed below.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        # "htv": the subset of the high-total genes which also have a high normalized variance.
        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            # Rows: the high-variance genes; columns: all the high-total genes.
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            # "htvl": the high-variance genes which are also lonely, that is, whose highest
            # similarity to any other high-total gene does not exceed the threshold.
            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                # Translate positions within the filtered data back to positions in ``adata``, by
                # applying the same nested masks to the tracked gene indices.
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                # The rest of this branch only produces log output: when ``ut.logging_calc()`` is
                # true, describe each noisy lonely gene and the genes most similar to it.
                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    # Presumably keeps only the 10 highest similarities in each row and zeroes the
                    # rest; the ``> 0`` test below relies on this - TODO confirm against
                    # ``ut.top_per``.
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])  #
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        # The similar genes are listed from the most to the least similar.
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
Ejemplo n.º 15
0
def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve too-small metacells based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment are given a metacell index of
            ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with non-zero ``deviants`` (default: {deviants}) as "outliers". This can be
       the name of a per-observation (cell) annotation, or an explicit boolean mask of cells, or
       ``None`` if there are no deviant cells to mark.

    2. Any metacell which has fewer cells than the ``min_metacell_cells`` is dissolved.

    3. We are trying to create metacells of size ``target_metacell_size``. Compute the sizes of the
       resulting metacells by summing the ``cell_sizes`` (default: {cell_sizes}). If it is ``None``,
       each has a size of one. These parameters are typically identical to these passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any
       metacell whose total size is at least ``target_metacell_size * min_robust_size_factor`` is
       preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is specified, then any remaining
       metacells whose size is at least ``target_metacell_size * min_convincing_size_factor`` are preserved, given they
       contain at least one gene whose fold factor (log2((actual + 1) / (expected + 1))) is at least
       ``min_convincing_gene_fold_factor`` (default: {min_convincing_gene_fold_factor}). If ``abs_folds``, consider the
       absolute fold factors. That is, we only preserve these smaller metacells if there is at least one gene whose
       expression is significantly different from the mean of the population.

    6. Any remaining metacell is dissolved into "outlier" cells.
    """
    was_dissolved = np.zeros(adata.n_obs, dtype="bool")

    # Work on a private copy, since the group of some of the cells is overwritten below.
    group_of_cells = np.copy(ut.get_o_numpy(adata, candidates, formatter=ut.groups_description))

    deviant_votes = ut.maybe_o_numpy(adata, deviants, formatter=ut.mask_description)
    is_deviant = None if deviant_votes is None else deviant_votes > 0

    cell_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)

    # Deviant cells are taken out of their group before the groups are evaluated.
    if is_deviant is not None:
        group_of_cells[is_deviant] = -1
    group_of_cells = ut.compress_indices(group_of_cells)
    groups_count = np.max(group_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    min_robust_size = None if min_robust_size_factor is None else target_metacell_size * min_robust_size_factor
    ut.log_calc("min_robust_size", min_robust_size)

    min_convincing_size = (
        None if min_convincing_size_factor is None else target_metacell_size * min_convincing_size_factor
    )
    ut.log_calc("min_convincing_size", min_convincing_size)

    any_dissolved = False
    for group_index in range(groups_count):
        member_indices = np.where(group_of_cells == group_index)[0]
        is_kept = _keep_candidate(
            adata,
            group_index,
            data=data,
            cell_sizes=cell_sizes,
            fraction_of_genes=fraction_of_genes,
            min_metacell_cells=min_metacell_cells,
            min_robust_size=min_robust_size,
            min_convincing_size=min_convincing_size,
            min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
            abs_folds=abs_folds,
            candidates_count=groups_count,
            candidate_cell_indices=member_indices,
        )
        if is_kept:
            continue

        # The members of a rejected group become unassigned, and are remembered as dissolved.
        was_dissolved[member_indices] = True
        group_of_cells[member_indices] = -1
        any_dissolved = True

    # Dissolving groups leaves holes in the numbering, so renumber only when something changed.
    metacell_of_cells = ut.compress_indices(group_of_cells) if any_dissolved else group_of_cells

    if not inplace:
        ut.log_return("dissolved", was_dissolved)
        ut.log_return("metacell", metacell_of_cells, formatter=ut.groups_description)

        result_frame = ut.to_pandas_frame(index=adata.obs_names)
        result_frame["dissolved"] = was_dissolved
        result_frame["metacell"] = metacell_of_cells
        return result_frame

    ut.set_o_data(adata, "dissolved", was_dissolved, formatter=ut.mask_description)
    ut.set_o_data(adata, "metacell", metacell_of_cells, formatter=ut.groups_description)
    return None
Ejemplo n.º 16
0
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    The total number of UMIs differs from cell to cell, due to both technical effects and natural
    variance. We often want to analyze only the cells which contain enough UMIs for the analysis to
    be meaningful; sometimes we also want to exclude the cells which have "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless
       it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless
       it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must
       not be ``None`` and should contain just the excluded genes data for each cell. Exclude all
       cells whose sum of the excluded data divided by the total data is more than the specified
       threshold.
    """
    # The excluded data and its threshold must be given together, or not at all.
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    umis_of_cells = ut.get_o_numpy(adata, what, sum=True)

    is_proper_cell = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        is_proper_cell &= umis_of_cells >= min_cell_total

    if max_cell_total is not None:
        is_proper_cell &= umis_of_cells <= max_cell_total

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_umis_of_cells = ut.sum_per(ut.get_vo_proper(excluded_adata, layout="row_major"), per="row")

        # Avoid dividing by zero: use a denominator of 1 for cells with no data at all. This is done
        # on a copy so that the totals fetched above are left untouched.
        denominator_of_cells = umis_of_cells
        if np.min(denominator_of_cells) == 0:
            denominator_of_cells = np.copy(denominator_of_cells)
            denominator_of_cells[denominator_of_cells == 0] = 1

        excluded_fraction_of_cells = excluded_umis_of_cells / denominator_of_cells
        is_proper_cell &= excluded_fraction_of_cells <= max_excluded_genes_fraction

    if not inplace:
        ut.log_return("properly_sampled_cell", is_proper_cell)
        return ut.to_pandas_series(is_proper_cell, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", is_proper_cell)
    return None
Ejemplo n.º 17
0
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the
    mask as an annotation (per-variable or per-observation depending on the type of the combined masks).

    **Computation Parameters**

    1. For each of the masks in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or if
       it has a ``&`` prefix), bitwise-AND the mask into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.

    **Errors**

    Raises ``KeyError`` if a mask without a ``?`` suffix is neither a per-observation nor a
    per-variable annotation (this includes an empty name, or a name made only of prefix characters).
    Raises ``ValueError`` if per-observation and per-variable masks are mixed, or if none of the
    requested masks exists.
    """
    assert len(masks) > 0

    # Whether the combined masks are per-observation ("o") or per-variable ("v"); fixed by the
    # first mask that is actually found.
    per: Optional[str] = None

    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        # Keep the full specification (with its prefixes/suffix) for logging.
        log_mask_name = mask_name

        # Parse the decorations using ``startswith``/``endswith`` rather than indexing, so that an
        # empty name (or one containing only prefix characters) reaches the ``KeyError`` below
        # instead of failing with an unrelated ``IndexError``.
        is_or = mask_name.startswith("|")
        if is_or or mask_name.startswith("&"):
            mask_name = mask_name[1:]

        invert_mask = mask_name.startswith("~")
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = not mask_name.endswith("?")
        if not must_exist:
            mask_name = mask_name[:-1]

        # Per-observation annotations take precedence over per-variable ones of the same name.
        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # NOTE(review): the ``> 0`` comparison above presumably always yields a boolean array, so
        # this check looks unreachable; it is kept unchanged to preserve behavior. Confirm whether
        # the raw annotation (before the comparison) was meant to be validated instead.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    # Every requested mask was optional and missing.
    if and_mask is None and or_mask is None:
        raise ValueError("no masks to combine")

    if and_mask is None:
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None