Example #1
0
def find_top_feature_genes(
    adata: AnnData,
    *,
    max_genes: int = pr.max_top_feature_genes,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Select the "strongest" feature genes according to their ``feature_gene`` value.

    This is meant to be applied after the metacells were computed. When using the direct algorithm
    (:py:func:`metacells.pipeline.direct.compute_direct_metacells`) all the feature genes are
    equally "strong". When using the divide-and-conquer algorithm
    (:py:func:`metacells.pipeline.divide_and_conquer.divide_and_conquer_pipeline`,
    :py:func:`metacells.pipeline.divide_and_conquer.compute_divide_and_conquer_metacells`) the
    selected genes are the ones which were used as features in the largest number of piles.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``feature_gene`` is a per-variable (gene) annotation counting how many times each gene was used
    as a feature. At least one gene must have a positive ``feature_gene`` value.

    **Returns**

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask indicating whether each gene was found to be a top feature gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Starting from one, raise the ``feature_gene`` threshold in steps of one until at most
       ``max_genes`` genes reach it, or until the threshold equals the maximal ``feature_gene``
       value. In the latter case more than ``max_genes`` genes may be selected; for example, with
       the direct algorithm all the feature genes are returned, since the ``feature_gene`` data
       can't distinguish between them.
    """
    usage_of_gene = ut.get_v_numpy(adata, "feature_gene", formatter=ut.mask_description)
    highest_usage = np.max(usage_of_gene)
    assert highest_usage > 0

    # Start "over budget" so that the first threshold (one) is always evaluated.
    cutoff = 0
    picked_count = max_genes + 1
    while cutoff < highest_usage and picked_count > max_genes:
        cutoff += 1
        top_mask = usage_of_gene >= cutoff
        picked_count = np.sum(top_mask)
        ut.log_calc(f"threshold: {cutoff} selected: {picked_count}")

    if not inplace:
        ut.log_return("top_feature_gene", top_mask)
        return ut.to_pandas_series(top_mask, index=adata.var_names)

    ut.set_v_data(adata, "top_feature_gene", top_mask)
    return None
Example #2
0
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies
    greatly between cells. This is exactly the information we are trying to analyze. We often would
    like to work on genes that have a sufficient level of expression for meaningful analysis.
    Specifically, it doesn't make sense to analyze genes that have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default:
       {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)
    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask holds one entry per gene, so it is indexed by the variable (gene) names rather than
    # by the observation (cell) names.
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
Example #3
0
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high total number of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high total.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the total of the ``what`` data of each gene.

    2. Select the genes whose total is at least ``min_gene_total``.
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)
    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "high_total_gene", genes_mask)
        return None

    ut.log_return("high_total_genes", genes_mask)
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
Example #4
0
def _first_index_of_names(names: List[str]) -> Dict[str, int]:
    """Map each name to the position of its first occurrence in ``names`` (like ``list.index``)."""
    index_of_name: Dict[str, int] = {}
    for position, name in enumerate(names):
        index_of_name.setdefault(name, position)
    return index_of_name


def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes.

    This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressively filtered out
    a significant amount of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are metacells and the variables are genes,
    and where ``what`` is the name of a per-variable-per-observation annotation containing the UMIs matrix used for
    computing the totals. The pseudo-gene column itself is appended to the ``X`` matrix of the query.

    **Returns**

    None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional
    variable (gene) called ``ATLASNORM``, such that the total number of UMIs for each query metacell is as expected
    given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the query and the
    atlas have exactly the same list of genes, if they have no genes in common, or if the query already contains a high
    number of genes missing from the atlas so that the total number of UMIs for the query metacells is already at least
    the expected based on the common genes.

    **Raises**

    ``RuntimeError`` if ``var_annotations``, ``layers`` or ``varp_annotations`` lack a default value for some
    per-variable annotation (whose name does not contain ``|``), layer, or per-variable-per-variable annotation of the
    query.

    **Computation Parameters**

    1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs)
       would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene)
       annotation, add the value specified in ``var_annotations``, whose list of keys must cover the set of
       per-variable annotations in the query data. For each per-observation-per-variable layer, add the value specified
       in ``layers``, whose list of keys must cover the existing layers. For each per-variable-per-variable annotation,
       add the value specified in ``varp_annotations``.
    """
    # Validate up front that every annotation that has to be extended has a default value.
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations:
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers:
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations:
            raise RuntimeError(f"missing default value for variable-variable {name}")

    query_genes_list = list(qdata.var_names)
    atlas_genes_list = list(adata.var_names)
    if query_genes_list == atlas_genes_list:
        return None

    common_genes_list = sorted(set(query_genes_list) & set(atlas_genes_list))
    if not common_genes_list:
        # Without any common genes the UMI ratios below are undefined, so normalization is impossible.
        return None

    # Use name-to-index dictionaries so locating the common genes is linear in the number of genes.
    query_index_of_gene = _first_index_of_names(query_genes_list)
    atlas_index_of_gene = _first_index_of_names(atlas_genes_list)
    query_gene_indices = np.array([query_index_of_gene[gene] for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_index_of_gene[gene] for gene in common_genes_list])
    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")

    assert list(common_qdata.var_names) == list(common_adata.var_names)

    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
    # The UMIs of the non-common genes, as a fraction of the UMIs of the common genes.
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)

    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)

    # Keep the result sparse if the original data was sparse.
    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    # Annotations which do not depend on the genes are copied as-is.
    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    # Per-gene annotations get one more entry (the specified default) for the pseudo-gene.
    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    # Layers get one more column (filled with the specified default) for the pseudo-gene.
    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vo_data(ndata, name, added)

    # Gene-gene matrices get both one more column and one more row for the pseudo-gene.
    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        new_column = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, new_column[:, np.newaxis]], axis=1)
        # The new row must be shaped (1, n_vars + 1) to be stacked below the widened matrix.
        new_row = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, new_row[np.newaxis, :]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vv_data(ndata, name, added)

    return ndata
Example #5
0
def relate_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_sampled_cells: int = pr.related_max_sampled_cells,
    downsample_min_samples: float = pr.related_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.related_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.related_downsample_max_cell_quantile,
    min_gene_relative_variance: float = pr.related_min_gene_relative_variance,
    min_gene_total: int = pr.related_min_gene_total,
    min_gene_top3: int = pr.related_min_gene_top3,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    genes_similarity_method: str = pr.related_genes_similarity_method,
    genes_cluster_method: str = pr.related_genes_cluster_method,
    min_genes_of_modules: int = pr.related_min_genes_of_modules,
    random_seed: int = 0,
) -> None:
    """
    Detect coarse relations between genes based on ``what`` (default: {what}) data.

    This is a quick-and-dirty way to group genes together and should only be used as a starting
    point for more precise forms of gene relationship analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable-pair (Gene) Annotations
        ``related_genes_similarity``
            The similarity between each two related genes.

    Variable (Gene) Annotations
        ``related_genes_module``
            The index of the gene module for each gene (``-1`` for genes which are not in any
            module).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. Pick candidate genes using :py:func:`metacells.pipeline.feature.extract_feature_data`,
       forwarding it the ``downsample_min_samples``, ``downsample_min_cell_quantile``,
       ``downsample_max_cell_quantile``, ``min_gene_relative_variance``, ``min_gene_total``,
       ``min_gene_top3``, ``forbidden_gene_names``, ``forbidden_gene_patterns`` and ``random_seed``.

    3. Compute the similarity between the feature genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    4. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    5. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False)
        sdata = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        sdata = ut.copy_adata(adata, top_level=False)

    fdata = extract_feature_data(
        sdata,
        what,
        top_level=False,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        min_gene_relative_variance=min_gene_relative_variance,
        min_gene_total=min_gene_total,
        min_gene_top3=min_gene_top3,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )
    assert fdata is not None

    # Reproducible computation is only requested for a non-zero random seed.
    frame = tl.compute_var_var_similarity(
        fdata, what, method=genes_similarity_method, reproducible=(random_seed != 0), inplace=False
    )
    assert frame is not None
    similarity = ut.to_layout(ut.to_numpy_matrix(frame), layout="row_major")

    linkage = _cluster_genes(similarity, genes_cluster_method)
    clusters = _linkage_to_clusters(linkage, min_genes_of_modules, fdata.n_vars)

    # Genes which are not in any module keep the -1 module index.
    cluster_of_genes = pd.Series(np.full(adata.n_vars, -1, dtype="int32"), index=adata.var_names)
    for cluster_index, gene_indices in enumerate(clusters):
        cluster_of_genes[fdata.var_names[gene_indices]] = cluster_index

    ut.set_v_data(adata, "related_genes_module", cluster_of_genes, formatter=ut.groups_description)

    # Scatter the feature-genes similarity matrix into a sparse all-genes matrix. The flattened
    # data is in row-major order, so the row index is repeated and the column index is tiled.
    feature_gene_indices = ut.get_v_numpy(fdata, "full_gene_index")
    data = similarity.flatten(order="C")
    rows = np.repeat(feature_gene_indices, len(feature_gene_indices))
    cols = np.tile(feature_gene_indices, len(feature_gene_indices))
    full_similarity = sp.csr_matrix((data, (rows, cols)), shape=(adata.n_vars, adata.n_vars))

    ut.set_vv_data(adata, "related_genes_similarity", full_similarity)
Example #6
0
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Compute a mask of the genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_gene``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: {min_gene_normalized_variance}).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False)
        s_data = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    # The index of each gene in the sampled data is tracked only by the first filtering step which
    # is actually applied, so that it refers to the full list of genes.
    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(
            s_data,
            name="included",
            top_level=False,
            track_var=track_var,
            var_masks=[f"~{excluded_genes_mask}"],
        )
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(
        i_data,
        name="high_total",
        top_level=False,
        track_var=track_var,
        var_masks=["high_total_gene"],
    )
    assert results is not None
    ht_data = results[0]

    # Naming: "ht" are the high-total genes, "htv" are the high-total high-variance (noisy) genes,
    # and "htvl" are the noisy genes which are also lonely.
    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0)
        )
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix, layout="row_major", symmetric=True
        )
        # Do not count the similarity of each gene with itself when looking for its maximum.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False,
        )
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                # Map the noisy lonely genes back to their index in the full list of genes.
                base_index_of_ht_genes = ut.get_v_numpy(ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major"
                )
                assert htvl_gene_ht_gene_similarity_matrix.shape == (htvl_genes_count, ht_genes_count)

                # The rest is only for logging the details of each noisy lonely gene.
                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data, "downsampled", sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(top_similarity_of_htvl_genes[htvl_index, :])
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} "
                            + f"({gene_percent:.4g}%), correlated with: "
                            + ", ".join(
                                [
                                    f"{similar_gene_name}: {similar_gene_value:.4g}"
                                    for similar_gene_value, similar_gene_name in reversed(
                                        sorted(zip(top_similar_ht_values, top_similar_ht_names))
                                    )
                                ]
                            ),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names", sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
Example #7
0
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    """
    Expand each rare gene module with additional "related" genes.

    Only modules with at least ``min_genes_of_modules`` rare genes are considered. A considered
    module is dropped if the number of cells with a non-zero total of its rare genes is above
    ``max_cells_of_modules`` or below ``min_cells_of_modules``.

    For each surviving module, the candidate related genes are the genes in ``allowed_genes_mask``
    whose maximal value in the expressing cells is at least ``min_gene_maximum`` and whose fraction
    of the total of the expressing cells is at least 2 to the power of
    ``min_related_gene_fold_factor`` times their fraction of the total of the rest of the cells. A
    candidate is rejected if it is a rare gene of any considered module, or if adding its values to
    the per-cell total of the module's rare genes makes the number of cells reaching
    ``min_cell_module_total`` exceed ``max_related_gene_increase_factor`` times the number of cells
    reaching it using only the rare genes.

    Returns one list of gene indices per surviving module, in the input order. Each list starts
    with the module's rare gene indices, followed by the accepted related gene indices.
    """
    # Per-gene totals over all the cells; the background totals are derived from this below.
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(
        adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    # All the rare genes of all the large-enough modules, used to reject candidates which are
    # already rare genes of some module.
    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(list(rare_gene_indices_of_module))

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        # NOTE(review): this index advances for every large-enough module, including ones dropped
        # by the cell count checks below, so it may differ from the module's position in the
        # returned list (which is what the final log at the end of the function enumerates).
        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names",
                sorted(adata_of_all_genes_of_all_cells.
                       var_names[rare_gene_indices_of_module]))

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )

            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True)

            # A cell "expresses" the module if it has any non-zero value in its rare genes.
            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0

            expressed_cells_count = np.sum(mask_of_expressed_cells)

            # Drop modules expressed by too many or too few cells.
            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) +
                        " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) +
                        " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )

            # Per-gene total and maximum, restricted to the cells expressing the module.
            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module,
                what,
                sum=True)

            data = ut.get_vo_proper(
                adata_of_all_genes_of_expressed_cells_of_module,
                what,
                layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data,
                                                               per="column")

            # The background is all the cells which do not express the module.
            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes -
                total_expressed_cells_umis_of_all_genes)

            # Fraction of each gene out of the total of each of the two cell populations.
            # NOTE(review): these use the builtin ``sum`` rather than ``np.sum`` - presumably
            # equivalent for these per-gene vectors; confirm before changing.
            expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / sum(
                total_expressed_cells_umis_of_all_genes)

            background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / sum(
                total_background_cells_umis_of_all_genes)

            # Candidates are allowed genes which reach the minimal maximum in the expressing cells
            # and are enriched there by the required fold factor relative to the background.
            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (expressed_cells_fraction_of_all_genes >=
                   background_cells_fraction_of_all_genes *
                   (2**min_related_gene_fold_factor)))

            related_gene_indices = np.where(mask_of_related_genes)[0]
            # The module's own rare genes are expected to satisfy the candidate criteria.
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            # Baseline: how many cells reach the minimal module total using only the rare genes.
            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module)
            total_base_genes_of_all_cells = ut.get_o_numpy(
                base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names",
                    sorted(adata_of_all_genes_of_all_cells.
                           var_names[related_gene_indices]))
                # The logged value is the number of strong base cells (despite the label).
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            # Start from the rare genes and append each accepted candidate.
            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                # The module's own rare genes are already in the list.
                if gene_index in rare_gene_indices_of_module:
                    continue

                # Never take a gene which is a rare gene of another large-enough module.
                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"belongs to another module")
                    continue

                # This condition always holds here, since members of this module were skipped at
                # the top of the loop.
                if gene_index not in rare_gene_indices_of_module:
                    related_gene_of_all_cells_adata = ut.slice(
                        adata_of_all_genes_of_all_cells,
                        name=
                        f".{adata_of_all_genes_of_all_cells.var_names[gene_index]}",
                        vars=np.array([gene_index]),
                    )
                    assert related_gene_of_all_cells_adata.n_vars == 1
                    # Per-cell total of the rare genes plus this single candidate gene.
                    total_related_genes_of_all_cells = ut.get_o_numpy(
                        related_gene_of_all_cells_adata, what, sum=True)
                    total_related_genes_of_all_cells += total_base_genes_of_all_cells
                    mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total
                    count_of_strong_related_cells = np.sum(
                        mask_of_strong_related_cells)
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"strong cells: {count_of_strong_related_cells} "
                        f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                    )
                    # Reject a candidate which would inflate the number of strong cells too much.
                    if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                        continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(
                related_gene_indices_of_module)  #

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(
                related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(adata_of_all_genes_of_all_cells.
                       var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules
Example #8
0
def find_rare_gene_modules(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_gene_cell_fraction: float = pr.rare_max_gene_cell_fraction,
    min_gene_maximum: int = pr.rare_min_gene_maximum,
    genes_similarity_method: str = pr.rare_genes_similarity_method,
    genes_cluster_method: str = pr.rare_genes_cluster_method,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    min_genes_of_modules: int = pr.rare_min_genes_of_modules,
    min_cells_of_modules: int = pr.rare_min_cells_of_modules,
    target_pile_size: int = pr.min_target_pile_size,
    max_cells_factor_of_random_pile: float = pr.rare_max_cells_factor_of_random_pile,
    target_metacell_size: float = pr.target_metacell_size,
    min_modules_size_factor: float = pr.rare_min_modules_size_factor,
    min_module_correlation: float = pr.rare_min_module_correlation,
    min_related_gene_fold_factor: float = pr.rare_min_related_gene_fold_factor,
    max_related_gene_increase_factor: float = pr.rare_max_related_gene_increase_factor,
    min_cell_module_total: int = pr.rare_min_cell_module_total,
    reproducible: bool = pr.reproducible,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Detect rare gene modules based on ``what`` (default: {what}) data.

    A rare gene module is a group of genes which are weakly and rarely expressed, yet are highly
    correlated with each other, which allows for detecting them robustly. Global analysis
    algorithms (such as metacells) tend to ignore, or at least discount, such genes.

    It is therefore useful to explicitly identify, in a pre-processing step, the few cells which
    express such rare gene modules. Once identified, these cells can be exempt from the global
    algorithm, or the global algorithm can be tweaked in some way to pay extra attention to them.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute pearson correlations.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cells_rare_gene_module``
            The index of the rare gene module each cell expresses the most, or ``-1`` in the common
            case it does not express any rare genes module.

        ``rare_cell``
            A boolean mask for the (few) cells that express a rare gene module.

    Variable (Gene) Annotations
        ``rare_gene_module_<N>``
            A boolean mask for the genes in the gene module with index ``N``.

        ``rare_gene``
            A boolean mask for the genes in any of the rare gene modules.

    If ``inplace``, these are written to the data, and the function returns ``None``. Otherwise
    they are returned as a tuple containing two data frames.

    **Computation Parameters**

    1. Pick as candidates all genes that are expressed in at most ``max_gene_cell_fraction``
       (default: {max_gene_cell_fraction}) of the cells, and whose maximal value in a cell is at
       least ``min_gene_maximum`` (default: {min_gene_maximum}), as long as they do not match the
       ``forbidden_gene_names`` or the ``forbidden_gene_patterns``.

    2. Compute the similarity between the genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    3. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    4. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` genes (default: {min_genes_of_modules}), with an average gene-gene
       cross-correlation of at least ``min_module_correlation`` (default:
       {min_module_correlation}).

    5. Consider cells expressing any of the genes in the gene module. If the expected number of
       such cells in each random pile of size ``target_pile_size`` (default: {target_pile_size}),
       whose total number of UMIs of the rare gene module is at least ``min_cell_module_total``
       (default: {min_cell_module_total}), is more than the ``max_cells_factor_of_random_pile``
       (default: {max_cells_factor_of_random_pile}) as a fraction of the mean metacells size, then
       discard the rare gene module as not that rare after all.

    6. Add to the gene module all genes whose fraction in cells expressing any of the genes in the
       rare gene module is at least 2^``min_related_gene_fold_factor`` (default:
       {min_related_gene_fold_factor}) times their fraction in the rest of the population, as long
       as their maximal value in one of the expressing cells is at least ``min_gene_maximum``,
       as long as this doesn't add more than ``max_related_gene_increase_factor`` times the original
       number of cells to the rare gene module, and as long as they do not match the
       ``forbidden_gene_names`` or the ``forbidden_gene_patterns``. If a gene is above the threshold
       for multiple gene modules, associate it with the gene module for which its fold factor is
       higher.

    7. Associate cells with the rare gene module if they contain at least ``min_cell_module_total``
       (default: {min_cell_module_total}) UMIs of the expanded rare gene module. If a cell meets the
       above threshold for several rare gene modules, it is associated with the one for which it
       contains more UMIs.

    8. Discard modules which have less than ``min_cells_of_modules`` (default:
       {min_cells_of_modules}) cells or whose total UMIs are less than the ``target_metacell_size``
       (default: {target_metacell_size}) times the ``min_modules_size_factor`` (default:
       {min_modules_size_factor}).
    """
    assert min_cells_of_modules > 0
    assert min_genes_of_modules > 0

    # Express the target metacell size (in UMIs) as a mean number of cells, and scale it to get
    # the maximal number of module cells tolerated in a random pile.
    total_umis_of_genes = ut.get_v_numpy(adata, what, sum=True)
    mean_cell_umis = np.sum(total_umis_of_genes) / adata.n_obs
    mean_metacells_size = target_metacell_size / mean_cell_umis
    ut.log_calc("mean_metacells_size", mean_metacells_size)
    max_cells_of_random_pile = mean_metacells_size * max_cells_factor_of_random_pile
    ut.log_calc("max_cells_of_random_pile", max_cells_of_random_pile)

    # Every gene not explicitly forbidden (by name or pattern) is allowed.
    named_forbidden_genes = find_named_genes(
        adata,
        names=forbidden_gene_names,
        patterns=forbidden_gene_patterns,
    )
    assert named_forbidden_genes is not None
    allowed_genes_mask = ~named_forbidden_genes.values
    ut.log_calc("allowed_genes_mask", allowed_genes_mask)

    # Initially no cell is associated with any module.
    rare_module_of_cells = np.full(adata.n_obs, -1, dtype="int32")

    def package(gene_indices_of_modules: List[List[int]]) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
        # Store or return the per-cell and per-gene results for the given final modules.
        return _results(
            adata=adata,
            rare_module_of_cells=rare_module_of_cells,
            list_of_rare_gene_indices_of_modules=gene_indices_of_modules,
            inplace=inplace,
        )

    picked = _pick_candidates(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        max_gene_cell_fraction=max_gene_cell_fraction,
        min_gene_maximum=min_gene_maximum,
        min_genes_of_modules=min_genes_of_modules,
        allowed_genes_mask=allowed_genes_mask,
    )
    if picked is None:
        # Without candidates there are no modules to report.
        return package([])
    candidate_data, candidate_genes_indices = picked

    candidates_similarity = _genes_similarity(
        candidate_data=candidate_data,
        what=what,
        method=genes_similarity_method,
        reproducible=reproducible,
    )

    candidates_linkage = _cluster_genes(
        similarities_between_candidate_genes=candidates_similarity,
        genes_cluster_method=genes_cluster_method,
    )

    initial_gene_indices_of_modules = _identify_genes(
        candidate_genes_indices=candidate_genes_indices,
        similarities_between_candidate_genes=candidates_similarity,
        linkage=candidates_linkage,
        min_module_correlation=min_module_correlation,
    )

    # Scale the per-pile limit to the whole data set.
    max_cells_of_modules = int(max_cells_of_random_pile * adata.n_obs / target_pile_size)
    ut.log_calc("max_cells_of_modules", max_cells_of_modules)

    expanded_gene_indices_of_modules = _related_genes(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        rare_gene_indices_of_modules=initial_gene_indices_of_modules,
        allowed_genes_mask=allowed_genes_mask,
        min_genes_of_modules=min_genes_of_modules,
        min_gene_maximum=min_gene_maximum,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        min_related_gene_fold_factor=min_related_gene_fold_factor,
        max_related_gene_increase_factor=max_related_gene_increase_factor,
    )

    # The two helpers below receive ``rare_module_of_cells``; their results reach the caller
    # through it and through the compressed list of modules.
    _identify_cells(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        related_gene_indices_of_modules=expanded_gene_indices_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        rare_module_of_cells=rare_module_of_cells,
    )

    final_gene_indices_of_modules = _compress_modules(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        target_metacell_size=target_metacell_size,
        min_modules_size_factor=min_modules_size_factor,
        related_gene_indices_of_modules=expanded_gene_indices_of_modules,
        rare_module_of_cells=rare_module_of_cells,
    )

    return package(final_gene_indices_of_modules)
Example #9
0
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine several pre-computed masks into a single overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask as a pandas series. Otherwise,
    stores the mask as an annotation (per-variable or per-observation, depending on the type of
    the combined masks) and returns ``None``.

    **Computation Parameters**

    1. Fetch each of the masks named in ``masks``. A ``?`` suffix means a missing mask is silently
       ignored. A ``~`` prefix means the mask is inverted. A ``|`` prefix (before the ``~`` prefix,
       if any) means the mask is bitwise-OR-ed into the OR mask; otherwise (or if it has a ``&``
       prefix) it is bitwise-AND-ed into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None
    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for decorated_name in masks:
        data_name = decorated_name

        # Optional leading operator: "|" selects the OR mask, "&" (or nothing) the AND mask.
        is_or = data_name[0] == "|"
        if is_or or data_name[0] == "&":
            data_name = data_name[1:]

        # Optional "~" prefix requests inverting this specific mask.
        invert_mask = data_name[0] == "~"
        if invert_mask:
            data_name = data_name[1:]

        # Optional "?" suffix tolerates the data being missing.
        must_exist = data_name[-1] != "?"
        if not must_exist:
            data_name = data_name[:-1]

        # Look for the data per-observation first, then per-variable.
        if data_name in adata.obs:
            mask_per = "o"
            values = ut.get_o_numpy(adata, data_name, formatter=ut.mask_description)
        elif data_name in adata.var:
            mask_per = "v"
            values = ut.get_v_numpy(adata, data_name, formatter=ut.mask_description)
        elif must_exist:
            raise KeyError(f"unknown mask data: {data_name}")
        else:
            continue

        # Any strictly positive value counts as being included in the mask.
        mask = values > 0

        if mask.dtype != "bool":
            raise ValueError(f"the data: {data_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(decorated_name, mask)

        # All the masks must be of the same kind as the first one that was found.
        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    # Merge whichever of the two accumulated masks actually exist.
    if and_mask is None:
        if or_mask is None:
            raise ValueError("no masks to combine")
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is not None:
        if per == "o":
            ut.set_o_data(adata, to, combined_mask)
        else:
            ut.set_v_data(adata, to, combined_mask)
        return None

    ut.log_return("combined", combined_mask)
    if per == "o":
        return ut.to_pandas_series(combined_mask, index=adata.obs_names)
    assert per == "v"
    return ut.to_pandas_series(combined_mask, index=adata.var_names)
Example #10
0
def collect_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    name: str = "metacells",
    top_level: bool = True,
) -> AnnData:
    """
    Collect computed metacells ``what`` (default: {what}) data.

    **Input**

    Annotated (presumably "clean") ``adata``, where the observations are cells and the variables are
    genes, and where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Annotated metacell data, where each observation is a metacell holding the data of the cells
    grouped by their ``metacell`` annotation. If ``top_level`` is set, the result is marked as top
    level data. The following annotations are passed to the result, if they exist in ``adata``:

    Variable (Gene) Annotations
        ``excluded_gene``, ``clean_gene``, ``forbidden_gene``, ``pre_feature_gene``, ``feature_gene``
            Copied as-is from the per-gene annotations of ``adata``.

    Observations (Metacell) Annotations
        ``pile``, ``candidate``
            Grouped from the per-cell annotations of ``adata`` using the ``unique`` method.

    **Computation Parameters**

    1. Compute the cell's scale factors by invoking :py:func:`compute_effective_cell_sizes` using the
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}).

    2. Scale the cell's data using these factors, if needed.

    3. Invoke :py:func:`metacells.tools.group.group_obs_data` to group the cells into metacells.

    4. Pass all relevant per-gene and per-cell annotations to the result.
    """
    # Only the scale factors (the third result) are needed here.
    _sizes, _size_limit, scale_of_cells = compute_effective_cell_sizes(
        adata,
        cell_sizes=cell_sizes,
        max_cell_size=max_cell_size,
        max_cell_size_factor=max_cell_size_factor,
    )

    # Group the raw data as-is unless there are factors to scale each cell (row) by.
    if scale_of_cells is None:
        data_to_group = what
    else:
        row_major_data = ut.get_vo_proper(adata, what, layout="row_major")
        data_to_group = ut.scale_by(row_major_data, scale_of_cells, by="row")

    mdata = tl.group_obs_data(adata, data_to_group, groups="metacell", name=name)
    assert mdata is not None
    if top_level:
        ut.top_level(mdata)

    # The genes are the same in both data sets, so per-gene annotations are copied directly.
    per_gene_annotations = (
        "excluded_gene",
        "clean_gene",
        "forbidden_gene",
        "pre_feature_gene",
        "feature_gene",
    )
    for gene_annotation in per_gene_annotations:
        if ut.has_data(adata, gene_annotation):
            ut.set_v_data(
                mdata,
                gene_annotation,
                ut.get_v_numpy(adata, gene_annotation, formatter=ut.mask_description),
                formatter=ut.mask_description,
            )

    # Per-cell annotations are grouped into a per-metacell value.
    for cell_annotation in ("pile", "candidate"):
        if not ut.has_data(adata, cell_annotation):
            continue
        tl.group_obs_annotation(
            adata,
            mdata,
            groups="metacell",
            formatter=ut.groups_description,
            name=cell_annotation,
            method="unique",
        )

    return mdata
Example #11
0
def compute_knn_by_features(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_top_feature_genes: int = pr.max_top_feature_genes,
    similarity_value_normalization: float = pr.
    umap_similarity_value_normalization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    k: int,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    reproducible: bool = pr.reproducible,
) -> ut.PandasFrame:
    """
    Compute KNN graph between metacells based on feature genes.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute pearson correlations.

    **Input**

    Annotated ``adata`` where each observation is a metacell and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Sets the following in ``adata``:

    Observations-Pair (Metacells) Annotations
        ``obs_outgoing_weights``
            A sparse square matrix where each non-zero entry is the weight of an edge between a pair
            of cells or genes, where the sum of the weights of the outgoing edges for each element
            is 1 (there is always at least one such edge).

    Also return a pandas data frame of the similarities between the observations (metacells).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.tools.high.find_top_feature_genes` using ``max_top_feature_genes``
       (default: {max_top_feature_genes}) to pick the feature genes to use to compute similarities
       between the metacells.

    2. Compute the fraction of each gene in each metacell (out of all the genes), keep only the
       top feature genes, and add the ``similarity_value_normalization`` (default:
       {similarity_value_normalization}) to these fractions.

    3. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using
       ``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default:
       {logistics_location}) and ``logistics_slope`` (default: {logistics_slope}).

    5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the
       similarities, ``k`` (no default!), ``balanced_ranks_factor`` (default:
       {balanced_ranks_factor}), ``incoming_degree_factor`` (default: {incoming_degree_factor}),
       ``outgoing_degree_factor`` (default: {outgoing_degree_factor}) to compute a "skeleton" graph
       to overlay on top of the UMAP graph.
    """
    # Sets the per-gene ``top_feature_gene`` mask in ``adata``, which is read back below.
    tl.find_top_feature_genes(adata, max_genes=max_top_feature_genes)

    # The fractions are computed out of all the genes, before restricting to the feature genes.
    all_data = ut.get_vo_proper(adata, what, layout="row_major")
    all_fractions = ut.fraction_by(all_data, by="row")

    top_feature_genes_mask = ut.get_v_numpy(adata, "top_feature_gene")

    top_feature_genes_fractions = all_fractions[:, top_feature_genes_mask]
    top_feature_genes_fractions = ut.to_layout(top_feature_genes_fractions,
                                               layout="row_major")
    top_feature_genes_fractions = ut.to_numpy_matrix(
        top_feature_genes_fractions)

    # Applied before the optional log.
    top_feature_genes_fractions += similarity_value_normalization

    if similarity_log_data:
        top_feature_genes_fractions = ut.log_data(top_feature_genes_fractions,
                                                  base=2)

    # The similarity is computed on data sliced to the same genes as the fractions matrix.
    tdata = ut.slice(adata, vars=top_feature_genes_mask)
    similarities = tl.compute_obs_obs_similarity(
        tdata,
        top_feature_genes_fractions,
        method=similarity_method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        inplace=False,
    )
    assert similarities is not None

    # The graph is stored in the full ``adata`` (not in the sliced data).
    tl.compute_obs_obs_knn_graph(
        adata,
        similarities,
        k=k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
    )

    return similarities
Example #12
0
def filter_data(  # pylint: disable=dangerous-default-value
    adata: AnnData,
    obs_masks: List[str] = [],
    var_masks: List[str] = [],
    *,
    mask_obs: Optional[str] = None,
    mask_var: Optional[str] = None,
    invert_obs: bool = False,
    invert_var: bool = False,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    name: Optional[str] = None,
    top_level: bool = True,
) -> Optional[Tuple[AnnData, ut.PandasSeries, ut.PandasSeries]]:
    """
    Filter (slice) the data based on previously-computed masks.

    For example, it is useful to discard cell-cycle genes, cells which have too few UMIs for
    meaningful analysis, etc. In general, the "best" filter depends on the data set.

    This function makes it easy to combine different pre-computed per-observation (cell) and
    per-variable (gene) boolean mask annotations into a final overall inclusion mask, and slice the
    data accordingly, while tracking the base index of the cells and genes in the filtered data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    A tuple of three values:

    * An annotated data containing a subset of the observations (cells) and variables (genes).

    * A pandas series (indexed by the observation names of the full ``adata``) holding the boolean
      mask of the selected observations.

    * A pandas series (indexed by the variable names of the full ``adata``) holding the boolean
      mask of the selected variables.

    If no observations and/or no variables were selected by the filter, returns ``None``.

    If ``name`` is not specified, the returned data will be unnamed. Otherwise, if the name starts
    with a ``.``, it will be appended to the current name (if any). Otherwise, ``name`` is the new
    name.

    If ``mask_obs`` and/or ``mask_var`` are specified, store the mask of the selected data as a
    per-observation and/or per-variable annotation of the full ``adata``.

    If ``track_obs`` and/or ``track_var`` are specified, store the original indices of the selected
    data as a per-observation and/or per-variable annotation of the result data.

    **Computation Parameters**

    1. Combine the masks in ``obs_masks`` and/or ``var_masks`` using
       :py:func:`metacells.tools.mask.combine_masks` passing it ``invert_obs`` and ``invert_var``,
       and ``mask_obs`` and ``mask_var`` as the ``to`` parameter. If either list of masks is empty,
       use the full mask.

    2. If the obtained masks for either the observations or variables is empty, return ``None``.
       Otherwise, return a slice of the full data containing just the observations and variables
       specified by the final masks.
    """
    # The default (empty) lists are only inspected, never mutated or passed on, so sharing them
    # between calls is harmless.
    if len(obs_masks) == 0:
        # No observation masks were requested: keep every observation.
        obs_mask = np.full(adata.n_obs, True, dtype="bool")
        if mask_obs is not None:
            ut.set_o_data(adata, mask_obs, obs_mask)
    else:
        mask = combine_masks(adata, obs_masks, invert=invert_obs, to=mask_obs)
        if mask is None:
            # ``combine_masks`` returned nothing, so the combined mask is expected to have been
            # stored under ``mask_obs``; read it back from there.
            assert mask_obs is not None
            obs_mask = ut.get_o_numpy(
                adata, mask_obs, formatter=ut.mask_description) > 0
        else:
            obs_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    if len(var_masks) == 0:
        # No variable masks were requested: keep every variable.
        var_mask = np.full(adata.n_vars, True, dtype="bool")
        if mask_var is not None:
            # This is per-variable data, so it must be stored with the per-variable setter
            # (storing it as per-observation data would mismatch both the size and the axis).
            ut.set_v_data(adata, mask_var, var_mask)
    else:
        mask = combine_masks(adata, var_masks, invert=invert_var, to=mask_var)
        if mask is None:
            # Same as above, for the per-variable mask stored under ``mask_var``.
            assert mask_var is not None
            var_mask = ut.get_v_numpy(
                adata, mask_var, formatter=ut.mask_description) > 0
        else:
            var_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    # An empty selection along either axis leaves nothing meaningful to slice.
    if not np.any(obs_mask) or not np.any(var_mask):
        return None

    fdata = ut.slice(adata,
                     name=name,
                     top_level=top_level,
                     obs=obs_mask,
                     vars=var_mask,
                     track_obs=track_obs,
                     track_var=track_var)

    return (
        fdata,
        ut.to_pandas_series(obs_mask, index=adata.obs_names),
        ut.to_pandas_series(var_mask, index=adata.var_names),
    )
Example #13
0
def _apply_annotations(  # pylint: disable=too-many-branches
    adata: AnnData,
    sdata: AnnData,
    per: str,
    annotations: Dict[str, DefaultValues],
    indices: Union[str, ut.Vector],
) -> None:
    """
    Copy per-observation or per-variable annotations from the slice ``sdata`` into ``adata``.

    ``per`` selects the axis: ``"o"`` for observations, ``"v"`` for variables. The ``indices``
    data is fetched from ``sdata`` along that axis and used to index the full-size values, so each
    slice entry is written at its position in the full data.

    For each name in ``annotations``:

    * The slice value is taken from ``sdata`` if it exists there; otherwise the ``slice`` default
      is used. A ``Skip`` default skips the annotation and a ``Raise`` default raises ``KeyError``.

    * The full value is taken from ``adata`` (and unfrozen) if it exists there; otherwise a new
      vector is created using the ``full`` default, with the same ``Skip`` / ``Raise`` handling. A
      ``None`` default creates a ``float32`` vector (presumably holding NaN - TODO confirm).

    * The slice value is assigned into the full value at the fetched indices, and the result is
      stored back into ``adata``.
    """
    full_name = ut.get_name(adata)
    slice_name = ut.get_name(sdata)

    assert per in ("o", "v")

    # Pick the per-axis frames, sizes, index vector and setter once, up front.
    if per == "o":
        full_frame, full_size = adata.obs, adata.n_obs
        slice_frame, slice_size = sdata.obs, sdata.n_obs
        full_indices = ut.get_o_numpy(sdata, indices)
        store_full = ut.set_o_data
    else:
        full_frame, full_size = adata.var, adata.n_vars
        slice_frame, slice_size = sdata.var, sdata.n_vars
        full_indices = ut.get_v_numpy(sdata, indices)
        store_full = ut.set_v_data

    for name, default_values in annotations.items():
        # A formatter is only used when the slice value comes from the default.
        formatter: Optional[Callable[[Any], str]] = None

        slice_value = slice_frame.get(name)
        if slice_value is None:
            slice_default = default_values.slice

            if slice_default == Skip or isinstance(slice_default, Skip):
                continue

            if slice_default == Raise or isinstance(slice_default, Raise):
                if slice_name is None:
                    raise KeyError(f"unknown slice data name: {name}")
                raise KeyError(
                    f"unknown slice data: {slice_name} name: {name}")

            slice_value = slice_default

            def formatter(_: Any) -> str:
                # pylint: disable=cell-var-from-loop
                return f"{slice_size} <- {slice_value}"

            # pylint: enable=cell-var-from-loop

        full_value = full_frame.get(name)
        if full_value is None:
            full_default = default_values.full

            if full_default == Skip or isinstance(full_default, Skip):
                continue

            if full_default == Raise or isinstance(full_default, Raise):
                if full_name is None:
                    raise KeyError(f"unknown full data name: {name}")
                raise KeyError(f"unknown full data: {full_name} name: {name}")

            if full_default is None:
                full_value = np.full(full_size, None, dtype="float32")
            else:
                full_value = np.full(full_size, full_default)
        else:
            # Existing data may be read-only; make it writable before patching it.
            ut.unfreeze(full_value)

        full_value[full_indices] = slice_value
        store_full(adata, name, full_value, formatter=formatter)