Example #1
def test_sum_groups() -> None:
    expected_sums = np.array([[5, 7, 2], [10, 8, 13]])
    expected_sizes = np.array([2, 2])
    groups = np.array([0, 1, 0, 1])

    dense_rows = np.array([[0, 1, 2], [3, 0, 4], [5, 6, 0], [7, 8, 9]], dtype="float")

    results = ut.sum_groups(dense_rows, groups, per="row")
    assert results is not None
    assert np.allclose(ut.to_numpy_matrix(results[0]), expected_sums)
    assert np.allclose(results[1], expected_sizes)

    results = ut.sum_groups(dense_rows.transpose(), groups, per="column")
    assert results is not None
    assert np.allclose(ut.to_numpy_matrix(results[0]), expected_sums.transpose())
    assert np.allclose(results[1], expected_sizes)

    sparse_rows = sparse.csr_matrix(dense_rows)

    results = ut.sum_groups(sparse_rows, groups, per="row")
    assert results is not None
    assert np.allclose(ut.to_numpy_matrix(results[0]), expected_sums)
    assert np.allclose(results[1], expected_sizes)

    results = ut.sum_groups(sparse_rows.transpose(), groups, per="column")
    assert results is not None
    assert np.allclose(ut.to_numpy_matrix(results[0]), expected_sums.transpose())
    assert np.allclose(results[1], expected_sizes)
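Note: the grouped sums this test expects can be reproduced with plain numpy. A minimal sketch (the helper name ``sum_groups_rows`` is ours, not part of the ``metacells`` API), assuming groups are numbered consecutively from zero:

import numpy as np

def sum_groups_rows(matrix: np.ndarray, groups: np.ndarray):
    # Scatter-add each row into its group's sum, and count each group's size.
    n_groups = groups.max() + 1
    sums = np.zeros((n_groups, matrix.shape[1]), dtype=matrix.dtype)
    np.add.at(sums, groups, matrix)
    sizes = np.bincount(groups, minlength=n_groups)
    return sums, sizes

matrix = np.array([[0, 1, 2], [3, 0, 4], [5, 6, 0], [7, 8, 9]], dtype="float")
sums, sizes = sum_groups_rows(matrix, np.array([0, 1, 0, 1]))
assert np.allclose(sums, [[5, 7, 2], [10, 8, 13]])
assert np.allclose(sizes, [2, 2])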
Example #2
def test_pairs_corrcoef() -> None:
    np.random.seed(123456)
    first_matrix = ut.to_numpy_matrix(sparse.rand(100, 100, density=0.1, format="csr"))
    second_matrix = ut.to_numpy_matrix(sparse.rand(100, 100, density=0.1, format="csr"))

    first_dense = ut.to_layout(first_matrix, layout="row_major")
    second_dense = ut.to_layout(second_matrix, layout="row_major")
    fast_results = ut.pairs_corrcoef_rows(first_dense, second_dense, reproducible=True)

    slow_results = np.zeros(100, dtype="float")
    for row in range(100):
        slow_results[row] = np.corrcoef(first_dense[row], second_dense[row])[0, 1]
    slow_results[np.isnan(slow_results)] = 0

    assert fast_results.shape == slow_results.shape
    assert np.allclose(fast_results, slow_results, atol=1e-6)
Example #3
def _compute_metacell_inner_folds(
    *,
    metacell_index: int,
    cells_data: ut.Matrix,
    metacells_data: ut.Matrix,
    group_of_cells: ut.NumpyVector,
    total_umis_per_cell: ut.NumpyVector,
    total_umis_per_metacell: ut.NumpyVector,
) -> ut.NumpyVector:
    grouped_cells_mask = group_of_cells == metacell_index
    assert np.sum(grouped_cells_mask) > 0
    grouped_cells_data = ut.to_numpy_matrix(cells_data[grouped_cells_mask, :])
    total_umis_per_grouped_cell = total_umis_per_cell[grouped_cells_mask]
    metacell_data = metacells_data[metacell_index, :]
    total_umis_of_metacell = total_umis_per_metacell[metacell_index]
    expected_scale_per_grouped_cell = total_umis_per_grouped_cell / total_umis_of_metacell
    expected_cells_data = expected_scale_per_grouped_cell[:, np.newaxis] * metacell_data[np.newaxis, :]
    assert expected_cells_data.shape == grouped_cells_data.shape
    fold_factors_per_grouped_cell = np.log2(
        (expected_cells_data + 1) / (grouped_cells_data + 1))
    fold_factors = ut.max_per(ut.to_layout(fold_factors_per_grouped_cell,
                                           "column_major"),
                              per="column")
    max_fold_factor = np.max(fold_factors)
    ut.log_calc("max_fold_factor", max_fold_factor)
    return fold_factors
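Note: the heart of this computation is the fold formula itself: scale the pooled metacell profile down to each cell's share of the total UMIs, then take the log2 of the regularized ratio. A toy numpy sketch of just that arithmetic (illustrative values, not the ``metacells`` utilities):

import numpy as np

cells = np.array([[4.0, 0.0, 6.0], [2.0, 3.0, 5.0]])  # observed UMIs per cell
metacell = cells.sum(axis=0)                          # pooled profile: [6, 3, 11]
cell_umis = cells.sum(axis=1)                         # [10, 10]
metacell_umis = metacell.sum()                        # 20

# Each cell is "expected" to be a scaled-down copy of its metacell:
expected = (cell_umis / metacell_umis)[:, np.newaxis] * metacell[np.newaxis, :]
folds = np.log2((expected + 1) / (cells + 1))         # +1 regularizes zero counts
per_gene_max = folds.max(axis=0)                      # max fold per gene (column)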
Example #4
def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    The ``weights`` of the projection, where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas metacell for projecting the query metacell, such that the sum of weights in each
    row is one.

    **Returns**

    Sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``projected``
            The number of UMIs of each gene in the projected image of each query metacell onto the atlas, scaled so
            that the total number of UMIs in the projection equals the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas and the query based on the total UMIs, unless ``atlas_total_umis``
       and/or ``query_total_umis`` are specified.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMI counts based on the total UMIs of each query metacell. Note that if the total atlas
       or query UMIs were overridden, the result need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
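Note: a toy numpy sketch of the three steps above (made-up counts; ``weights`` rows sum to one as the docstring requires). Because the projected fractions still sum to one per row, rescaling them by the query totals preserves those totals:

import numpy as np

atlas_umis = np.array([[8.0, 2.0], [1.0, 9.0]])  # 2 atlas metacells x 2 genes
weights = np.array([[0.25, 0.75]])               # 1 query metacell; rows sum to 1
query_total_umis = np.array([100.0])

atlas_fractions = atlas_umis / atlas_umis.sum(axis=1)[:, np.newaxis]
projected_fractions = weights @ atlas_fractions  # still sums to 1 per row
projected_umis = projected_fractions * query_total_umis[:, np.newaxis]
assert np.isclose(projected_umis.sum(), 100.0)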
Example #5
def _test_per(rows_matrix: ut.Matrix) -> None:
    columns_matrix = ut.to_layout(rows_matrix, layout="column_major")

    assert np.allclose(ut.nnz_per(rows_matrix, per="row"), np.array([2, 3]))
    assert np.allclose(ut.nnz_per(columns_matrix, per="column"), np.array([1, 2, 2]))

    assert np.allclose(ut.sum_per(rows_matrix, per="row"), np.array([3, 12]))
    assert np.allclose(ut.sum_per(columns_matrix, per="column"), np.array([3, 5, 7]))

    assert np.allclose(ut.max_per(rows_matrix, per="row"), np.array([2, 5]))
    assert np.allclose(ut.max_per(columns_matrix, per="column"), np.array([3, 4, 5]))

    assert np.allclose(ut.min_per(rows_matrix, per="row"), np.array([0, 3]))
    assert np.allclose(ut.min_per(columns_matrix, per="column"), np.array([0, 1, 2]))

    assert np.allclose(ut.sum_squared_per(rows_matrix, per="row"), np.array([5, 50]))
    assert np.allclose(ut.sum_squared_per(columns_matrix, per="column"), np.array([9, 17, 29]))

    assert np.allclose(ut.fraction_per(rows_matrix, per="row"), np.array([3 / 15, 12 / 15]))
    assert np.allclose(ut.fraction_per(columns_matrix, per="column"), np.array([3 / 15, 5 / 15, 7 / 15]))

    assert np.allclose(ut.mean_per(rows_matrix, per="row"), np.array([3 / 3, 12 / 3]))
    assert np.allclose(ut.mean_per(columns_matrix, per="column"), np.array([3 / 2, 5 / 2, 7 / 2]))

    assert np.allclose(
        ut.variance_per(rows_matrix, per="row"), np.array([5 / 3 - (3 / 3) ** 2, 50 / 3 - (12 / 3) ** 2])
    )

    assert np.allclose(
        ut.variance_per(columns_matrix, per="column"),
        np.array([9 / 2 - (3 / 2) ** 2, 17 / 2 - (5 / 2) ** 2, 29 / 2 - (7 / 2) ** 2]),
    )

    assert np.allclose(
        ut.normalized_variance_per(columns_matrix, per="column"),
        np.array(
            [(9 / 2 - (3 / 2) ** 2) / (3 / 2), (17 / 2 - (5 / 2) ** 2) / (5 / 2), (29 / 2 - (7 / 2) ** 2) / (7 / 2)]
        ),
    )

    dense = ut.to_numpy_matrix(ut.fraction_by(rows_matrix, by="row"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 3, 2 / 3], [3 / 12, 4 / 12, 5 / 12]]))

    dense = ut.to_numpy_matrix(ut.fraction_by(columns_matrix, by="column"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 5, 2 / 7], [3 / 3, 4 / 5, 5 / 7]]))
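Note: the expected values imply the test matrix is ``[[0, 1, 2], [3, 4, 5]]``. On dense data each ``ut.*_per`` reduction corresponds to a plain numpy axis reduction, which makes the expectations easy to cross-check (a sketch, not the ``metacells`` implementation):

import numpy as np

rows_matrix = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])

assert np.allclose(rows_matrix.sum(axis=1), [3, 12])         # ut.sum_per(..., per="row")
assert np.allclose(rows_matrix.sum(axis=0), [3, 5, 7])       # ut.sum_per(..., per="column")
assert np.allclose((rows_matrix ** 2).sum(axis=1), [5, 50])  # ut.sum_squared_per(..., per="row")
# The variance here is the biased (population) variance, E[x^2] - E[x]^2:
assert np.allclose(rows_matrix.var(axis=1), [5 / 3 - 1, 50 / 3 - 16])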
Example #6
def find_biased_genes(
    adata: AnnData,
    *,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    min_metacells_fraction: float = pr.biased_min_metacells_fraction,
    abs_folds: bool = pr.project_abs_folds,
    to_property_name: str = "biased_gene",
) -> None:
    """
    Find genes that have a strong bias in the query compared to the atlas.

    **Input**

    Annotated query ``adata`` where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    This should contain a ``projected_fold`` per-variable-per-observation matrix with the fold factor between each query
    metacell and its projected image on the atlas.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``biased_gene`` (or ``to_property_name``):
            A boolean mask indicating whether the gene has a strong bias in the query compared to the atlas.

    **Computation Parameters**

    1. Count for each gene the number of query metacells for which the ``projected_fold`` is above
       ``max_projection_fold_factor`` (default: {max_projection_fold_factor}). If ``abs_folds`` (default: {abs_folds}),
       consider the absolute fold factor.

    2. Mark the gene as biased if this count is at least a ``min_metacells_fraction`` (default:
       {min_metacells_fraction}) of the metacells.
    """
    assert max_projection_fold_factor >= 0
    assert 0 <= min_metacells_fraction <= 1

    projected_fold = ut.get_vo_proper(adata, "projected_fold", layout="column_major")
    if abs_folds:
        projected_fold = np.abs(projected_fold)  # type: ignore

    high_projection_folds = ut.to_numpy_matrix(projected_fold > max_projection_fold_factor)  # type: ignore
    ut.log_calc("high_projection_folds", high_projection_folds)

    count_of_genes = ut.sum_per(high_projection_folds, per="column")
    min_count = adata.n_obs * min_metacells_fraction
    mask_of_genes = count_of_genes >= min_count

    ut.set_v_data(adata, to_property_name, mask_of_genes)
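Note: a toy numpy sketch of the thresholding above (illustrative fold values): count, per gene (column), the metacells whose absolute fold exceeds the maximum, then require that count to reach a minimal fraction of all metacells.

import numpy as np

projected_fold = np.array([[0.5, 3.2], [0.1, 3.5], [2.9, 0.2], [0.0, 3.1]])
max_projection_fold_factor = 3.0
min_metacells_fraction = 0.5

high = np.abs(projected_fold) > max_projection_fold_factor
count_of_genes = high.sum(axis=0)  # [0, 3]
mask_of_genes = count_of_genes >= high.shape[0] * min_metacells_fraction
assert list(mask_of_genes) == [False, True]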
Example #7
def _genes_similarity(
    *,
    candidate_data: AnnData,
    what: Union[str, ut.Matrix],
    method: str,
    reproducible: bool,
) -> ut.NumpyMatrix:
    similarity = compute_var_var_similarity(candidate_data,
                                            what,
                                            method=method,
                                            reproducible=reproducible,
                                            inplace=False)
    assert similarity is not None
    return ut.to_numpy_matrix(similarity, only_extract=True)
Example #8
def find_systematic_genes(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    low_gene_quantile: float = pr.systematic_low_gene_quantile,
    high_gene_quantile: float = pr.systematic_high_gene_quantile,
    to_property_name: str = "systematic_gene",
) -> None:
    """
    Find genes that are systematically higher or lower in the query than in the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    **Returns**

    Sets the following annotations in ``qdata``:

    Variable (Gene) Annotations
        ``systematic_gene`` (or ``to_property_name``)
            A boolean mask indicating whether the gene is systematically higher or lower in the query compared to the
            atlas.

    **Computation Parameters**

    1. Compute the fraction of each gene out of the total UMIs in both the atlas and the query. If ``atlas_total_umis``
       and/or ``query_total_umis`` are given, use them as the basis instead of the sum of the UMIs.

    2. Compute for each gene its ``low_gene_quantile`` (default: {low_gene_quantile}) and its ``high_gene_quantile``
       (default: {high_gene_quantile}) fractions, in both the query and the atlas.

    3. Mark as systematic the genes for which the low quantile value in the query is above the atlas high quantile
       value.

    4. Mark as systematic the genes for which the low quantile value in the atlas is at least the query high quantile
       value.
    """
    assert 0 <= low_gene_quantile <= 1
    assert 0 <= high_gene_quantile <= 1
    assert np.all(adata.var_names == qdata.var_names)

    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")
    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    query_fractions = ut.to_layout(query_fractions, layout="column_major")
    atlas_fractions = ut.to_layout(atlas_fractions, layout="column_major")

    query_low_gene_values = ut.quantile_per(query_fractions, low_gene_quantile, per="column")
    atlas_low_gene_values = ut.quantile_per(atlas_fractions, low_gene_quantile, per="column")

    query_high_gene_values = ut.quantile_per(query_fractions, high_gene_quantile, per="column")
    atlas_high_gene_values = ut.quantile_per(atlas_fractions, high_gene_quantile, per="column")

    query_above_atlas = query_low_gene_values > atlas_high_gene_values
    atlas_above_query = atlas_low_gene_values >= query_high_gene_values

    systematic = query_above_atlas | atlas_above_query

    ut.set_v_data(qdata, to_property_name, systematic)
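Note: a toy sketch of the quantile comparison using ``np.quantile`` directly (made-up fractions; ``ut.quantile_per`` may use a different interpolation, so this only illustrates the logic):

import numpy as np

query_fractions = np.array([[0.30], [0.35], [0.40]])  # 3 query metacells, 1 gene
atlas_fractions = np.array([[0.05], [0.10], [0.15]])  # 3 atlas metacells, 1 gene

query_low = np.quantile(query_fractions, 0.05, axis=0)
atlas_high = np.quantile(atlas_fractions, 0.95, axis=0)
# The gene is "systematic" when even its low query quantile clears the high
# atlas quantile (or vice versa):
assert bool(query_low > atlas_high)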
Example #9
def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes.

    This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressively filtered out
    a significant number of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional
    variable (gene) called ``ATLASNORM``, such that the total number of UMIs for each query metacell is as expected
    given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the query and the
    atlas have exactly the same list of genes, or if the query already contains a high number of genes missing from the
    atlas so that the total number of UMIs for the query metacells is already at least the expected value based on the
    common genes.

    **Computation Parameters**

    1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs)
       would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene)
       annotation, add the value specified in ``var_annotations``, whose list of keys must cover the set of
       per-variable annotations in the query data. For each per-observation-per-variable layer, add the value specified
       in ``layers``, whose list of keys must cover the existing layers. For each per-variable-per-variable annotation,
       add the value specified in ``varp_annotations``.
    """
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations.keys():
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers.keys():
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations.keys():
            raise RuntimeError(f"missing default value for variable-variable {name}")

    if list(qdata.var_names) == list(adata.var_names):
        return None

    query_genes_list = list(qdata.var_names)
    atlas_genes_list = list(adata.var_names)
    common_genes_list = list(sorted(set(qdata.var_names) & set(adata.var_names)))
    query_gene_indices = np.array([query_genes_list.index(gene) for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_genes_list.index(gene) for gene in common_genes_list])
    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")

    assert list(common_qdata.var_names) == list(common_adata.var_names)

    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)

    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)

    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vo_data(ndata, name, added)

    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)
        values = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, values[:, np.newaxis]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vv_data(ndata, name, added)

    return ndata
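Note: a toy check of the renormalization arithmetic in step 1 (made-up totals): the pseudo-gene's UMIs raise the query's total/common UMIs ratio to match the atlas's.

import numpy as np

atlas_total, atlas_common = 130.0, 100.0  # the atlas keeps 30% extra UMIs
query_total, query_common = 105.0, 100.0  # the query filtered most of those genes

atlas_disjoint_fraction = atlas_total / atlas_common - 1.0  # 0.30
query_disjoint_fraction = query_total / query_common - 1.0  # 0.05
normalization_fraction = atlas_disjoint_fraction - query_disjoint_fraction

new_query_total = query_total + query_common * normalization_fraction
assert np.isclose(new_query_total / query_common, atlas_total / atlas_common)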
Example #10
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the atlas
    metacell in the projection of the query metacells. The sum of weights in each row (that is, for a single query
    metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the query
    metacell onto the atlas.

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating whether the query metacell is similar to its projection onto the atlas. If
            ``False``, the metacell is said to be "dissimilar", which may indicate the query contains cell states that
            do not appear in the atlas.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacells onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    atlas_fractions += fold_normalization
    query_fractions += fold_normalization

    atlas_log_fractions = np.log2(atlas_fractions)
    query_log_fractions = np.log2(query_fractions)

    atlas_fractions -= fold_normalization
    query_fractions -= fold_normalization

    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
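Note: the final assembly above concatenates the per-metacell ``(indices, weights)`` pairs computed in parallel into one CSR matrix. A minimal standalone sketch of that assembly, with two fake query rows over four atlas columns:

import numpy as np
from scipy import sparse as sp

results = [
    (np.array([0, 2]), np.array([0.6, 0.4])),  # query metacell 0 uses atlas 0 and 2
    (np.array([1]), np.array([1.0])),          # query metacell 1 uses atlas 1 only
]
indices = np.concatenate([result[0] for result in results], dtype="int32")
data = np.concatenate([result[1] for result in results], dtype="float32")
indptr = np.cumsum([0] + [len(result[0]) for result in results])
weights = sp.csr_matrix((data, indices, indptr), shape=(2, 4))
assert np.allclose(weights.toarray(), [[0.6, 0, 0.4, 0], [0, 1, 0, 0]])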
Example #11
def relate_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_sampled_cells: int = pr.related_max_sampled_cells,
    downsample_min_samples: float = pr.related_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.related_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.related_downsample_max_cell_quantile,
    min_gene_relative_variance: float = pr.related_min_gene_relative_variance,
    min_gene_total: int = pr.related_min_gene_total,
    min_gene_top3: int = pr.related_min_gene_top3,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    genes_similarity_method: str = pr.related_genes_similarity_method,
    genes_cluster_method: str = pr.related_genes_cluster_method,
    min_genes_of_modules: int = pr.related_min_genes_of_modules,
    random_seed: int = 0,
) -> None:
    """
    Detect coarse relations between genes based on ``what`` (default: {what}) data.

    This is a quick-and-dirty way to group genes together and should only be used as a starting
    point for more precise forms of gene relationship analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable-pair (Gene) Annotations
        ``related_genes_similarity``
            The similarity between each two related genes.

    Variable (Gene) Annotations
        ``related_genes_module``
            The index of the gene module for each gene.

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. Pick candidate genes using :py:func:`metacells.pipeline.feature.extract_feature_data`.

    3. Compute the similarity between the feature genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the
       ``genes_similarity_method`` (default: {genes_similarity_method}).

    4. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method``
       (default: {genes_cluster_method}).

    5. Identify gene modules in the hierarchical clustering which contain at least
       ``min_genes_of_modules`` genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        sdata = ut.slice(adata,
                         obs=cell_indices,
                         name=".sampled",
                         top_level=False)
    else:
        sdata = ut.copy_adata(adata, top_level=False)

    fdata = extract_feature_data(
        sdata,
        what,
        top_level=False,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        min_gene_relative_variance=min_gene_relative_variance,
        min_gene_total=min_gene_total,
        min_gene_top3=min_gene_top3,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )
    assert fdata is not None

    frame = tl.compute_var_var_similarity(fdata,
                                          what,
                                          method=genes_similarity_method,
                                          reproducible=(random_seed != 0),
                                          inplace=False)
    assert frame is not None
    similarity = ut.to_layout(ut.to_numpy_matrix(frame), layout="row_major")

    linkage = _cluster_genes(similarity, genes_cluster_method)
    clusters = _linkage_to_clusters(linkage, min_genes_of_modules,
                                    fdata.n_vars)

    cluster_of_genes = pd.Series(np.full(adata.n_vars, -1, dtype="int32"),
                                 index=adata.var_names)
    for cluster_index, gene_indices in enumerate(clusters):
        cluster_of_genes[fdata.var_names[gene_indices]] = cluster_index

    ut.set_v_data(adata,
                  "related_genes_module",
                  cluster_of_genes,
                  formatter=ut.groups_description)

    feature_gene_indices = ut.get_v_numpy(fdata, "full_gene_index")
    data = similarity.flatten(order="C")
    rows = np.repeat(feature_gene_indices, len(feature_gene_indices))
    cols = np.tile(feature_gene_indices, len(feature_gene_indices))
    full_similarity = sp.csr_matrix((data, (rows, cols)),
                                    shape=(adata.n_vars, adata.n_vars))

    ut.set_vv_data(adata, "related_genes_similarity", full_similarity)
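Note: the ``_cluster_genes`` and ``_linkage_to_clusters`` helpers are not shown here. A hedged scipy sketch of the general approach (similarities turned into distances, hierarchical linkage, then a flat cut into modules; the ``ward`` method and ``maxclust`` cut are our assumptions, not necessarily the package defaults):

import numpy as np
from scipy.cluster import hierarchy

similarity = np.array([[1.0, 0.9, 0.1], [0.9, 1.0, 0.2], [0.1, 0.2, 1.0]])
distances = 1.0 - similarity[np.triu_indices(3, k=1)]  # condensed distance vector
linkage = hierarchy.linkage(distances, method="ward")
modules = hierarchy.fcluster(linkage, t=2, criterion="maxclust")
assert modules[0] == modules[1] != modules[2]  # the two similar genes form one module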
Example #12
def compute_distinct_folds(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    normalization: float = 0,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Compute for each observation (cell) and each variable (gene) how different the ``what`` (default:
    {what}) value is from the overall population.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Per-Observation-Per-Variable (Cell-Gene) Annotations:
        ``distinct_fold``
            For each gene in each cell, the log (base 2) of the ratio between the fraction of the
            gene in the cell and the fraction of the gene in the overall population (sum of cells).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas frame (indexed by the observation names, with
    one column per gene).

    **Computation Parameters**

    1. Compute, for each gene, the fraction of the gene's values out of the total sum of the values
       (that is, the mean fraction of the gene's expression in the population).

    2. Compute, for each cell, for each gene, the fraction of the gene's value out of the sum
       of the values in the cell (that is, the fraction of the gene's expression in the cell).

    3. Divide the two to get the distinct ratio (that is, how much the gene's expression in the cell
       differs from the overall population), first adding the ``normalization`` (default:
       {normalization}) to both.

    4. Compute the log (base 2) of the result and use it as the fold factor.
    """
    columns_data = ut.get_vo_proper(adata, what, layout="column_major")
    fractions_of_genes_in_data = ut.fraction_per(columns_data, per="column")
    fractions_of_genes_in_data += normalization

    total_umis_of_cells = ut.get_o_numpy(adata, what, sum=True)
    total_umis_of_cells[total_umis_of_cells == 0] = 1

    rows_data = ut.get_vo_proper(adata, what, layout="row_major")
    fraction_of_genes_in_cells = ut.to_numpy_matrix(
        rows_data) / total_umis_of_cells[:, np.newaxis]
    fraction_of_genes_in_cells += normalization

    zeros_mask = fractions_of_genes_in_data <= 0
    fractions_of_genes_in_data[zeros_mask] = -1
    fraction_of_genes_in_cells[:, zeros_mask] = -1

    ratio_of_genes_in_cells = fraction_of_genes_in_cells
    ratio_of_genes_in_cells /= fractions_of_genes_in_data
    assert np.min(np.min(ratio_of_genes_in_cells)) > 0

    fold_of_genes_in_cells = np.log2(ratio_of_genes_in_cells,
                                     out=ratio_of_genes_in_cells)

    if inplace:
        ut.set_vo_data(adata, "distinct_fold", fold_of_genes_in_cells)
        return None

    return ut.to_pandas_frame(fold_of_genes_in_cells,
                              index=adata.obs_names,
                              columns=adata.var_names)
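Note: a tiny numpy sketch of the distinct-fold arithmetic above (toy counts, ``normalization`` of zero, no zero fractions): compare each gene's in-cell fraction against its population fraction.

import numpy as np

umis = np.array([[5.0, 5.0], [1.0, 9.0]])             # 2 cells x 2 genes
population_fractions = umis.sum(axis=0) / umis.sum()  # [0.3, 0.7]
cell_fractions = umis / umis.sum(axis=1)[:, np.newaxis]
folds = np.log2(cell_fractions / population_fractions)
# The second cell under-expresses gene 0 and over-expresses gene 1:
assert folds[1, 0] < 0 < folds[1, 1]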
Example #13
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Detect genes which are "noisy" (have high variance compared to their mean) and also "lonely"
    (have low correlation with all other genes). Such genes should be excluded since they will
    never meaningfully help us compute groups, and will actively cause profiles to be considered
    "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If an ``excluded_genes_mask`` is specified, it is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: ``min_gene_normalized_variance``).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
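Note: a toy sketch combining the two gene tests (thresholds are illustrative, not the ``pr.*`` defaults): "noisy" means a high variance-to-mean ratio, "lonely" means a low maximal correlation with every other gene.

import numpy as np

downsampled = np.array([[9.0, 8.0, 1.0],
                        [1.0, 2.0, 9.0],
                        [9.0, 8.0, 9.0],
                        [1.0, 2.0, 1.0]])  # 4 cells x 3 genes

normalized_variance = downsampled.var(axis=0) / downsampled.mean(axis=0)
noisy = normalized_variance >= 1.0         # all three genes are noisy

correlations = np.corrcoef(downsampled.T)  # gene-gene correlations
np.fill_diagonal(correlations, -1)         # ignore self-correlation
lonely = correlations.max(axis=1) <= 0.5   # only gene 2 lacks a partner

assert list(noisy & lonely) == [False, False, True]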
Example #14
def compute_outliers_matches(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: str = "similar",
    value_normalization: float = pr.outliers_value_normalization,
    reproducible: bool,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells), compute for each outlier the "most similar"
    group.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and to use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations

        ``similar`` (default: {similar})
            For each observation (cell), the index of the "most similar" group.

    **Computation Parameters**

    1. Compute the log2 of the fraction of each gene in each of the outlier cells and the group metacells using
       the ``value_normalization`` (default: {value_normalization}).

    2. Cross-correlate each of the outlier cells with each of the group metacells, in a ``reproducible`` manner.
    """
    group_of_cells = ut.get_o_numpy(adata, group)
    outliers_mask = group_of_cells < 0
    odata = ut.slice(adata, obs=outliers_mask)

    outliers_data = ut.get_vo_proper(odata, what, layout="row_major")
    groups_data = ut.get_vo_proper(gdata, what, layout="row_major")

    outliers_fractions = ut.fraction_by(outliers_data, by="row")
    groups_fractions = ut.fraction_by(groups_data, by="row")

    outliers_fractions = ut.to_numpy_matrix(outliers_fractions)
    groups_fractions = ut.to_numpy_matrix(groups_fractions)

    outliers_fractions += value_normalization
    groups_fractions += value_normalization

    outliers_log_fractions = np.log2(outliers_fractions,
                                     out=outliers_fractions)
    groups_log_fractions = np.log2(groups_fractions, out=groups_fractions)

    outliers_groups_correlation = ut.cross_corrcoef_rows(
        outliers_log_fractions,
        groups_log_fractions,
        reproducible=reproducible)
    outliers_similar_group_indices = np.argmax(outliers_groups_correlation,
                                               axis=1)
    assert len(outliers_similar_group_indices) == odata.n_obs

    cells_similar_group_indices = np.full(adata.n_obs, -1, dtype="int32")
    cells_similar_group_indices[outliers_mask] = outliers_similar_group_indices
    ut.set_o_data(adata, similar, cells_similar_group_indices)
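Note: a small numpy sketch of the matching step (toy log fractions): cross-correlate each outlier row against every group row and take the best-correlated group, mirroring the ``argmax`` above.

import numpy as np

outliers_log = np.array([[0.0, 1.0, 2.0], [2.0, 1.0, 0.0]])  # 2 outliers x 3 genes
groups_log = np.array([[0.1, 1.1, 1.9], [1.9, 1.2, 0.2]])    # 2 groups x 3 genes

# np.corrcoef stacks both inputs; the off-diagonal block is outlier x group:
correlation = np.corrcoef(outliers_log, groups_log)[:2, 2:]
similar_group_indices = np.argmax(correlation, axis=1)
assert list(similar_group_indices) == [0, 1]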
Example #15
def compute_groups_self_consistency(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    group: str = "metacell",
    genes_mask: Optional[ut.NumpyVector] = None,
    self_similarity_log_data: bool = pr.self_similarity_log_data,
    self_similarity_value_normalization: float = pr.self_similarity_value_normalization,
    self_similarity_method: str = pr.self_similarity_method,
    reproducible: bool = pr.reproducible,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
) -> ut.NumpyVector:
    """
    Compute the self consistency (similarity between two halves) of some groups.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation annotation named
    ``group`` (default: {group}) which identifies the group (metacells) each observation (cell)
    belongs to, and ``half_<group>`` which identifies the half-group each observation belongs
    to (e.g. as computed by :py:func:`split_groups`). Specifically, the indices of the halves
    of group index ``i`` are ``i`` and ``i + groups_count``.

    **Returns**

    A Numpy vector holding, for each group, the similarity between its two halves.

    **Computation Parameters**

    1. For each group, compute the sum of values in each half and normalize it to fractions (sum of 1).

    2. If ``genes_mask`` is specified, select only the genes specified in it. Note the sum of the
       fractions of the genes of each group in the result will be less than or equal to 1.

    3. If ``self_similarity_log_data`` (default: {self_similarity_log_data}), log2 the values using
       ``self_similarity_value_normalization`` (default: {self_similarity_value_normalization}).

    4. Compute the ``self_similarity_method`` (default: {self_similarity_method}) between the two
       halves. If this is the ``logistics`` similarity, then this will use ``logistics_location``
       (default: {logistics_location}) and ``logistics_slope`` (default: {logistics_slope}). If this
       is ``pearson``, and if ``reproducible`` (default: {reproducible}) is ``True``, a slower
       (still parallel) but reproducible algorithm will be used to compute Pearson correlations.
    """
    hdata = tl.group_obs_data(adata,
                              what,
                              groups=f"half_{group}",
                              name=".halves")
    assert hdata is not None

    sum_of_halves = ut.get_o_numpy(hdata, f"{what}|sum")
    halves_values = ut.to_numpy_matrix(
        ut.get_vo_proper(hdata, what, layout="row_major"))
    halves_data = ut.mustbe_numpy_matrix(
        ut.scale_by(halves_values, sum_of_halves, by="row"))

    if self_similarity_value_normalization > 0:
        halves_data += self_similarity_value_normalization

    if self_similarity_log_data:
        halves_data = ut.log_data(halves_data, base=2)

    if genes_mask is not None:
        halves_data = halves_data[:, genes_mask]

    assert hdata.n_obs % 2 == 0
    groups_count = hdata.n_obs // 2
    low_half_indices = np.arange(groups_count)
    high_half_indices = low_half_indices + groups_count

    low_half_data = halves_data[low_half_indices, :]
    high_half_data = halves_data[high_half_indices, :]

    assert self_similarity_method in ("logistics", "pearson")
    if self_similarity_method == "logistics":
        similarity = ut.pairs_logistics_rows(low_half_data,
                                             high_half_data,
                                             location=logistics_location,
                                             slope=logistics_slope)
        similarity *= -1
        similarity += 1
    else:
        similarity = ut.pairs_corrcoef_rows(low_half_data,
                                            high_half_data,
                                            reproducible=reproducible)

    return similarity
Example #16
def _compute_elements_similarity(  # pylint: disable=too-many-branches
    adata: AnnData,
    elements: str,
    per: str,
    what: Union[str, ut.Matrix],
    *,
    method: str,
    reproducible: bool,
    logistics_location: float,
    logistics_slope: float,
    top: Optional[int],
    bottom: Optional[int],
    inplace: bool,
) -> Optional[ut.PandasFrame]:
    assert elements in ("obs", "var")

    assert method in ("pearson", "repeated_pearson", "logistics",
                      "logistics_pearson")

    data = ut.get_vo_proper(adata, what, layout=f"{per}_major")
    dense = ut.to_numpy_matrix(data)

    similarity: ut.ProperMatrix
    if method.startswith("logistics"):
        similarity = ut.logistics(dense,
                                  location=logistics_location,
                                  slope=logistics_slope,
                                  per=per)
        similarity *= -1
        similarity += 1
    else:
        similarity = ut.corrcoef(dense, per=per, reproducible=reproducible)

    if method.endswith("_pearson"):
        similarity = ut.corrcoef(similarity,
                                 per=None,
                                 reproducible=reproducible)

    if top is not None:
        top_similarity = ut.top_per(similarity, top, per="row")

    if bottom is not None:
        similarity *= -1
        bottom_similarity = ut.top_per(similarity, bottom, per="row")
        bottom_similarity *= -1  # type: ignore

    if top is not None:
        if bottom is not None:
            similarity = top_similarity + bottom_similarity  # type: ignore
        else:
            similarity = top_similarity
    else:
        if bottom is not None:
            similarity = bottom_similarity

    if inplace:
        to = elements + "_similarity"
        if elements == "obs":
            ut.set_oo_data(adata, to, similarity)
        else:
            ut.set_vv_data(adata, to, similarity)
        return None

    if elements == "obs":
        names = adata.obs_names
    else:
        names = adata.var_names

    return ut.to_pandas_frame(similarity, index=names, columns=names)
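Note: the bottom-``k`` branch above reuses a top-``k`` primitive by negating the matrix before and after. A one-row numpy sketch of the same trick (``keep_top`` is a hypothetical stand-in for ``ut.top_per``):

import numpy as np

def keep_top(row: np.ndarray, k: int) -> np.ndarray:
    # Keep the k largest entries of the row, zeroing out the rest.
    kept = np.zeros_like(row)
    top_indices = np.argpartition(row, -k)[-k:]
    kept[top_indices] = row[top_indices]
    return kept

row = np.array([0.9, -0.8, 0.1, -0.2, 0.5])
bottom = -keep_top(-row, 2)  # keeps the two most negative entries
assert np.allclose(bottom, [0, -0.8, 0, -0.2, 0])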
Example #17
def compute_knn_by_features(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_top_feature_genes: int = pr.max_top_feature_genes,
    similarity_value_normalization: float = pr.umap_similarity_value_normalization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    k: int,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    reproducible: bool = pr.reproducible,
) -> ut.PandasFrame:
    """
    Compute KNN graph between metacells based on feature genes.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but
    reproducible algorithm will be used to compute Pearson correlations.

    **Input**

    Annotated ``adata`` where each observation is a metacell and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Sets the following in ``adata``:

    Observations-Pair (Metacells) Annotations
        ``obs_outgoing_weights``
            A sparse square matrix where each non-zero entry is the weight of an edge between a pair
            of metacells, where the sum of the weights of the outgoing edges for each metacell
            is 1 (there is always at least one such edge).

    Also return a pandas data frame of the similarities between the observations (metacells).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.tools.high.find_top_feature_genes` using ``max_top_feature_genes``
       (default: {max_top_feature_genes}) to pick the feature genes to use to compute similarities
       between the metacells.

    2. Compute the fractions of each gene in each metacell, and add the
       ``similarity_value_normalization`` (default: {similarity_value_normalization}) to
       it.

    3. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using
       ``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default:
       {logistics_location}) and ``logistics_slope`` (default: {logistics_slope}) and convert this
       to distances.

    5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the distances,
       ``k`` (no default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}),
       ``incoming_degree_factor`` (default: {incoming_degree_factor}), ``outgoing_degree_factor``
       (default: {outgoing_degree_factor}) to compute a "skeleton" graph to overlay on top of the
       UMAP graph.
    """
    tl.find_top_feature_genes(adata, max_genes=max_top_feature_genes)

    all_data = ut.get_vo_proper(adata, what, layout="row_major")
    all_fractions = ut.fraction_by(all_data, by="row")

    top_feature_genes_mask = ut.get_v_numpy(adata, "top_feature_gene")

    top_feature_genes_fractions = all_fractions[:, top_feature_genes_mask]
    top_feature_genes_fractions = ut.to_layout(top_feature_genes_fractions,
                                               layout="row_major")
    top_feature_genes_fractions = ut.to_numpy_matrix(
        top_feature_genes_fractions)

    top_feature_genes_fractions += similarity_value_normalization

    if similarity_log_data:
        top_feature_genes_fractions = ut.log_data(top_feature_genes_fractions,
                                                  base=2)

    tdata = ut.slice(adata, vars=top_feature_genes_mask)
    similarities = tl.compute_obs_obs_similarity(
        tdata,
        top_feature_genes_fractions,
        method=similarity_method,
        reproducible=reproducible,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        inplace=False,
    )
    assert similarities is not None

    tl.compute_obs_obs_knn_graph(
        adata,
        similarities,
        k=k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
    )

    return similarities
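
A hedged usage sketch of ``compute_knn_by_features`` on synthetic counts (the sizes
and ``k`` below are illustrative, not recommended values):

import numpy as np
from anndata import AnnData

# 50 hypothetical metacells by 1,000 genes of synthetic UMI counts.
mdata = AnnData(np.random.poisson(1.0, size=(50, 1000)).astype("float32"))

# k is the only required parameter; reproducible=True selects the slower
# deterministic correlation implementation.
similarities = compute_knn_by_features(mdata, k=15, reproducible=True)

# The skeleton KNN graph is stored in mdata ("obs_outgoing_weights");
# the returned frame holds the metacell-metacell similarities.
assert similarities.shape == (mdata.n_obs, mdata.n_obs)
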
Example no. 18
def compute_umap_by_features(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_top_feature_genes: int = pr.max_top_feature_genes,
    similarity_value_normalization: float = pr.umap_similarity_value_normalization,
    similarity_log_data: bool = pr.umap_similarity_log_data,
    similarity_method: str = pr.umap_similarity_method,
    logistics_location: float = pr.logistics_location,
    logistics_slope: float = pr.logistics_slope,
    skeleton_k: int = pr.skeleton_k,
    balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    umap_k: int = pr.umap_k,
    dimensions: int = 2,
    min_dist: float = pr.umap_min_dist,
    spread: float = pr.umap_spread,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Compute a UMAP projection of the (meta)cells.

    **Input**

    Annotated ``adata`` where each observation is a metacell and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask of the top feature genes used to compute similarities between the
            metacells.

    Observation (Metacell) Annotations
        ``umap_x``, ``umap_y``
            The X and Y coordinates of each metacell in the UMAP projection.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.umap.compute_knn_by_features` using
       ``max_top_feature_genes`` (default: {max_top_feature_genes}),
       ``similarity_value_normalization`` (default: {similarity_value_normalization}),
       ``similarity_log_data`` (default: {similarity_log_data}), ``similarity_method`` (default:
       {similarity_method}), ``logistics_location`` (default: {logistics_location}),
       ``logistics_slope`` (default: {logistics_slope}), ``skeleton_k`` (default: {skeleton_k}),
       ``balanced_ranks_factor`` (default: {balanced_ranks_factor}), ``incoming_degree_factor``
       (default: {incoming_degree_factor}), ``outgoing_degree_factor`` (default:
       {outgoing_degree_factor}) to compute a "skeleton" graph to overlay on top of the UMAP graph.

    2. Invoke :py:func:`metacells.tools.layout.umap_by_distances` using the distances, ``umap_k``
       (default: {umap_k}), ``min_dist`` (default: {min_dist}), ``spread`` (default: {spread}),
       ``dimensions`` (default: {dimensions}).
    """
    similarities = compute_knn_by_features(
        adata,
        what,
        max_top_feature_genes=max_top_feature_genes,
        similarity_value_normalization=similarity_value_normalization,
        similarity_log_data=similarity_log_data,
        similarity_method=similarity_method,
        logistics_location=logistics_location,
        logistics_slope=logistics_slope,
        k=skeleton_k,
        balanced_ranks_factor=balanced_ranks_factor,
        incoming_degree_factor=incoming_degree_factor,
        outgoing_degree_factor=outgoing_degree_factor,
        reproducible=(random_seed != 0),
    )

    distances = ut.to_numpy_matrix(similarities)
    distances *= -1
    distances += 1
    np.fill_diagonal(distances, 0.0)
    distances = sparse.csr_matrix(distances)

    tl.umap_by_distances(adata,
                         distances,
                         k=umap_k,
                         dimensions=dimensions,
                         min_dist=min_dist,
                         spread=spread,
                         random_seed=random_seed)
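
The conversion from similarities to distances above is a simple affine flip; a small
worked example of the same steps:

import numpy as np
from scipy import sparse

# Pearson-style similarities lie in [-1, 1]; 1 - s maps identical profiles
# to distance 0 and anti-correlated profiles to distance 2.
similarities = np.array([[1.0, 0.8, -0.2],
                         [0.8, 1.0, 0.1],
                         [-0.2, 0.1, 1.0]])
distances = 1.0 - similarities
np.fill_diagonal(distances, 0.0)  # force exact zero self-distances
distances = sparse.csr_matrix(distances)
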
Example no. 19
def _compute_elements_knn_graph(
    adata: AnnData,
    elements: str,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    k: int,
    balanced_ranks_factor: float,
    incoming_degree_factor: float,
    outgoing_degree_factor: float,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    assert elements in ("obs", "var")
    assert balanced_ranks_factor > 0.0
    assert incoming_degree_factor > 0.0
    assert outgoing_degree_factor > 0.0

    if elements == "obs":
        get_data = ut.get_oo_proper
        set_data = ut.set_oo_data
    else:
        get_data = ut.get_vv_proper
        set_data = ut.set_vv_data

    def store_matrix(matrix: ut.CompressedMatrix, name: str,
                     when: bool) -> None:
        if when:
            name = elements + "_" + name
            set_data(
                adata,
                name,
                matrix,
                formatter=lambda matrix: ut.ratio_description(
                    matrix.shape[0] * matrix.shape[1], "element", matrix.nnz,
                    "nonzero"),
            )
        elif ut.logging_calc():
            ut.log_calc(
                f"{elements}_{name}",
                ut.ratio_description(matrix.shape[0] * matrix.shape[1],
                                     "element", matrix.nnz, "nonzero"),
            )

    similarity = ut.to_proper_matrix(get_data(adata, what))
    similarity = ut.to_layout(similarity, "row_major", symmetric=True)
    similarity = ut.to_numpy_matrix(similarity)

    ut.log_calc("similarity", similarity)

    outgoing_ranks = _rank_outgoing(similarity)

    balanced_ranks = _balance_ranks(outgoing_ranks, k, balanced_ranks_factor)
    store_matrix(balanced_ranks, "balanced_ranks", True)

    pruned_ranks = _prune_ranks(balanced_ranks, k, incoming_degree_factor,
                                outgoing_degree_factor)
    store_matrix(pruned_ranks, "pruned_ranks", True)

    outgoing_weights = _weigh_edges(pruned_ranks)
    store_matrix(outgoing_weights, "outgoing_weights", inplace)

    if inplace:
        return None

    if elements == "obs":
        names = adata.obs_names
    else:
        names = adata.var_names

    return ut.to_pandas_frame(outgoing_weights, index=names, columns=names)
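
The final ``_weigh_edges`` step must produce outgoing weights that sum to 1 for each
element (as the annotations above promise). A plausible sketch of such a row
normalization, not the package's actual implementation:

import numpy as np
from scipy import sparse

def row_normalize(matrix: sparse.csr_matrix) -> sparse.csr_matrix:
    """Scale each row so that its nonzero weights sum to 1."""
    sums = np.asarray(matrix.sum(axis=1)).flatten()
    sums[sums == 0] = 1.0  # leave all-zero rows untouched
    return sparse.csr_matrix(sparse.diags(1.0 / sums) @ matrix)

pruned_ranks = sparse.csr_matrix(np.array([[0.0, 2.0, 2.0],
                                           [1.0, 0.0, 3.0],
                                           [4.0, 0.0, 0.0]]))
outgoing_weights = row_normalize(pruned_ranks)
assert np.allclose(np.asarray(outgoing_weights.sum(axis=1)).flatten(), 1.0)
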
Example no. 20
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at
    the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
    size grows. Above O(10,000) cells you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation
        matrix, and only then extracts the sparse graph out of it. We actually need two copies where
        each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
        O(100,000,000,000). In addition, the implementation is serial for the graph clustering
        phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation
        phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
        phase. This might increase the "reasonable" size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to
        O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
        scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
        very small number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables
    are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a
            similar expression level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant). This will be zero for non-"feature" genes.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment ("outliers") are given a metacell
            index of ``-1``.

        ``outlier``
            A boolean mask of the cells contained in no metacell.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data
       from the clean data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}), ``feature_min_gene_top3``
       (default: {feature_min_gene_top3}), ``feature_min_gene_relative_variance`` (default:
       {feature_min_gene_relative_variance}), ``feature_gene_names`` (default:
       {feature_gene_names}), ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}), ``forbidden_gene_patterns``
       (default: {forbidden_gene_patterns}) and ``random_seed`` (default: {random_seed}) to make
       this replicable.

    2. Compute the fractions of each variable in each cell, and add the
       ``cells_similarity_value_normalization`` (default: {cells_similarity_value_normalization}) to
       it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
       similarity between each pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}).

    5. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}) to get the effective cell
       sizes to use.

    6. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a
       K-Nearest-Neighbors graph, using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).
       If ``knn_k`` (default: {knn_k}) is not specified, then it is
       chosen to be the median number of cells required to reach the target metacell size,
       but at least ``min_knn_k`` (default: {min_knn_k}).

    7. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
       the candidate metacells, using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
       metacells, using the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
       metacells, using the same
       ``target_metacell_size`` (default: {target_metacell_size}),
       and the effective cell sizes
       and the
       ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
       ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
       ``dissolve_min_convincing_gene_fold_factor`` (default: {dissolve_min_convincing_gene_fold_factor})
       and
       ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).
    """
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    if knn_k is None:
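        # Pick knn_k so a neighborhood holds roughly one metacell's worth of
        # cells: the target metacell size divided by the median cell size.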
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    if knn_k == 0:
        ut.log_calc("knn_k: 0 (too small, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
    elif knn_k >= fdata.n_obs:
        ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
    else:
        ut.log_calc("knn_k", knn_k)

        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        assert np.min(candidate_of_cells) == 0

        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)

        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

        metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

        outlier_of_cells = metacell_of_cells < 0
        ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata
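
An end-to-end usage sketch of ``compute_direct_metacells`` on synthetic counts (the
sizes and seed are illustrative; real input would be a "clean" cells ``AnnData``
produced by the earlier pipeline steps):

import numpy as np
from anndata import AnnData

cells = AnnData(np.random.poisson(0.5, size=(2000, 500)).astype("float32"))

# A non-zero random_seed also makes the correlation phase reproducible.
compute_direct_metacells(cells, random_seed=123456)

metacell_of_cells = cells.obs["metacell"]  # -1 marks outlier cells
n_metacells = int(metacell_of_cells.max()) + 1
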
Example no. 21
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    This computes, for each metacell of the query, the fold factors between the actual query UMIs and the UMIs of the
    projection of the metacell onto the atlas (see :py:func:`metacells.tools.project.project_query_onto_atlas`). The
    result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, more genes need to
    be ignored by the projection, or somehow corrected for batch effects prior to computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, the ``projected`` UMIs of each query metacell onto the atlas.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and its projection (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor
       log2((actual UMIs + ``fold_normalization``) / (expected UMIs + ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default ``fold_normalization`` is
       {fold_normalization}).

    2. Set the fold factor to zero for every case where the total UMIs in the query metacell and the projected image is
       not at least ``min_significant_gene_value`` (default: {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_fold_factor`` (default:
       {min_gene_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_fold_factor`` (default:
       {min_entry_fold_factor}), set the fold factor to zero (too low to be of interest). If ``abs_folds`` (default:
       {abs_folds}), consider the absolute fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    metacells_data = ut.get_vo_proper(adata, what, layout="row_major")
    projected_data = ut.get_vo_proper(adata, projected, layout="row_major")

    metacells_fractions = ut.fraction_by(metacells_data,
                                         by="row",
                                         sums=total_umis)
    projected_fractions = ut.fraction_by(projected_data,
                                         by="row",
                                         sums=total_umis)

    metacells_fractions += fold_normalization  # type: ignore
    projected_fractions += fold_normalization  # type: ignore

    dense_folds = metacells_fractions / projected_fractions  # type: ignore
    dense_folds = np.log2(dense_folds, out=dense_folds)

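    # From here on, "total_umis" holds the per-entry sum of the query and
    # projected UMIs, not the per-metacell totals passed as a parameter.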
    total_umis = ut.to_numpy_matrix(metacells_data +
                                    projected_data)  # type: ignore
    insignificant_folds_mask = total_umis < min_significant_gene_value
    ut.log_calc("insignificant entries", insignificant_folds_mask)
    dense_folds[insignificant_folds_mask] = 0.0

    significant_folds = significant_folds_matrix(dense_folds,
                                                 min_gene_fold_factor,
                                                 min_entry_fold_factor,
                                                 abs_folds)
    ut.set_vo_data(adata, "projected_fold", significant_folds)
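
A worked example of the per-entry fold factor as the code above computes it (on
fractions), with hypothetical parameter values:

import numpy as np

fold_normalization = 1e-5          # hypothetical value
min_significant_gene_value = 40.0  # hypothetical value

actual_fraction = 0.004     # gene fraction in the query metacell
projected_fraction = 0.001  # gene fraction in its atlas projection
fold = np.log2((actual_fraction + fold_normalization)
               / (projected_fraction + fold_normalization))
# fold ~= 2.0 bits: the query expresses the gene ~4-fold above its projection.

# Step 2: zero the entry if the combined UMIs are too few to be trusted.
total_umis_of_entry = 6.0
if total_umis_of_entry < min_significant_gene_value:
    fold = 0.0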