Example #1
def test_parallel_map() -> None:
    @ut.timed_call("invocation")
    def invocation(index: int) -> int:
        return index

    actual = list(ut.parallel_map(invocation, 100))
    expected = list(range(100))
    assert actual == expected

    # TODO: Why does pytest coverage error trying to read these files?
    for path in glob(".coverage.*"):
        os.remove(path)
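
The contract checked above is that ``ut.parallel_map(function, invocations)`` yields ``function(0)`` ..
``function(invocations - 1)`` in index order. A rough standard-library stand-in (illustration only, not the metacells
implementation; the ``naive_parallel_map`` name is made up) could look like this:

from concurrent.futures import ProcessPoolExecutor
from typing import Callable, List, TypeVar

T = TypeVar("T")


def naive_parallel_map(function: Callable[[int], T], invocations: int) -> List[T]:
    # Executor.map preserves input order, matching the assertion in the test above.
    with ProcessPoolExecutor() as executor:
        return list(executor.map(function, range(invocations)))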
Example #2
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the atlas
    metacell in the projection of the query metacell. The sum of weights in each row (that is, for a single query
    metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the query
    metacell onto the atlas.

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating whether the query metacell is similar to its projection onto the atlas. If
            ``False`` the metacell is said to be "dissimilar", which may indicate the query contains cell states that
            do not appear in the atlas.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacell onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # Regularize the fractions by ``fold_normalization`` before taking log2 (to avoid log of zero),
    # then subtract it again so the linear fractions are left unchanged.
    atlas_fractions += fold_normalization
    query_fractions += fold_normalization

    atlas_log_fractions = np.log2(atlas_fractions)
    query_log_fractions = np.log2(query_fractions)

    atlas_fractions -= fold_normalization
    query_fractions -= fold_normalization

    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
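
A minimal usage sketch, assuming ``anndata`` is installed, ``ut`` is ``metacells.utilities``, and ``atlas.h5ad`` /
``query.h5ad`` are hypothetical files holding atlas and query metacells over the same genes:

import anndata as ad

atlas = ad.read_h5ad("atlas.h5ad")  # hypothetical atlas metacells file
query = ad.read_h5ad("query.h5ad")  # hypothetical query metacells file

weights = project_query_onto_atlas(
    adata=atlas,
    qdata=query,
    reproducible=True,  # slower but deterministic anchor correlations
)

# Each row of the CSR result sums to 1; the weighted sum of the atlas metacells
# using these weights is the "projected" image of each query metacell.
atlas_data = ut.to_numpy_matrix(ut.get_vo_proper(atlas, "__x__", layout="row_major"))
projected = weights @ atlas_data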
Example #3
def split_groups(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    group: str = "metacell",
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = None,
    feature_min_gene_top3: Optional[int] = None,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Split each metacell into two parts using ``what`` (default: {what}) data.

    This creates a new partition of cells into half-metacells, which can be used by
    :py:func:`compute_groups_self_consistency`.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation annotation named
    ``group`` (default: {group}) which identifies the group (metacells) each observation (cell)
    belongs to.

    **Returns**

    Sets the following annotations in ``adata``:

    Observation (Cell) Annotations
        ``half_<group>``
            The index of the half-group each cell belongs to. This is ``-1`` for ungrouped cells.
            Indices 0 up to the number of groups are the first (low) halves; indices from the number of groups
            up to twice that are the second (high) halves.

    **Computation Parameters**

    1. For each group (metacell), invoke
       :py:func:`metacells.pipeline.direct.compute_direct_metacells` on the observations (cells)
       included in the group, forcing the creation of two half-groups that cover all the group's
       cells. The parameters are passed to this call as-is, setting ``must_complete_cover`` to
       ``True`` (that is, disabling outlier detection), and disabling restrictions on the
       half-group sizes.
    """
    group_of_cells = ut.get_o_numpy(adata, group)
    groups_count = np.max(group_of_cells) + 1
    half_groups_of_cells = np.full(adata.n_obs, -1, dtype="int32")

    @ut.timed_call("split_group")
    def split_group(group_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        group_cells_mask = group_of_cells == group_index
        assert np.any(group_cells_mask)
        name = f".{group}-{group_index}/{groups_count}"
        gdata = ut.slice(adata,
                         name=name,
                         top_level=False,
                         obs=group_cells_mask,
                         track_obs="complete_cell_index")
        target_metacell_size = (gdata.n_obs + 1) // 2
        compute_direct_metacells(
            gdata,
            what,
            feature_downsample_min_samples=feature_downsample_min_samples,
            feature_downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
            feature_downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
            feature_min_gene_total=feature_min_gene_total,
            feature_min_gene_top3=feature_min_gene_top3,
            feature_min_gene_relative_variance=feature_min_gene_relative_variance,
            forbidden_gene_names=forbidden_gene_names,
            forbidden_gene_patterns=forbidden_gene_patterns,
            cells_similarity_value_normalization=cells_similarity_value_normalization,
            cells_similarity_log_data=cells_similarity_log_data,
            cells_similarity_method=cells_similarity_method,
            target_metacell_size=target_metacell_size,
            max_cell_size=max_cell_size,
            max_cell_size_factor=max_cell_size_factor,
            cell_sizes=None,
            knn_k=target_metacell_size,
            min_knn_k=target_metacell_size,
            knn_balanced_ranks_factor=knn_balanced_ranks_factor,
            knn_incoming_degree_factor=knn_incoming_degree_factor,
            knn_outgoing_degree_factor=knn_outgoing_degree_factor,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            candidates_cooldown_pass=candidates_cooldown_pass,
            candidates_cooldown_node=candidates_cooldown_node,
            candidates_min_split_size_factor=None,
            candidates_max_merge_size_factor=None,
            candidates_min_metacell_cells=1,
            must_complete_cover=True,
            random_seed=random_seed,
        )
        direct_groups = ut.get_o_numpy(gdata, "metacell")
        zero_count = np.sum(direct_groups == 0)
        one_count = np.sum(direct_groups == 1)
        ut.log_calc(f"group: {group_index} size: {len(direct_groups)} "
                    f"split into: {zero_count} + {one_count}")
        assert zero_count + one_count == len(direct_groups)
        assert zero_count > 0
        assert one_count > 0
        # Half 0 cells keep the group index; half 1 cells get the group index
        # shifted by the total number of groups.
        return (group_cells_mask, group_index + groups_count * direct_groups)

    for (group_cells_mask,
         group_cells_halves) in ut.parallel_map(split_group, groups_count):
        half_groups_of_cells[group_cells_mask] = group_cells_halves

    ut.set_o_data(adata,
                  f"half_{group}",
                  half_groups_of_cells,
                  formatter=ut.groups_description)
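
A minimal usage sketch, assuming ``cdata`` is a hypothetical cells AnnData that already carries a per-observation
``metacell`` annotation:

split_groups(cdata, random_seed=123456)

# Each cell now has a half-group index: -1 for ungrouped cells, the metacell
# index for the first half, and the metacell index plus the number of metacells
# for the second half.
half_of_cells = ut.get_o_numpy(cdata, "half_metacell")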
Example #4
def compute_deviant_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: Union[str, ut.Vector] = "similar",
    significant_gene_fold_factor: float = pr.significant_gene_fold_factor,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells) or, if an outlier, to the most
    similar groups, compute for each observation and gene the fold factor relative to its group
    for the purpose of detecting deviant cells.

    Ideally, all grouped cells would have no genes with high enough fold factors to be considered deviants, and all
    outlier cells would. In practice, grouped cells might have a few such genes due to the restriction on the fraction
    of deviants.

    It is important not to read too much into the results for a single cell, but looking at which genes appear for cell
    populations (e.g., cells with specific metadata such as batch identification) might be instructive.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``deviant_fold``
            The fold factor between the cell's UMIs and the expected number of UMIs for the purpose of computing
            deviant cells.

    **Computation Parameters**

    1. For each cell, compute the expected UMIs of each gene given the fraction of the gene in the metacell associated
       with the cell (the one it belongs to, or the most similar one for outliers), and the fold factor between the
       actual and expected UMIs. If this fold factor is less than ``significant_gene_fold_factor`` (default:
       {significant_gene_fold_factor}), set it to zero so the result will be sparse.
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    similar_of_cells = ut.get_o_numpy(adata,
                                      similar,
                                      formatter=ut.groups_description)

    @ut.timed_call("compute_cell_deviant_certificates")
    def _compute_cell_deviant_certificates(
            cell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _compute_cell_certificates(
            cell_index=cell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            similar_of_cells=similar_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
            significant_gene_fold_factor=significant_gene_fold_factor,
        )

    results = list(
        ut.parallel_map(_compute_cell_deviant_certificates, adata.n_obs))

    cell_indices = np.concatenate([
        np.full(len(result[0]), cell_index, dtype="int32")
        for cell_index, result in enumerate(results)
    ])
    gene_indices = np.concatenate([result[0] for result in results])
    fold_factors = np.concatenate([result[1] for result in results])

    deviant_folds = sparse.csr_matrix(
        (fold_factors, (cell_indices, gene_indices)), shape=adata.shape)
    ut.set_vo_data(adata, "deviant_fold", deviant_folds)
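
A minimal usage sketch, assuming ``cdata`` (cells carrying ``metacell`` and ``similar`` per-observation annotations)
and ``mdata`` (one observation per metacell, over the same genes) are hypothetical AnnData objects:

compute_deviant_fold_factors(adata=cdata, gdata=mdata)

# Sparse cells x genes matrix; entries below significant_gene_fold_factor were
# zeroed, so the remaining non-zero entries mark the genes driving deviance.
deviant_folds = ut.get_vo_proper(cdata, "deviant_fold", layout="row_major")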
Example #5
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: bool = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The resulting per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contain "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or, worse, a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless the
            value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor`` (default:
       {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata,
                                    group,
                                    formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(
            metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    results = list(
        ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row,
                                               "column_major")
    if inner_abs_folds:
        comparable_dense_inner_folds_by_column = np.abs(
            dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column
    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column,
                                   per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column <
                                min_entry_inner_fold_factor] = 0
    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column,
                                            layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
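
A minimal usage sketch, assuming ``cdata`` (cells with a ``metacell`` annotation) and ``mdata`` (one observation per
metacell, over the same genes) are hypothetical AnnData objects:

compute_inner_fold_factors(adata=cdata, gdata=mdata)

# Sparse metacells x genes matrix; a mostly-zero result indicates internally
# consistent metacells, while many non-zero entries hint at residual variability
# (biology, batch effects, or noise) as discussed in the docstring above.
inner_folds = ut.get_vo_proper(mdata, "inner_fold", layout="row_major")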