Example #1
0
def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names, in the same order.

    The ``weights`` of the projection where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas metacell for projecting the query metacell, such that the sum of weights in each
    row is one.

    **Returns**

    Nothing is returned. Sets the following annotations in ``qdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected``
            The number of UMIs of each gene in the projected image of the query to the metacell, if the total number of
            UMIs in the projection is equal to the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas based on the total UMIs, unless ``atlas_total_umis`` is
       specified. Likewise the total UMIs of each query metacell is computed from the data unless ``query_total_umis``
       is specified.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMIs count based on the total UMIs of each metacell. Note that if overriding the total
       atlas or query UMIs, this means that the result need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    # Explicit totals override the per-row sums of the data.
    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    # Weighted combination of atlas gene fractions, rescaled to the query totals.
    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
Example #2
0
def test_downsample_matrix() -> None:
    """Check that downsampling each row to the smallest row total equalizes the row sums, sparse and dense."""
    poisson_rvs = stats.poisson(10, loc=10).rvs
    sparse_matrix = sparse.random(
        1000, 10000, format="csr", dtype="int32", random_state=123456, data_rvs=poisson_rvs
    )
    rows_count, columns_count = sparse_matrix.shape
    assert sparse_matrix.nnz == rows_count * columns_count * 0.01

    # The target is the smallest row total of the sparse matrix; it is reused for the dense variant.
    target_sum = np.min(ut.sum_per(sparse_matrix, per="row"))

    for densify in (False, True):
        candidate = sparse_matrix.toarray() if densify else sparse_matrix
        downsampled = ut.downsample_matrix(candidate, per="row", samples=int(target_sum))
        assert downsampled.shape == candidate.shape
        downsampled_row_sums = ut.sum_per(downsampled, per="row")
        assert np.all(downsampled_row_sums == target_sum)
Example #3
0
def _test_per(rows_matrix: ut.Matrix) -> None:
    """
    Verify the per-row and per-column reductions against hand-computed expectations.

    The expected values correspond to the 2x3 matrix ``[[0, 1, 2], [3, 4, 5]]`` (as spelled out by the ``fraction_by``
    expectations at the end). The row-wise checks use ``rows_matrix`` as given, the column-wise checks use a
    column-major copy of it.
    """
    columns_matrix = ut.to_layout(rows_matrix, layout="column_major")

    # (reduction, expected per-row result, expected per-column result)
    expectations = [
        (ut.nnz_per, [2, 3], [1, 2, 2]),
        (ut.sum_per, [3, 12], [3, 5, 7]),
        (ut.max_per, [2, 5], [3, 4, 5]),
        (ut.min_per, [0, 3], [0, 1, 2]),
        (ut.sum_squared_per, [5, 50], [9, 17, 29]),
        (ut.fraction_per, [3 / 15, 12 / 15], [3 / 15, 5 / 15, 7 / 15]),
        (ut.mean_per, [3 / 3, 12 / 3], [3 / 2, 5 / 2, 7 / 2]),
        (
            ut.variance_per,
            [5 / 3 - (3 / 3) ** 2, 50 / 3 - (12 / 3) ** 2],
            [9 / 2 - (3 / 2) ** 2, 17 / 2 - (5 / 2) ** 2, 29 / 2 - (7 / 2) ** 2],
        ),
    ]
    for reduction, expected_per_row, expected_per_column in expectations:
        assert np.allclose(reduction(rows_matrix, per="row"), np.array(expected_per_row))
        assert np.allclose(reduction(columns_matrix, per="column"), np.array(expected_per_column))

    # The normalized variance is only checked per column.
    expected_normalized_variance = np.array(
        [(9 / 2 - (3 / 2) ** 2) / (3 / 2), (17 / 2 - (5 / 2) ** 2) / (5 / 2), (29 / 2 - (7 / 2) ** 2) / (7 / 2)]
    )
    assert np.allclose(ut.normalized_variance_per(columns_matrix, per="column"), expected_normalized_variance)

    fractions_by_row = ut.to_numpy_matrix(ut.fraction_by(rows_matrix, by="row"))
    assert np.allclose(fractions_by_row, np.array([[0 / 3, 1 / 3, 2 / 3], [3 / 12, 4 / 12, 5 / 12]]))

    fractions_by_column = ut.to_numpy_matrix(ut.fraction_by(columns_matrix, by="column"))
    assert np.allclose(fractions_by_column, np.array([[0 / 3, 1 / 5, 2 / 7], [3 / 3, 4 / 5, 5 / 7]]))
Example #4
0
def _filter_cells(
    *,
    cells_count: int,
    genes_count: int,
    deviant_genes_fold_ranks: ut.NumpyMatrix,
    deviant_gene_indices: ut.NumpyVector,
    max_cell_fraction: Optional[float],
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    """
    Count the deviant "votes" per cell and per gene, limiting the fraction of deviant cells.

    ``deviant_genes_fold_ranks`` has one row per cell and one column per entry of ``deviant_gene_indices``. An entry
    counts as a vote if it is below the rank threshold. The threshold starts at ``cells_count``; if
    ``max_cell_fraction`` is given and the fraction of cells having any entry below the threshold exceeds it, the
    threshold is lowered to the ``max_cell_fraction`` quantile of the per-cell minimal ranks.

    Returns a tuple of the votes per cell (``cells_count`` entries) and the votes per gene (``genes_count`` entries,
    zero for genes not listed in ``deviant_gene_indices``).
    """
    min_fold_ranks_of_cells = np.min(deviant_genes_fold_ranks, axis=1)
    assert min_fold_ranks_of_cells.size == cells_count

    threshold_cells_fold_rank = cells_count

    mask_of_deviant_cells = min_fold_ranks_of_cells < threshold_cells_fold_rank
    # Vectorized count; the builtin ``sum`` would iterate the mask element by element in Python.
    deviants_cells_count = np.sum(mask_of_deviant_cells)
    deviant_cell_fraction = deviants_cells_count / cells_count

    if ut.logging_calc():
        ut.log_calc("deviant_cells", mask_of_deviant_cells)

    if max_cell_fraction is not None and deviant_cell_fraction > max_cell_fraction:
        quantile_cells_fold_rank = np.quantile(min_fold_ranks_of_cells, max_cell_fraction)
        assert quantile_cells_fold_rank is not None

        ut.log_calc("quantile_cells_fold_rank", quantile_cells_fold_rank)

        # Only ever tighten the threshold, never loosen it.
        if quantile_cells_fold_rank < threshold_cells_fold_rank:
            threshold_cells_fold_rank = quantile_cells_fold_rank

    ut.log_calc("threshold_cells_fold_rank", threshold_cells_fold_rank)
    deviant_votes = deviant_genes_fold_ranks < threshold_cells_fold_rank

    votes_of_deviant_cells = ut.sum_per(ut.to_layout(deviant_votes, "row_major"), per="row")
    assert votes_of_deviant_cells.size == cells_count

    votes_of_deviant_genes = ut.sum_per(deviant_votes, per="column")
    assert votes_of_deviant_genes.size == deviant_gene_indices.size

    # Scatter the votes of the candidate genes into a vector covering all the genes.
    votes_of_all_genes = np.zeros(genes_count, dtype="int32")
    votes_of_all_genes[deviant_gene_indices] = votes_of_deviant_genes

    return votes_of_deviant_cells, votes_of_all_genes
Example #5
0
def _weigh_edges(pruned_ranks: ut.CompressedMatrix) -> ut.CompressedMatrix:
    """
    Convert the pruned ranks into edge weights by dividing each row by its total.

    The result is returned in row-major layout and is verified to be a proper CSR matrix.
    """
    nodes_count = pruned_ranks.shape[0]
    row_totals = ut.sum_per(pruned_ranks, per="row")

    ut.timed_parameters(size=nodes_count)

    # The reciprocal is written back into the totals buffer, which is not needed afterwards.
    row_scales = np.reciprocal(row_totals, out=row_totals)
    scaled_ranks = pruned_ranks.multiply(row_scales[:, None])
    weighted_edges = ut.to_layout(scaled_ranks, "row_major")

    _assert_proper_compressed(weighted_edges, "csr")
    return weighted_edges
Example #6
0
def find_biased_genes(
    adata: AnnData,
    *,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    min_metacells_fraction: float = pr.biased_min_metacells_fraction,
    abs_folds: bool = pr.project_abs_folds,
    to_property_name: str = "biased_gene",
) -> None:
    """
    Find genes that have a strong bias in the query compared to the atlas.

    **Input**

    Annotated query ``adata`` where the observations are cells and the variables are genes.

    This should contain a ``projected_fold`` per-variable-per-observation matrix with the fold factor between each query
    metacell and its projected image on the atlas.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``biased_gene`` (or ``to_property_name``):
            A boolean mask indicating whether the gene has a strong bias in the query compared to the atlas.

    **Computation Parameters**

    1. Count for each gene the number of query metacells for which the ``projected_fold`` is above
       ``max_projection_fold_factor``. If ``abs_folds`` (default: {abs_folds}), consider the absolute fold factor.

    2. Mark the gene as biased if this count is at least a ``min_metacells_fraction`` (default:
       {min_metacells_fraction}) of the metacells.
    """
    assert max_projection_fold_factor >= 0
    assert 0 <= min_metacells_fraction <= 1

    folds = ut.get_vo_proper(adata, "projected_fold", layout="column_major")
    if abs_folds:
        folds = np.abs(folds)  # type: ignore

    is_high_fold = ut.to_numpy_matrix(folds > max_projection_fold_factor)  # type: ignore
    ut.log_calc("high_projection_folds", is_high_fold)

    high_metacells_per_gene = ut.sum_per(is_high_fold, per="column")
    required_metacells = adata.n_obs * min_metacells_fraction
    biased_genes_mask = high_metacells_per_gene >= required_metacells

    ut.set_v_data(adata, to_property_name, biased_genes_mask)
Example #7
0
def compute_similar_query_metacells(
    adata: AnnData,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Mark query metacells that are similar to their projection on the atlas.

    This does not guarantee the query metacell is "the same as" its projection on the atlas; rather, it means the two
    are sufficiently similar that one can be "reasonably confident" in applying atlas metadata to the query metacell
    based on the projection, which is a much lower bar.

    **Input**

    Annotated query ``adata``, where the observations are cells and the variables are genes.

    The data should contain per-observation-per-variable annotations ``projected_fold`` with the significant projection
    folds factors, as computed by :py:func:`compute_significant_projected_fold_factors`.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations

        ``similar``
            A boolean mask indicating the query metacell is similar to its projection in the atlas.

    **Computation Parameters**

    1. Mark as dissimilar any query metacells which have even one gene whose projection fold is above
       ``max_projection_fold_factor``. If ``abs_folds``, the absolute value of the fold is compared.
    """
    assert max_projection_fold_factor >= 0

    folds = ut.get_vo_proper(adata, "projected_fold", layout="row_major")
    if abs_folds:
        folds = np.abs(folds)  # type: ignore

    exceeds_threshold = folds > max_projection_fold_factor  # type: ignore
    exceeding_genes_per_metacell = ut.sum_per(exceeds_threshold, per="row")  # type: ignore

    # A metacell is similar only if none of its genes exceeds the threshold.
    ut.set_o_data(adata, "similar", exceeding_genes_per_metacell == 0)
Example #8
0
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix. Both data sets must have identical variable (gene) names, in the same order.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the atlas
    metacell in the projection of the query metacells. The sum of weights in each row (that is, for a single query
    metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the query
    metacell onto the atlas.

    This function itself does not set any annotations in ``qdata`` or ``adata``.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacells onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    # Explicit totals override the per-row sums of the data.
    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    # Temporarily shift the fractions in place to take the normalized log2 without allocating
    # another pair of matrices, then shift them back.
    atlas_fractions += fold_normalization
    query_fractions += fold_normalization

    atlas_log_fractions = np.log2(atlas_fractions)
    query_log_fractions = np.log2(query_fractions)

    atlas_fractions -= fold_normalization
    query_fractions -= fold_normalization

    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    # Assemble the per-metacell (atlas indices, weights) pairs into a single CSR matrix: the
    # concatenated pairs are the CSR indices/data, and the running total of the per-row sizes
    # (with a leading zero) is the CSR row pointer.
    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
Example #9
0
def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    """
    Collect, for each candidate group of cells, a compressed matrix of per-cell-per-gene fold factors.

    ``candidate_of_cells`` holds the candidate index of each cell (row of ``data``); negative values mark cells that
    belong to no candidate. Candidate indices are expected to be dense, that is, every index from zero up to the
    maximal one must have at least one cell.

    Returns two parallel lists with one entry per candidate, in candidate index order, optionally followed by one
    final entry for all the cells without a candidate: the CSR fold factors matrix of the group, and the indices (into
    ``data``) of the cells that make up the rows of that matrix. Groups with fewer than two cells, and the group of
    cells without a candidate, get an all-zero matrix.
    """
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape
    candidates_count = np.max(candidate_of_cells) + 1

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)
    # Tracks how many cells were not claimed by any candidate.
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]

        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        # A group with a single cell gets an empty (all-zero) matrix instead of computed fold factors.
        if candidate_cells_count < 2:
            compressed = sparse.csr_matrix(
                ([], [], [0] * (candidate_cells_count + 1)), shape=(candidate_cells_count, genes_count)
            )
            list_of_fold_factors.append(compressed)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format
            continue

        # Work on a private copy of the rows of the candidate's cells.
        # NOTE(review): the extension kernels below are called only for their side effects, so they presumably
        # overwrite this copy in place - confirm against the extension sources.
        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        # Fraction of each gene out of the total of all the genes in the candidate's cells.
        fractions_of_candidate_genes = ut.to_numpy_vector(totals_of_candidate_genes / np.sum(totals_of_candidate_genes))

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            # Nothing to compute for an all-zero matrix; use it as-is.
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            # Pick the extension function specialized for the dtypes of the compressed arrays.
            extension_name = "fold_factor_compressed_%s_t_%s_t_%s_t" % (  # pylint: disable=consider-using-f-string
                compressed.data.dtype,
                compressed.indices.dtype,
                compressed.indptr.dtype,
            )
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            # Drop the explicitly stored zeros left in the compressed data after the kernel ran.
            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            # Pick the extension function specialized for the dtype of the dense matrix.
            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            # The result is always returned in compressed (CSR) format.
            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    # Cells without a candidate (negative index) are collected into a final all-zero group.
    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        compressed = sparse.csr_matrix(
            ([], [], [0] * (remaining_cells_count + 1)), shape=(remaining_cells_count, genes_count)
        )
        assert compressed.has_sorted_indices
        assert compressed.has_canonical_format
        list_of_fold_factors.append(compressed)

    return list_of_fold_factors, list_of_cell_index_of_rows
Example #10
0
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    The total number of UMIs varies from cell to cell, due to both technical effects and natural
    variance. Analysis is typically restricted to cells that contain a sufficient number of UMIs to
    be meaningful, and sometimes also excludes cells which have "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless
       it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless
       it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must
       not be ``None`` and should contain just the excluded genes data for each cell. Exclude all
       cells whose sum of the excluded data divided by the total data is more than the specified
       threshold.
    """
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    totals = ut.get_o_numpy(adata, what, sum=True)

    proper_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        proper_mask = proper_mask & (totals >= min_cell_total)

    if max_cell_total is not None:
        proper_mask = proper_mask & (totals <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_totals = ut.sum_per(ut.get_vo_proper(excluded_adata, layout="row_major"), per="row")

        # Avoid dividing by zero: use a denominator of one for cells with no data at all (on a
        # copy, so the fetched totals are left untouched).
        if np.min(totals) == 0:
            totals = np.copy(totals)
            totals[totals == 0] = 1

        excluded_fractions = excluded_totals / totals
        proper_mask = proper_mask & (excluded_fractions <= max_excluded_genes_fraction)

    if not inplace:
        ut.log_return("properly_sampled_cell", proper_mask)
        return ut.to_pandas_series(proper_mask, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", proper_mask)
    return None
Example #11
0
def compute_deviant_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: Union[str, ut.Vector] = "similar",
    significant_gene_fold_factor: float = pr.significant_gene_fold_factor,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells) or, if an outlier, to the most
    similar groups, compute for each observation and gene the fold factor relative to its group
    for the purpose of detecting deviant cells.

    Ideally, all grouped cells would have no genes with high enough fold factors to be considered deviants, and all
    outlier cells would. In practice grouped cells might have a (few) such genes due to the restriction on the fraction
    of deviants.

    It is important not to read too much into the results for a single cell, but looking at which genes appear for cell
    populations (e.g., cells with specific metadata such as batch identification) might be instructive.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``deviant_folds``
            The fold factor between the cell's UMIs and the expected number of UMIs for the purpose of computing
            deviant cells. This is stored as a sparse (CSR) matrix.

    **Computation Parameters**

    1. For each cell, compute the expected UMIs for each gene given the fraction of the gene in the metacells associated
       with the cell (the one it belongs to, or the most similar one for outliers). If this is less than
       ``significant_gene_fold_factor`` (default: {significant_gene_fold_factor}), set it to zero so the result will be
       sparse.
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    similar_of_cells = ut.get_o_numpy(adata, similar, formatter=ut.groups_description)

    @ut.timed_call("compute_cell_deviant_certificates")
    def _compute_cell_deviant_certificates(cell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _compute_cell_certificates(
            cell_index=cell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            similar_of_cells=similar_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
            significant_gene_fold_factor=significant_gene_fold_factor,
        )

    results = list(ut.parallel_map(_compute_cell_deviant_certificates, adata.n_obs))

    # Each result is a pair of (gene indices, fold factors) for a single cell. Flatten them into
    # coordinate (row, column, value) triplets to build the sparse matrix.
    cell_indices = np.concatenate(
        [np.full(len(result[0]), cell_index, dtype="int32") for cell_index, result in enumerate(results)]
    )
    gene_indices = np.concatenate([result[0] for result in results])
    fold_factors = np.concatenate([result[1] for result in results])

    deviant_folds = sparse.csr_matrix((fold_factors, (cell_indices, gene_indices)), shape=adata.shape)
    ut.set_vo_data(adata, "deviant_folds", deviant_folds)
Example #12
0
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: bool = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the metacell.
    The result per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting them to zero).
    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this indicates the
    metacells contains "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory nerves
    which are all similar except for each one expressing one different gene), due to batch effects (similar cells in
    distinct batches differing in some genes due to technical issues), due to low data quality (the overall noise level
    is so high that this is simply the best the algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless the
            value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold factor
       log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor`` (default:
       {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    # One vector of per-gene folds for each metacell, stacked into a dense metacells x genes matrix.
    results = list(ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row, "column_major")

    # The thresholds are compared against either the absolute or the signed folds, but the stored
    # values always keep their sign.
    if inner_abs_folds:
        comparable_dense_inner_folds_by_column = np.abs(dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column

    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column, per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)

    # Zero whole columns of insignificant genes, then the individual too-low entries.
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column < min_entry_inner_fold_factor] = 0

    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column, layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
Example #13
0
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    """
    Fill in the per-gene variance statistics of a single group of cells.

    The result is written into row ``group_index`` of the two output matrices
    ``variance_per_gene_per_group`` and
    ``normalized_variance_per_gene_per_group``; nothing is returned.

    1. Collect the cells whose entry in ``group_of_cells`` equals
       ``group_index``. If there are fewer than two such cells, return without
       touching the output rows.

    2. If ``compatible_size`` is given and is smaller than the number of cells
       in the group, keep only a random subset of that many cells (chosen
       without replacement, reproducibly for a given ``random_seed``).

    3. Downsample each of the remaining cells (rows of the ``row_major``
       ``cells_data``) to a common number of samples: the
       ``downsample_min_cell_quantile`` quantile of the per-cell totals, but
       at least ``downsample_min_samples``, and at most the
       ``downsample_max_cell_quantile`` quantile, rounded to an integer.

    4. Compute the variance and the normalized variance of each gene (column)
       of the downsampled data. Genes whose total in the downsampled data is
       less than ``min_gene_total`` get ``NaN`` in both outputs.
    """
    cell_indices = np.where(group_of_cells == group_index)[0]
    cells_count = len(cell_indices)
    if cells_count < 2:
        return

    if compatible_size is None:
        ut.log_calc("  cells", cells_count)
    else:
        assert 0 < compatible_size <= cells_count
        if compatible_size < cells_count:
            if ut.logging_calc():
                ut.log_calc("  cells: " + ut.ratio_description(
                    cells_count, "cell", compatible_size, "compatible"))
            # Use a private generator rather than ``np.random.seed`` so that
            # picking the subset does not reset the process-wide NumPy random
            # state as a side effect. A ``RandomState`` seeded with the same
            # value produces the same draw as the seeded global generator.
            generator = np.random.RandomState(random_seed)
            cell_indices = generator.choice(cell_indices,
                                            size=compatible_size,
                                            replace=False)
            assert len(cell_indices) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    group_data = cells_data[cell_indices, :]

    total_per_cell = ut.sum_per(group_data, per="row")
    min_quantile_total = np.quantile(total_per_cell,
                                     downsample_min_cell_quantile)
    max_quantile_total = np.quantile(total_per_cell,
                                     downsample_max_cell_quantile)
    # The upper quantile wins over the minimal samples count, so that no more
    # than the allowed fraction of the cells ends up below the target.
    samples = int(
        round(
            min(max(downsample_min_samples, min_quantile_total),
                max_quantile_total)))
    if ut.logging_calc():
        ut.log_calc(f"  samples: {samples}")
    downsampled_data = ut.downsample_matrix(group_data,
                                            per="row",
                                            samples=samples,
                                            random_seed=random_seed)

    # The statistics below are all per gene, so switch to column-major.
    downsampled_data = ut.to_layout(downsampled_data, layout="column_major")
    total_per_gene = ut.sum_per(downsampled_data, per="column")
    too_small_genes = total_per_gene < min_gene_total
    if ut.logging_calc():
        included_genes_count = len(too_small_genes) - np.sum(too_small_genes)
        ut.log_calc(f"  included genes: {included_genes_count}")

    variance_per_gene = ut.variance_per(downsampled_data, per="column")
    normalized_variance_per_gene = ut.normalized_variance_per(downsampled_data,
                                                              per="column")

    # Mark genes with too little data as "no value" using an explicit NaN
    # (instead of relying on NumPy coercing ``None`` to NaN on assignment).
    variance_per_gene[too_small_genes] = np.nan
    normalized_variance_per_gene[too_small_genes] = np.nan

    variance_per_gene_per_group[group_index, :] = variance_per_gene
    normalized_variance_per_gene_per_group[
        group_index, :] = normalized_variance_per_gene