def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    The ``weights`` of the projection, where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas metacell for projecting the query metacell, such that the sum of weights in each
    row is one.

    **Returns**

    Sets the following annotations in ``qdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``projected``
            The number of UMIs of each gene in the projected image of the query metacell on the atlas, such that the
            total number of UMIs in the projection is equal to the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas and the query based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMIs count based on the total UMIs of each metacell. Note that if overriding the total
       atlas or query UMIs, this means that the result need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
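# Hypothetical usage sketch (not part of the module): assuming `weights` were obtained
# from project_query_onto_atlas, this writes the per-gene UMIs of each query metacell's
# projected image into the "projected" layer of `qdata`.
def _example_query_projection(adata: AnnData, qdata: AnnData, weights: ut.Matrix) -> None:
    compute_query_projection(adata=adata, qdata=qdata, weights=weights)
    projected_umis = ut.get_vo_proper(qdata, "projected", layout="row_major")
    assert projected_umis.shape == qdata.shape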
def test_downsample_matrix() -> None:
    rvs = stats.poisson(10, loc=10).rvs
    matrix = sparse.random(1000, 10000, format="csr", dtype="int32", random_state=123456, data_rvs=rvs)
    assert matrix.nnz == matrix.shape[0] * matrix.shape[1] * 0.01
    old_row_sums = ut.sum_per(matrix, per="row")
    min_sum = np.min(old_row_sums)

    result = ut.downsample_matrix(matrix, per="row", samples=int(min_sum))
    assert result.shape == matrix.shape
    new_row_sums = ut.sum_per(result, per="row")
    assert np.all(new_row_sums == min_sum)

    matrix = matrix.toarray()
    result = ut.downsample_matrix(matrix, per="row", samples=int(min_sum))
    assert result.shape == matrix.shape
    new_row_sums = ut.sum_per(result, per="row")
    assert np.all(new_row_sums == min_sum)
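# Minimal sketch (toy values, names assumed) of the property the test above relies on:
# downsampling each row to a fixed number of samples leaves every row summing to exactly
# that number, as long as the original row total is at least that large.
def _example_downsample_rows() -> None:
    small = np.array([[5, 5, 10], [1, 2, 3]], dtype="int32")
    downsampled = ut.downsample_matrix(small, per="row", samples=3, random_seed=123456)
    assert np.all(ut.sum_per(downsampled, per="row") == 3)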
def _test_per(rows_matrix: ut.Matrix) -> None:
    columns_matrix = ut.to_layout(rows_matrix, layout="column_major")

    assert np.allclose(ut.nnz_per(rows_matrix, per="row"), np.array([2, 3]))
    assert np.allclose(ut.nnz_per(columns_matrix, per="column"), np.array([1, 2, 2]))

    assert np.allclose(ut.sum_per(rows_matrix, per="row"), np.array([3, 12]))
    assert np.allclose(ut.sum_per(columns_matrix, per="column"), np.array([3, 5, 7]))

    assert np.allclose(ut.max_per(rows_matrix, per="row"), np.array([2, 5]))
    assert np.allclose(ut.max_per(columns_matrix, per="column"), np.array([3, 4, 5]))

    assert np.allclose(ut.min_per(rows_matrix, per="row"), np.array([0, 3]))
    assert np.allclose(ut.min_per(columns_matrix, per="column"), np.array([0, 1, 2]))

    assert np.allclose(ut.sum_squared_per(rows_matrix, per="row"), np.array([5, 50]))
    assert np.allclose(ut.sum_squared_per(columns_matrix, per="column"), np.array([9, 17, 29]))

    assert np.allclose(ut.fraction_per(rows_matrix, per="row"), np.array([3 / 15, 12 / 15]))
    assert np.allclose(ut.fraction_per(columns_matrix, per="column"), np.array([3 / 15, 5 / 15, 7 / 15]))

    assert np.allclose(ut.mean_per(rows_matrix, per="row"), np.array([3 / 3, 12 / 3]))
    assert np.allclose(ut.mean_per(columns_matrix, per="column"), np.array([3 / 2, 5 / 2, 7 / 2]))

    assert np.allclose(
        ut.variance_per(rows_matrix, per="row"), np.array([5 / 3 - (3 / 3) ** 2, 50 / 3 - (12 / 3) ** 2])
    )
    assert np.allclose(
        ut.variance_per(columns_matrix, per="column"),
        np.array([9 / 2 - (3 / 2) ** 2, 17 / 2 - (5 / 2) ** 2, 29 / 2 - (7 / 2) ** 2]),
    )
    assert np.allclose(
        ut.normalized_variance_per(columns_matrix, per="column"),
        np.array(
            [(9 / 2 - (3 / 2) ** 2) / (3 / 2), (17 / 2 - (5 / 2) ** 2) / (5 / 2), (29 / 2 - (7 / 2) ** 2) / (7 / 2)]
        ),
    )

    dense = ut.to_numpy_matrix(ut.fraction_by(rows_matrix, by="row"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 3, 2 / 3], [3 / 12, 4 / 12, 5 / 12]]))

    dense = ut.to_numpy_matrix(ut.fraction_by(columns_matrix, by="column"))
    assert np.allclose(dense, np.array([[0 / 3, 1 / 5, 2 / 7], [3 / 3, 4 / 5, 5 / 7]]))
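# Minimal sketch of invoking the helper above: the expected values correspond to the
# 2x3 matrix [[0, 1, 2], [3, 4, 5]] (e.g. row sums [3, 12], column sums [3, 5, 7]),
# in both dense and CSR forms.
def _example_test_per() -> None:
    dense_rows = np.array([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
    _test_per(dense_rows)
    _test_per(sparse.csr_matrix(dense_rows))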
def _filter_cells(
    *,
    cells_count: int,
    genes_count: int,
    deviant_genes_fold_ranks: ut.NumpyMatrix,
    deviant_gene_indices: ut.NumpyVector,
    max_cell_fraction: Optional[float],
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    min_fold_ranks_of_cells = np.min(deviant_genes_fold_ranks, axis=1)
    assert min_fold_ranks_of_cells.size == cells_count

    threshold_cells_fold_rank = cells_count
    mask_of_deviant_cells = min_fold_ranks_of_cells < threshold_cells_fold_rank
    deviants_cells_count = sum(mask_of_deviant_cells)
    deviant_cell_fraction = deviants_cells_count / cells_count

    if ut.logging_calc():
        ut.log_calc("deviant_cells", mask_of_deviant_cells)

    if max_cell_fraction is not None and deviant_cell_fraction > max_cell_fraction:
        quantile_cells_fold_rank = np.quantile(min_fold_ranks_of_cells, max_cell_fraction)
        assert quantile_cells_fold_rank is not None
        ut.log_calc("quantile_cells_fold_rank", quantile_cells_fold_rank)

        if quantile_cells_fold_rank < threshold_cells_fold_rank:
            threshold_cells_fold_rank = quantile_cells_fold_rank

    ut.log_calc("threshold_cells_fold_rank", threshold_cells_fold_rank)
    deviant_votes = deviant_genes_fold_ranks < threshold_cells_fold_rank

    votes_of_deviant_cells = ut.sum_per(ut.to_layout(deviant_votes, "row_major"), per="row")
    assert votes_of_deviant_cells.size == cells_count

    votes_of_deviant_genes = ut.sum_per(deviant_votes, per="column")
    assert votes_of_deviant_genes.size == deviant_gene_indices.size

    votes_of_all_genes = np.zeros(genes_count, dtype="int32")
    votes_of_all_genes[deviant_gene_indices] = votes_of_deviant_genes

    return votes_of_deviant_cells, votes_of_all_genes
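# Worked sketch (toy numbers, not from the library) of the quantile capping above: with
# minimal fold ranks [0, 1, 5, 9] and max_cell_fraction = 0.5, np.quantile(..., 0.5)
# interpolates to 3.0, so only the two cells whose minimal fold rank is below 3 keep
# their "deviant" votes.
def _example_quantile_threshold() -> None:
    min_fold_ranks = np.array([0, 1, 5, 9])
    threshold = np.quantile(min_fold_ranks, 0.5)
    assert threshold == 3.0
    assert np.sum(min_fold_ranks < threshold) == 2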
def _weigh_edges(pruned_ranks: ut.CompressedMatrix) -> ut.CompressedMatrix:
    size = pruned_ranks.shape[0]
    total_ranks_per_row = ut.sum_per(pruned_ranks, per="row")
    ut.timed_parameters(size=size)

    scale_per_row = np.reciprocal(total_ranks_per_row, out=total_ranks_per_row)
    edge_weights = pruned_ranks.multiply(scale_per_row[:, None])
    edge_weights = ut.to_layout(edge_weights, "row_major")

    _assert_proper_compressed(edge_weights, "csr")
    return edge_weights
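# Minimal sketch (toy data) of the row scaling above: dividing each row of the pruned
# ranks by its total turns the outgoing edges of every node into weights summing to one.
def _example_row_normalized_weights() -> None:
    ranks = sparse.csr_matrix(np.array([[0.0, 1.0, 3.0], [2.0, 0.0, 2.0], [1.0, 1.0, 0.0]]))
    totals = np.asarray(ranks.sum(axis=1)).ravel()
    weights = ranks.multiply(np.reciprocal(totals)[:, None]).tocsr()
    assert np.allclose(np.asarray(weights.sum(axis=1)).ravel(), 1.0)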
def find_biased_genes(
    adata: AnnData,
    *,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    min_metacells_fraction: float = pr.biased_min_metacells_fraction,
    abs_folds: bool = pr.project_abs_folds,
    to_property_name: str = "biased_gene",
) -> None:
    """
    Find genes that have a strong bias in the query compared to the atlas.

    **Input**

    Annotated query ``adata``, where the observations are query metacells and the variables are genes. This should
    contain a ``projected_fold`` per-variable-per-observation matrix with the fold factor between each query metacell
    and its projected image on the atlas.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``biased_gene`` (or ``to_property_name``):
            A boolean mask indicating whether the gene has a strong bias in the query compared to the atlas.

    **Computation Parameters**

    1. For each gene, count the number of query metacells for which the ``projected_fold`` is above
       ``max_projection_fold_factor``. If ``abs_folds`` (default: {abs_folds}), consider the absolute fold factor.

    2. Mark the gene as biased if this count is at least a ``min_metacells_fraction`` (default:
       {min_metacells_fraction}) of the metacells.
    """
    assert max_projection_fold_factor >= 0
    assert 0 <= min_metacells_fraction <= 1

    projected_fold = ut.get_vo_proper(adata, "projected_fold", layout="column_major")
    if abs_folds:
        projected_fold = np.abs(projected_fold)  # type: ignore

    high_projection_folds = ut.to_numpy_matrix(projected_fold > max_projection_fold_factor)  # type: ignore
    ut.log_calc("high_projection_folds", high_projection_folds)

    count_of_genes = ut.sum_per(high_projection_folds, per="column")
    min_count = adata.n_obs * min_metacells_fraction
    mask_of_genes = count_of_genes >= min_count

    ut.set_v_data(adata, to_property_name, mask_of_genes)
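# Hypothetical usage sketch (values assumed): given query metacells that already carry a
# "projected_fold" layer, flag genes whose projection fold is high in at least half of
# the metacells.
def _example_find_biased_genes(qdata: AnnData) -> ut.NumpyVector:
    find_biased_genes(qdata, min_metacells_fraction=0.5)
    return qdata.var["biased_gene"].values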
def compute_similar_query_metacells(
    adata: AnnData,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Mark query metacells that are similar to their projection on the atlas.

    This does not guarantee the query metacell is "the same as" its projection on the atlas; rather, it means the two
    are sufficiently similar that one can be "reasonably confident" in applying atlas metadata to the query metacell
    based on the projection, which is a much lower bar.

    **Input**

    Annotated query ``adata``, where the observations are query metacells and the variables are genes. The data
    should contain a per-observation-per-variable annotation ``projected_fold`` with the significant projection fold
    factors, as computed by :py:func:`compute_significant_projected_fold_factors`.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating the query metacell is similar to its projection in the atlas.

    **Computation Parameters**

    1. Mark as dissimilar any query metacells which have even one gene whose projection fold is above
       ``max_projection_fold_factor``.
    """
    assert max_projection_fold_factor >= 0

    projected_folds = ut.get_vo_proper(adata, "projected_fold", layout="row_major")
    if abs_folds:
        projected_folds = np.abs(projected_folds)  # type: ignore
    high_folds = projected_folds > max_projection_fold_factor  # type: ignore
    high_folds_per_metacell = ut.sum_per(high_folds, per="row")  # type: ignore

    similar_mask = high_folds_per_metacell == 0
    ut.set_o_data(adata, "similar", similar_mask)
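# Hypothetical usage sketch (threshold assumed): once "projected_fold" has been computed
# for the query metacells, mark which of them are close enough to their atlas projection.
def _example_mark_similar(qdata: AnnData) -> ut.NumpyVector:
    compute_similar_query_metacells(qdata, max_projection_fold_factor=3.0)
    return qdata.obs["similar"].values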
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes,
    where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the
    atlas metacell in the projection of the query metacell. The sum of weights in each row (that is, for a single
    query metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the
    query metacell onto the atlas.

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating whether the query metacell is similar to its projection onto the atlas. If
            ``False`` the metacell is said to be "dissimilar", which may indicate the query contains cell states that
            do not appear in the atlas.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes
       whose sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value``
       (default: {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacell onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero.

    If ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
    actual data.
""" assert fold_normalization > 0 assert candidates_count > 0 assert min_usage_weight >= 0 assert max_consistency_fold_factor >= 0 assert np.all(adata.var_names == qdata.var_names) atlas_umis = ut.get_vo_proper(adata, what, layout="row_major") query_umis = ut.get_vo_proper(qdata, what, layout="row_major") if atlas_total_umis is None: atlas_total_umis = ut.sum_per(atlas_umis, per="row") atlas_total_umis = ut.to_numpy_vector(atlas_total_umis) if query_total_umis is None: query_total_umis = ut.sum_per(query_umis, per="row") query_total_umis = ut.to_numpy_vector(query_total_umis) atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis)) query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis)) atlas_fractions += fold_normalization query_fractions += fold_normalization atlas_log_fractions = np.log2(atlas_fractions) query_log_fractions = np.log2(query_fractions) atlas_fractions -= fold_normalization query_fractions -= fold_normalization if project_log_data: atlas_project_data = atlas_log_fractions query_project_data = query_log_fractions else: atlas_project_data = atlas_fractions query_project_data = query_fractions query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible) @ut.timed_call("project_single_metacell") def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]: return _project_single_metacell( atlas_umis=atlas_umis, query_atlas_corr=query_atlas_corr, atlas_project_data=atlas_project_data, query_project_data=query_project_data, atlas_log_fractions=atlas_log_fractions, candidates_count=candidates_count, min_significant_gene_value=min_significant_gene_value, min_usage_weight=min_usage_weight, max_consistency_fold_factor=max_consistency_fold_factor, query_metacell_index=query_metacell_index, ) results = list(ut.parallel_map(_project_single, qdata.n_obs)) indices = np.concatenate([result[0] for result in results], dtype="int32") data = np.concatenate([result[1] for result in results], dtype="float32") atlas_used_sizes = [len(result[0]) for result in results] atlas_used_sizes.insert(0, 0) indptr = np.cumsum(np.array(atlas_used_sizes)) return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape
    candidates_count = np.max(candidate_of_cells) + 1

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]

        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        if candidate_cells_count < 2:
            compressed = sparse.csr_matrix(
                ([], [], [0] * (candidate_cells_count + 1)), shape=(candidate_cells_count, genes_count)
            )
            list_of_fold_factors.append(compressed)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format
            continue

        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        fractions_of_candidate_genes = ut.to_numpy_vector(
            totals_of_candidate_genes / np.sum(totals_of_candidate_genes)
        )

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            extension_name = "fold_factor_compressed_%s_t_%s_t_%s_t" % (  # pylint: disable=consider-using-f-string
                compressed.data.dtype,
                compressed.indices.dtype,
                compressed.indptr.dtype,
            )
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        compressed = sparse.csr_matrix(
            ([], [], [0] * (remaining_cells_count + 1)), shape=(remaining_cells_count, genes_count)
        )
        assert compressed.has_sorted_indices
        assert compressed.has_canonical_format
        list_of_fold_factors.append(compressed)

    return list_of_fold_factors, list_of_cell_index_of_rows
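# Rough pure-Python sketch of the fold factors the compiled fold_factor_dense_* extension
# is assumed to compute (an assumption based on the deviants docstrings, not a reading of
# the extension itself): compare each cell's UMIs with the UMIs expected from the
# candidate-wide gene fractions, and zero out folds below min_gene_fold_factor.
def _example_dense_fold_factors(
    dense: ut.NumpyMatrix,
    totals_of_cells: ut.NumpyVector,
    fractions_of_genes: ut.NumpyVector,
    min_gene_fold_factor: float,
) -> ut.NumpyMatrix:
    expected = totals_of_cells[:, None] * fractions_of_genes[None, :]
    fold_factors = np.log2((dense + 1.0) / (expected + 1.0))
    fold_factors[np.abs(fold_factors) < min_gene_fold_factor] = 0.0
    return fold_factors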
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between cells, the total number of UMIs varies from cell to
    cell. We often would like to work on cells that contain a sufficient number of UMIs for meaningful analysis; we
    sometimes also wish to exclude cells which have "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must not be ``None``
       and should contain just the excluded genes data for each cell. Exclude all cells whose sum of the excluded
       data divided by the total data is more than the specified threshold.
    """
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    total_of_cells = ut.get_o_numpy(adata, what, sum=True)

    cells_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        cells_mask = cells_mask & (total_of_cells >= min_cell_total)

    if max_cell_total is not None:
        cells_mask = cells_mask & (total_of_cells <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_data = ut.get_vo_proper(excluded_adata, layout="row_major")
        excluded_of_cells = ut.sum_per(excluded_data, per="row")
        if np.min(total_of_cells) == 0:
            total_of_cells = np.copy(total_of_cells)
            total_of_cells[total_of_cells == 0] = 1
        excluded_fraction = excluded_of_cells / total_of_cells
        cells_mask = cells_mask & (excluded_fraction <= max_excluded_genes_fraction)

    if inplace:
        ut.set_o_data(adata, "properly_sampled_cell", cells_mask)
        return None

    ut.log_return("properly_sampled_cell", cells_mask)
    return ut.to_pandas_series(cells_mask, index=adata.obs_names)
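# Hypothetical usage sketch (toy thresholds): keep cells with between 800 and 8,000 total
# UMIs, without the optional excluded-genes filter.
def _example_properly_sampled(adata: AnnData) -> None:
    find_properly_sampled_cells(
        adata,
        min_cell_total=800,
        max_cell_total=8000,
        max_excluded_genes_fraction=None,
    )
    ut.log_calc("properly sampled cells", np.sum(adata.obs["properly_sampled_cell"].values))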
def compute_deviant_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: Union[str, ut.Vector] = "similar",
    significant_gene_fold_factor: float = pr.significant_gene_fold_factor,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells) or, if an outlier, to the most similar group,
    compute for each observation and gene the fold factor relative to its group, for the purpose of detecting deviant
    cells.

    Ideally, all grouped cells would have no genes with high enough fold factors to be considered deviants, and all
    outlier cells would. In practice, grouped cells might have a (few) such genes due to the restriction on the
    fraction of deviants.

    It is important not to read too much into the results for a single cell, but looking at which genes appear for
    cell populations (e.g., cells with specific metadata such as batch identification) might be instructive.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and to use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``deviant_fold``
            The fold factor between the cell's UMIs and the expected number of UMIs, for the purpose of computing
            deviant cells.

    **Computation Parameters**

    1. For each cell, compute the expected UMIs of each gene given the fraction of the gene in the metacell
       associated with the cell (the one it belongs to, or the most similar one for outliers), and the fold factor of
       the actual UMIs relative to these expected UMIs. If this fold factor is less than
       ``significant_gene_fold_factor`` (default: {significant_gene_fold_factor}), set it to zero so the result will
       be sparse.
    """
    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    similar_of_cells = ut.get_o_numpy(adata, similar, formatter=ut.groups_description)

    @ut.timed_call("compute_cell_deviant_certificates")
    def _compute_cell_deviant_certificates(cell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _compute_cell_certificates(
            cell_index=cell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            similar_of_cells=similar_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
            significant_gene_fold_factor=significant_gene_fold_factor,
        )

    results = list(ut.parallel_map(_compute_cell_deviant_certificates, adata.n_obs))

    cell_indices = np.concatenate(
        [np.full(len(result[0]), cell_index, dtype="int32") for cell_index, result in enumerate(results)]
    )
    gene_indices = np.concatenate([result[0] for result in results])
    fold_factors = np.concatenate([result[1] for result in results])

    deviant_folds = sparse.csr_matrix((fold_factors, (cell_indices, gene_indices)), shape=adata.shape)
    ut.set_vo_data(adata, "deviant_fold", deviant_folds)
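# Hypothetical usage sketch: given cells annotated with their "metacell" group and the
# most-"similar" group for outliers, record the per-cell per-gene deviant fold factors
# relative to the associated metacell.
def _example_deviant_folds(adata: AnnData, gdata: AnnData) -> None:
    compute_deviant_fold_factors(adata=adata, gdata=gdata)
    deviant_fold = ut.get_vo_proper(adata, "deviant_fold", layout="row_major")
    assert deviant_fold.shape == adata.shape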
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: bool = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of each metacell, the same fold factors that are used to detect deviant cells (see
    :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the maximal fold for each gene in the
    metacell. The resulting per-metacell-per-gene matrix is then made sparse by discarding too-low values (setting
    them to zero). Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, this
    indicates the metacells contain "too much" variability. This may be due to actual biology (e.g. immune cells or
    olfactory nerves which are all similar except for each one expressing one different gene), due to batch effects
    (similar cells in distinct batches differing in some genes due to technical issues), due to low data quality (the
    overall noise level is so high that this is simply the best the algorithm can do), or worse, a combination of the
    above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and to use the same genes as ``adata``.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations
        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in the group (unless
            the value is too low to be of interest, in which case it will be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of the group) fold
       factor log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below ``min_gene_inner_fold_factor`` (default:
       {min_gene_inner_fold_factor}), then set all the gene's fold factors to zero (too low to be of interest). If
       ``inner_abs_folds`` (default: {inner_abs_folds}), consider the absolute fold factors.

    3. Otherwise, for any metacell whose fold factor for the gene is less than ``min_entry_inner_fold_factor``
       (default: {min_entry_inner_fold_factor}), set the fold factor to zero (too low to be of interest).
""" assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor cells_data = ut.get_vo_proper(adata, what, layout="row_major") metacells_data = ut.get_vo_proper(gdata, what, layout="row_major") group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description) total_umis_per_cell = ut.sum_per(cells_data, per="row") total_umis_per_metacell = ut.sum_per(metacells_data, per="row") @ut.timed_call("compute_metacell_inner_folds") def _compute_single_metacell_inner_folds( metacell_index: int) -> ut.NumpyVector: return _compute_metacell_inner_folds( metacell_index=metacell_index, cells_data=cells_data, metacells_data=metacells_data, group_of_cells=group_of_cells, total_umis_per_cell=total_umis_per_cell, total_umis_per_metacell=total_umis_per_metacell, ) results = list( ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs)) dense_inner_folds_by_row = np.array(results) dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row, "column_major") if inner_abs_folds: comparable_dense_inner_folds_by_column = np.abs( dense_inner_folds_by_column) else: comparable_dense_inner_folds_by_column = dense_inner_folds_by_column max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column, per="column") significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor ut.log_calc("significant_genes_mask", significant_genes_mask) dense_inner_folds_by_column[:, ~significant_genes_mask] = 0 dense_inner_folds_by_column[comparable_dense_inner_folds_by_column < min_entry_inner_fold_factor] = 0 dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column, layout="row_major") sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row) ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    cell_indices = np.where(group_of_cells == group_index)[0]
    cells_count = len(cell_indices)
    if cells_count < 2:
        return

    if compatible_size is None:
        ut.log_calc(" cells", cells_count)
    else:
        assert 0 < compatible_size <= cells_count
        if compatible_size < cells_count:
            np.random.seed(random_seed)
            if ut.logging_calc():
                ut.log_calc(
                    " cells: " + ut.ratio_description(len(cell_indices), "cell", compatible_size, "compatible")
                )
            cell_indices = np.random.choice(cell_indices, size=compatible_size, replace=False)
            assert len(cell_indices) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    group_data = cells_data[cell_indices, :]

    total_per_cell = ut.sum_per(group_data, per="row")
    samples = int(
        round(
            min(
                max(downsample_min_samples, np.quantile(total_per_cell, downsample_min_cell_quantile)),
                np.quantile(total_per_cell, downsample_max_cell_quantile),
            )
        )
    )
    if ut.logging_calc():
        ut.log_calc(f" samples: {samples}")
    downsampled_data = ut.downsample_matrix(group_data, per="row", samples=samples, random_seed=random_seed)

    downsampled_data = ut.to_layout(downsampled_data, layout="column_major")
    total_per_gene = ut.sum_per(downsampled_data, per="column")
    too_small_genes = total_per_gene < min_gene_total
    if ut.logging_calc():
        included_genes_count = len(too_small_genes) - np.sum(too_small_genes)
        ut.log_calc(f" included genes: {included_genes_count}")

    variance_per_gene = ut.variance_per(downsampled_data, per="column")
    normalized_variance_per_gene = ut.normalized_variance_per(downsampled_data, per="column")

    variance_per_gene[too_small_genes] = None
    normalized_variance_per_gene[too_small_genes] = None

    variance_per_gene_per_group[group_index, :] = variance_per_gene
    normalized_variance_per_gene_per_group[group_index, :] = normalized_variance_per_gene
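# Worked sketch (toy numbers) of the clamped downsampling target above: the per-cell
# total is pushed up to at least downsample_min_samples (here 750) but never above the
# downsample_max_cell_quantile of the totals (here the median, 950), so the target is
# min(max(750, 560), 950) == 750.
def _example_downsample_target() -> None:
    total_per_cell = np.array([500, 900, 1000, 4000])
    samples = int(
        round(
            min(
                max(750, np.quantile(total_per_cell, 0.05)),
                np.quantile(total_per_cell, 0.5),
            )
        )
    )
    assert samples == 750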