def _project_single_metacell(
    *,
    query_metacell_index: int,
    atlas_umis: ut.Matrix,
    query_atlas_corr: ut.NumpyMatrix,
    atlas_project_data: ut.NumpyMatrix,
    query_project_data: ut.NumpyMatrix,
    atlas_log_fractions: ut.NumpyMatrix,
    candidates_count: int,
    min_significant_gene_value: float,
    min_usage_weight: float,
    max_consistency_fold_factor: float,
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    """
    Express a single query metacell as a weighted combination of atlas metacells.

    The atlas metacell most correlated with the query metacell (row ``query_metacell_index`` of
    ``query_atlas_corr``) is used as the anchor. The other atlas metacells are then visited in decreasing
    correlation order, and each is accepted as a candidate if the maximal absolute difference between its
    ``atlas_log_fractions`` and the anchor's, taken over the genes whose combined UMIs (candidate plus
    anchor) are at least ``min_significant_gene_value``, is at most ``max_consistency_fold_factor / 2``. The
    search stops when ``candidates_count`` candidates (including the anchor) were collected or the atlas
    metacells were exhausted.

    The query metacell's row of ``query_project_data`` is then represented using the candidates' rows of
    ``atlas_project_data`` (by ``ut.represent``). Weights below ``min_usage_weight`` are zeroed and the
    remaining weights are rescaled to sum to one.

    Returns a tuple of the (sorted, ``int32``) indices of the atlas metacells with a positive weight, and
    their (``float32``) weights.
    """
    query_metacell_project_data = query_project_data[query_metacell_index, :]
    query_metacell_atlas_correlations = query_atlas_corr[query_metacell_index, :]
    # Negate so the most correlated atlas metacell comes first.
    query_metacell_atlas_order = np.argsort(-query_metacell_atlas_correlations)

    atlas_anchor_index = query_metacell_atlas_order[0]
    ut.log_calc("atlas_anchor_index", atlas_anchor_index)
    atlas_anchor_log_fractions = atlas_log_fractions[atlas_anchor_index, :]
    atlas_anchor_umis = ut.to_numpy_vector(atlas_umis[atlas_anchor_index, :])

    atlas_candidate_indices_list = [atlas_anchor_index]
    position = 1
    while len(atlas_candidate_indices_list) < candidates_count and position < len(query_metacell_atlas_order):
        atlas_metacell_index = query_metacell_atlas_order[position]
        position += 1

        atlas_metacell_log_fractions = atlas_log_fractions[atlas_metacell_index, :]
        atlas_metacell_consistency_fold_factors = np.abs(atlas_metacell_log_fractions - atlas_anchor_log_fractions)

        atlas_metacell_umis = ut.to_numpy_vector(atlas_umis[atlas_metacell_index, :])
        atlas_metacell_significant_genes_mask = atlas_metacell_umis + atlas_anchor_umis >= min_significant_gene_value

        atlas_metacell_consistency = np.max(
            atlas_metacell_consistency_fold_factors[atlas_metacell_significant_genes_mask]
        )
        # Each candidate is within half the allowed fold from the anchor, so any two candidates are within
        # the full allowed fold from each other.
        if atlas_metacell_consistency <= max_consistency_fold_factor / 2.0:
            atlas_candidate_indices_list.append(atlas_metacell_index)

    atlas_candidate_indices = np.array(sorted(atlas_candidate_indices_list))
    atlas_candidates_project_data = atlas_project_data[atlas_candidate_indices, :]

    represent_result = ut.represent(query_metacell_project_data, atlas_candidates_project_data)
    assert represent_result is not None
    atlas_candidate_weights = represent_result[1]

    # Drop negligible weights, then rescale ALL the remaining weights so they sum to one again. (Dividing
    # only the just-zeroed entries, as was done before, is a no-op and leaves the weights un-normalized.)
    atlas_candidate_weights[atlas_candidate_weights < min_usage_weight] = 0
    total_weight = np.sum(atlas_candidate_weights)
    if total_weight > 0:
        atlas_candidate_weights /= total_weight

    atlas_used_mask = atlas_candidate_weights > 0

    atlas_used_indices = atlas_candidate_indices[atlas_used_mask].astype("int32")
    ut.log_return("atlas_used_indices", atlas_used_indices)

    atlas_used_weights = atlas_candidate_weights[atlas_used_mask]
    atlas_used_weights = atlas_used_weights.astype("float32")
    ut.log_return("atlas_used_weights", atlas_used_weights)

    return (atlas_used_indices, atlas_used_weights)
def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes whose ``what`` (default: {what}) data has a high relative variance.

    The relative variance compares the variance / mean of each gene to that of the other genes with a
    similar level of expression. See :py:func:`metacells.utilities.computation.relative_variance_per` for
    details.

    Genes with a high relative variance are good candidates for being selected as "feature genes", that is,
    be used to compute the similarity between cells. Using the relative variance compensates for the bias
    for selecting higher-expression genes, whose normalized variance can be larger due to random noise
    alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per` (with the given
       ``window_size``) to get the relative variance of each gene.

    2. Select the genes whose relative variance is at least ``min_gene_relative_variance`` (default:
       {min_gene_relative_variance}).
    """
    expression = ut.get_vo_proper(adata, what, layout="column_major")
    gene_relative_variance = ut.relative_variance_per(expression, per="column", window_size=window_size)
    is_high_gene = gene_relative_variance >= min_gene_relative_variance

    if not inplace:
        ut.log_return("high_relative_variance_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", is_high_gene)
    return None
def find_top_feature_genes(
    adata: AnnData,
    *,
    max_genes: int = pr.max_top_feature_genes,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes with the highest ``feature_gene`` values.

    This is meant to run after computing metacells, to pick the "strongest" feature genes. With the direct
    algorithm (:py:func:`metacells.pipeline.direct.compute_direct_metacells`) all feature genes are equally
    "strong"; with the divide-and-conquer algorithm
    (:py:func:`metacells.pipeline.divide_and_conquer.divide_and_conquer_pipeline`,
    :py:func:`metacells.pipeline.divide_and_conquer.compute_divide_and_conquer_metacells`) this picks the
    genes which were most commonly used as features across all the piles.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``feature_gene`` is a per-variable (gene) annotation counting how many times each gene was used as a
    feature.

    **Returns**

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask indicating whether each gene was found to be a top feature gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Raise a ``feature_gene`` threshold, starting at 1, until at most ``max_genes`` (default: {max_genes})
       genes reach it or the threshold equals the maximal ``feature_gene`` value. More than ``max_genes``
       may therefore still be picked; for example, with the direct algorithm all the feature genes are
       returned as the ``feature_gene`` data can't distinguish between them.
    """
    usage_counts = ut.get_v_numpy(adata, "feature_gene", formatter=ut.mask_description)
    highest_count = np.max(usage_counts)
    assert highest_count > 0

    # Start above the limit so the loop body runs at least once.
    threshold = 0
    selected_count = max_genes + 1
    while threshold < highest_count and selected_count > max_genes:
        threshold += 1
        genes_mask = usage_counts >= threshold
        selected_count = np.sum(genes_mask)
        ut.log_calc(f"threshold: {threshold} selected: {selected_count}")

    if not inplace:
        ut.log_return("top_feature_gene", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "top_feature_gene", genes_mask)
    return None
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes whose ``what`` (default: {what}) data has a high normalized variance.

    The normalized variance is the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "noisy", that is, have significantly different expression
    level in different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the normalized
       variance of each gene.

    2. Select the genes whose normalized variance is at least ``min_gene_normalized_variance`` (default:
       {min_gene_normalized_variance}).
    """
    expression = ut.get_vo_proper(adata, what, layout="column_major")
    is_noisy_gene = ut.normalized_variance_per(expression, per="column") >= min_gene_normalized_variance

    if not inplace:
        ut.log_return("high_normalized_variance_genes", is_noisy_gene)
        return ut.to_pandas_series(is_noisy_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", is_noisy_gene)
    return None
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,  # pylint: disable=invalid-name
    min_gene_topN: int,  # pylint: disable=invalid-name
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which have a high top-Nth value of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling depth
    does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular, genes may have
    all-zero expression, in which case including them just slows the computations (and triggers numeric
    edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.rank_per` to get, for each gene, the value at rank
       ``max(n_obs - topN - 1, 1)`` across the cells.

    2. Select the genes whose such value is at least ``min_gene_topN``.
    """
    per_gene_data = ut.get_vo_proper(adata, what, layout="column_major")

    # NOTE(review): presumably ``rank`` is a zero-based position counted from the lowest value; confirm
    # the ``- 1`` offset and the lower bound of 1 against ``rank_per``.
    rank = max(adata.n_obs - topN - 1, 1)
    nth_value_of_genes = ut.rank_per(per_gene_data, per="column", rank=rank)
    genes_mask = nth_value_of_genes >= min_gene_topN

    if not inplace:
        ut.log_return(f"high_top{topN}_genes", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", genes_mask)
    return None
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes varies
    greatly between cells. This is exactly the information we are trying to analyze. We often would like to
    work on genes that have a sufficient level of expression for meaningful analysis. Specifically, it
    doesn't make sense to analyze genes that have zero expression in all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default: {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)
    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # The mask has one entry per gene, so it must be indexed by the variable (not the observation) names.
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which have a high fraction of the total ``what`` (default: {what}) data of the cells.

    Genes with too-low expression are typically excluded from computations. In particular, genes may have
    all-zero expression, in which case including them just slows the computations (and triggers numeric
    edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high fraction.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default: {min_gene_fraction}).
    """
    expression = ut.get_vo_proper(adata, what, layout="column_major")
    is_high_gene = ut.fraction_per(expression, per="column") >= min_gene_fraction

    if not inplace:
        ut.log_return("high_fraction_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", is_high_gene)
    return None
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which have a high total of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling depth
    does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular, genes may have
    all-zero expression, in which case including them just slows the computations (and triggers numeric
    edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high total.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Get the total of the data of each gene (the per-variable data fetched with ``sum=True``).

    2. Select the genes whose total is at least ``min_gene_total``.
    """
    gene_totals = ut.get_v_numpy(adata, what, sum=True)
    is_high_gene = gene_totals >= min_gene_total

    if not inplace:
        ut.log_return("high_total_genes", is_high_gene)
        return ut.to_pandas_series(is_high_gene, index=adata.var_names)

    ut.set_v_data(adata, "high_total_gene", is_high_gene)
    return None
def find_named_genes(
    adata: AnnData,
    *,
    names: Optional[Collection[str]] = None,
    patterns: Optional[Collection[Union[str, Pattern]]] = None,
    to: Optional[str] = None,
    invert: bool = False,
) -> Optional[ut.PandasSeries]:
    """
    Find genes by their (case-insensitive) name.

    The result is a mask of all the genes whose name appears in ``names`` or matches any of the
    ``patterns``. If ``invert`` (default: {invert}), the mask is inverted.

    If ``to`` (default: {to}) is specified, the mask is stored as a per-variable (gene) annotation with
    that name, and the function returns ``None``. This is useful to fill gene masks such as
    ``excluded_genes`` (genes which should be excluded from the rest of the processing) and
    ``forbidden_genes`` (genes which must not be chosen as feature genes).

    Otherwise, the mask is returned as a pandas series (indexed by the variable, that is gene, names).
    """
    if names is not None:
        wanted_lower_names = {name.lower() for name in names}
        by_name_mask = np.array([gene_name.lower() in wanted_lower_names for gene_name in adata.var_names])
    else:
        by_name_mask = np.zeros(adata.n_vars, dtype="bool")

    if patterns is not None:
        by_pattern_mask = ut.patterns_matches(patterns, adata.var_names)
    else:
        by_pattern_mask = np.zeros(adata.n_vars, dtype="bool")

    genes_mask = by_name_mask | by_pattern_mask
    if invert:
        genes_mask = ~genes_mask

    if to is None:
        ut.log_return("named_genes", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, to, genes_mask)
    return None
def _results(
    *,
    adata: AnnData,
    rare_module_of_cells: ut.NumpyVector,
    list_of_rare_gene_indices_of_modules: List[List[int]],
    inplace: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Publish the rare gene modules, either into ``adata`` or as a pair of data frames.

    For each module (in order), a per-gene boolean mask ``rare_gene_module_<index>`` marks the genes listed
    for it in ``list_of_rare_gene_indices_of_modules``; the per-gene ``rare_gene`` mask is the union of
    these. Per cell, ``cells_rare_gene_module`` is ``rare_module_of_cells`` and ``rare_cell`` marks the
    cells for which it is non-negative.

    If ``inplace``, these are written as annotations of ``adata`` and ``None`` is returned. Otherwise a
    tuple of two frames is returned: the per-observation (cell) one and the per-variable (gene) one.
    """
    # The highest module index assigned to any cell must be the last listed module.
    assert np.max(rare_module_of_cells) == len(list_of_rare_gene_indices_of_modules) - 1

    if not inplace:
        var_metrics = ut.to_pandas_frame(index=adata.var_names)

    rare_gene_mask = np.zeros(adata.n_vars, dtype="bool")
    for module_index, rare_gene_indices_of_module in enumerate(list_of_rare_gene_indices_of_modules):
        rare_module_gene_mask = np.zeros(adata.n_vars, dtype="bool")
        rare_module_gene_mask[rare_gene_indices_of_module] = True
        property_name = f"rare_gene_module_{module_index}"
        if inplace:
            ut.set_v_data(adata, property_name, rare_module_gene_mask)
        else:
            var_metrics[property_name] = rare_module_gene_mask
            ut.log_return(property_name, rare_module_gene_mask)
        rare_gene_mask |= rare_module_gene_mask

    if inplace:
        ut.set_v_data(adata, "rare_gene", rare_gene_mask)
    else:
        var_metrics["rare_gene"] = rare_gene_mask
        ut.log_return("rare_gene", rare_gene_mask)

    rare_cell_mask = rare_module_of_cells >= 0

    if inplace:
        ut.set_o_data(adata, "cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
        ut.set_o_data(adata, "rare_cell", rare_cell_mask)
        return None

    # Actually store the per-cell results in the returned frame (it used to be returned empty).
    obs_metrics = ut.to_pandas_frame(index=adata.obs_names)
    obs_metrics["cells_rare_gene_module"] = rare_module_of_cells
    obs_metrics["rare_cell"] = rare_cell_mask
    ut.log_return("cells_rare_gene_module", rare_module_of_cells, formatter=ut.groups_description)
    ut.log_return("rare_cell", rare_cell_mask)

    return obs_metrics, var_metrics
def compute_candidate_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "obs_outgoing_weights",
    *,
    target_metacell_size: float,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.candidates_cell_sizes,
    cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    cooldown_phase: float = pr.cooldown_phase,
    min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    min_metacell_cells: Optional[int] = pr.candidates_min_metacell_cells,
    max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    random_seed: int = 0,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Assign observations (cells) to (raw, candidate) metacells based on ``what`` data (a weighted directed
    graph).

    These candidate metacells typically go through additional vetting (e.g. deviant detection and
    dissolving too-small metacells) to obtain the final metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-observation-per-observation matrix where each row is the outgoing weights from each observation to
    the rest, or just the name of a per-observation-per-observation annotation containing such a matrix.
    Typically this matrix will be sparse for efficient processing.

    **Returns**

    Observation (Cell) Annotations
        ``candidate``
            The integer index of the (raw, candidate) metacell each cell belongs to. The metacells are in
            no particular order.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. We are trying to build metacells of ``target_metacell_size``, using the ``cell_sizes`` (default:
       {cell_sizes}) to assign a size for each node (cell). This can be a string name of a per-observation
       annotation or a vector of values.

    2. We start with an assignment of cells to ``cell_seeds`` (default: {cell_seeds}). If no seeds are
       provided, we use :py:func:`choose_seeds` using ``min_seed_size_quantile`` (default:
       {min_seed_size_quantile}) and ``max_seed_size_quantile`` (default: {max_seed_size_quantile}) to
       compute them, picking a number of seeds such that the average metacell size would match the target.

    3. We optimize the seeds using :py:func:`optimize_partitions` to obtain initial communities by
       maximizing the "stability" of the solution (probability of starting at a random node and moving
       either forward or backward in the graph and staying within the same metacell, divided by the
       probability of staying in the metacell if the edges connected random nodes). We pass it the
       ``cooldown_pass`` (default: {cooldown_pass}) and ``cooldown_node`` (default: {cooldown_node}).

    4. If ``min_split_size_factor`` (default: {min_split_size_factor}) is specified, split in two each
       community whose size is at least ``target_metacell_size * min_split_size_factor`` and re-optimize
       the solution (resulting in one additional metacell). Every time we re-optimize, we multiply 1 -
       ``cooldown_pass`` by 1 - ``cooldown_phase`` (default: {cooldown_phase}).

    5. If ``max_split_min_cut_strength`` (default: {max_split_min_cut_strength}) is specified, and the
       minimal cut of a candidate is lower, split it into two. If one of the partitions is smaller than
       ``min_cut_seed_cells``, then mark the cells in it as outliers, or if ``must_complete_cover`` is
       ``True``, skip the cut altogether.

    6. If ``max_merge_size_factor`` (default: {max_merge_size_factor}) or ``min_metacell_cells`` (default:
       {min_metacell_cells}) are specified, make outliers of cells of a community whose size is at most
       ``target_metacell_size * max_merge_size_factor`` or contains fewer cells and re-optimize, which will
       assign these cells to other metacells (resulting in one fewer metacell). We again apply the
       ``cooldown_phase`` every time we re-optimize.

    7. Repeat the above steps until all metacells candidates are in the acceptable size range.
    """
    edge_weights = ut.get_oo_proper(adata, what, layout="row_major")
    assert edge_weights.shape[0] == edge_weights.shape[1]
    assert 0.0 < cooldown_pass < 1.0
    assert 0.0 <= cooldown_node <= 1.0
    assert 0.0 < cooldown_phase <= 1.0
    size = edge_weights.shape[0]

    # Keep both a row-major (outgoing) and a column-major (incoming) compressed view of the same graph.
    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)
    assert ut.is_layout(outgoing_edge_weights, "row_major")
    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    # The helpers called below are given these arrays as-is, so insist on the exact element types.
    assert outgoing_edge_weights.data.dtype == "float32"
    assert outgoing_edge_weights.indices.dtype == "int32"
    assert outgoing_edge_weights.indptr.dtype == "int32"
    assert incoming_edge_weights.data.dtype == "float32"
    assert incoming_edge_weights.indices.dtype == "int32"
    assert incoming_edge_weights.indptr.dtype == "int32"

    # Without explicit cell sizes, every cell counts as size 1.
    node_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)
    if node_sizes is None:
        node_sizes = np.full(size, 1.0, dtype="float32")
    else:
        node_sizes = node_sizes.astype("float32")
    ut.log_calc("node_sizes", node_sizes, formatter=ut.sizes_description)

    assert target_metacell_size > 0
    max_metacell_size = None
    min_metacell_size = None

    # Largest size which is still strictly below the split threshold.
    if min_split_size_factor is not None:
        assert min_split_size_factor > 0
        max_metacell_size = ceil(target_metacell_size * min_split_size_factor) - 1
        ut.log_calc("max_metacell_size", max_metacell_size)

    # Smallest size which is still strictly above the merge threshold.
    if max_merge_size_factor is not None:
        assert max_merge_size_factor > 0
        min_metacell_size = floor(target_metacell_size * max_merge_size_factor) + 1
        ut.log_calc("min_metacell_size", min_metacell_size)

    # The target number of cells per metacell, but never less than the minimal allowed number of cells.
    target_metacell_cells = max(
        1.0 if min_metacell_cells is None else float(min_metacell_cells),
        float(target_metacell_size / np.mean(node_sizes)),
    )
    ut.log_calc("target_metacell_cells", target_metacell_cells)

    if min_split_size_factor is not None and max_merge_size_factor is not None:
        assert max_merge_size_factor < min_split_size_factor
        assert min_metacell_size is not None
        assert max_metacell_size is not None
        assert min_metacell_size <= max_metacell_size

    community_of_nodes = ut.maybe_o_numpy(adata, cell_seeds, formatter=ut.groups_description)

    if community_of_nodes is not None:
        assert community_of_nodes.dtype == "int32"
    else:
        # No seeds were given: pick enough seeds so the mean metacell holds the target number of cells.
        target_seeds_count = ceil(size / target_metacell_cells)
        ut.log_calc("target_seeds_count", target_seeds_count)

        community_of_nodes = np.full(size, -1, dtype="int32")
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=target_seeds_count,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    # Record the seeds, then work on a private copy so the stored seeds are not modified below.
    ut.set_o_data(adata, "seed", community_of_nodes, formatter=ut.groups_description)
    community_of_nodes = community_of_nodes.copy()

    np.random.seed(random_seed)

    cold_temperature = 1 - cooldown_pass

    # State of the previous round, used to detect (and revert) a round that did not improve the solution.
    old_score = 1e9
    old_communities = community_of_nodes
    old_small_nodes_count = len(community_of_nodes)
    atomic_candidates: Set[Tuple[int, ...]] = set()
    kept_communities_count = 0

    while True:
        cold_temperature, score = _optimize_split_communities(  #
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            target_metacell_size=target_metacell_size,
            max_metacell_size=max_metacell_size,
            max_split_min_cut_strength=max_split_min_cut_strength,
            min_cut_seed_cells=min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
            cooldown_pass=cooldown_pass,
            cooldown_node=cooldown_node,
            cooldown_phase=cooldown_phase,
            kept_communities_count=kept_communities_count,
            cold_temperature=cold_temperature,
            atomic_candidates=atomic_candidates,
        )

        small_communities, small_nodes_count = _find_small_communities(
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            min_metacell_size=min_metacell_size,
            min_metacell_cells=min_metacell_cells,
        )

        # The re-seeding below creates one seed fewer than the number of cancelled communities, so
        # nothing is done when fewer than two communities are too small.
        small_communities_count = len(small_communities)
        if small_communities_count < 2:
            break

        # Compare (small nodes count, score) lexicographically; if the previous round was at least as
        # good, restore it and stop.
        if (old_small_nodes_count, old_score) <= (small_nodes_count, score):
            ut.logger().debug("is not better, revert")
            community_of_nodes = old_communities
            score = old_score
            ut.log_calc("communities", community_of_nodes, formatter=ut.groups_description)
            ut.log_calc("score", score)
            break

        old_score = score
        old_communities = community_of_nodes.copy()
        old_small_nodes_count = small_nodes_count

        kept_communities_count = _cancel_communities(
            community_of_nodes=community_of_nodes, cancelled_communities=small_communities
        )

        # Re-seed the cells of the cancelled communities, creating one community fewer than were cancelled.
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=kept_communities_count + small_communities_count - 1,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    # NOTE(review): the in-place path returns before the cover assertion and the normalization of the
    # negative (outlier) values to -1 below, so the stored ``candidate`` annotation may differ from the
    # returned series - confirm whether this is intended.
    if inplace:
        ut.set_o_data(adata, "candidate", community_of_nodes, formatter=ut.groups_description)
        return None

    if must_complete_cover:
        assert np.min(community_of_nodes) == 0
    else:
        community_of_nodes[community_of_nodes < 0] = -1

    ut.log_return("candidate", community_of_nodes, formatter=ut.groups_description)
    return ut.to_pandas_series(community_of_nodes, index=adata.obs_names)
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Mark the genes which have a significant signal in metacells data. This computation is too unreliable
    to be used on cells.

    These are the genes which have a high maximal expression in at least one metacell, and a wide range of
    expression across the metacells. Such genes are good candidates for being used as marker genes and/or
    to compute distances between metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what``
    (default: {what}) is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``.
    Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level (fraction) of each gene.

    2. Select the genes whose fold factor (log2 of maximal over minimal value, after adding the
       ``normalization`` (default: {normalization}) to both) is at least ``min_gene_range_fold`` (default:
       {min_gene_range_fold}).

    3. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    assert normalization >= 0

    row_major_data = ut.get_vo_proper(adata, what, layout="row_major")
    fractions = ut.to_layout(ut.fraction_by(row_major_data, by="row"), layout="column_major")

    lowest_of_genes = ut.min_per(fractions, per="column")
    highest_of_genes = ut.max_per(fractions, per="column")

    # Test the maximal fraction before it is modified in-place below.
    is_expressed_gene = highest_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", is_expressed_gene)

    # Reuse the maximal fractions buffer for the regularized ratio and then for its log2.
    lowest_of_genes += normalization
    highest_of_genes += normalization
    highest_of_genes /= lowest_of_genes
    fold_of_genes = np.log2(highest_of_genes, out=highest_of_genes)

    is_wide_range_gene = fold_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", is_wide_range_gene)

    significant_genes_mask = is_expressed_gene & is_wide_range_gene

    if not inplace:
        ut.log_return("significant_genes", significant_genes_mask)
        return ut.to_pandas_series(significant_genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", significant_genes_mask)
    return None
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they belong to based on ``what``
    (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the cell is not deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as two pandas series (indexed by the observation and variable names, in this order).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared to the mean
    expression in the candidate metacells. We then mark as deviants some fraction of the cells whose expression of
    these genes was least predictable compared to the mean expression in the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene. Scale this by each
       cell's total UMIs to compute the expected number of UMIs for each cell. Compute the fold factor log2((actual
       UMIs + 1) / (expected UMIs + 1)) for each gene for each cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default: {min_gene_fold_factor}). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors. Count the number of genes which have
       a fold factor above this minimum in at least one cell. If the fraction of such genes is above
       ``max_gene_fraction`` (default: {max_gene_fraction}), then raise the minimal gene fold factor such that at most
       this fraction of genes remain. A value of ``None`` is treated as a fraction of 1, that is, no limit.

    3. For each remaining gene, rank all the cells where it is expressed above the min fold factor. Give an artificial
       maximum rank to all cells with fold factor 0, that is, below the minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell has a rank of 1, it
       means that it has at least one gene whose expression fold factor is the worst (highest) across all cells (and
       is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that is, which contain at
       least one gene whose expression fold factor is high relative to the rest of the cells. If the fraction of such
       cells is higher than ``max_cell_fraction`` (default: {max_cell_fraction}), reduce the maximal rank such that at
       most this fraction of cells are selected as deviants. A value of ``None`` is treated as a fraction of 1, that
       is, no limit.
    """
    # ``None`` means "do not limit the fraction", which is expressed as a fraction of 1.
    if max_gene_fraction is None:
        max_gene_fraction = 1.0

    if max_cell_fraction is None:
        max_cell_fraction = 1.0

    assert min_gene_fold_factor > 0
    # The upper bound is inclusive; a strict ``< 1`` would reject the ``None`` (no limit) case handled above.
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        # No fold factors were constructed, so nothing can vote: report zero votes everywhere.
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    # NOTE(review): these two defaults used to be crossed (the "min" parameter took the "max" constant and vice
    # versa). Results of callers relying on the defaults will change; confirm against the parameters module.
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Compute a mask of the genes which are "noisy" (have high variance compared to their mean) and also "lonely" (have
    low correlation with all other genes). Such genes should be excluded since they will never meaningfully help us
    compute groups, and will actively cause profiles to be considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with any other gene.
    Noisy lonely genes tend to throw off clustering algorithms. In general, such algorithms try to group together
    cells with the same overall biological state. Since the genes are lonely, they don't contribute towards this goal.
    Since they are noisy, they actively hamper this, because they make cells which are otherwise similar appear
    different (just for this lonely gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes, and exclude them
    from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_gene``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number of random cells
       from the data using the ``random_seed``. Note this seeds the global numpy random generator.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene) annotation
       containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the same total number
       of UMIs, using the ``downsample_min_samples`` (default: {downsample_min_samples}),
       ``downsample_min_cell_quantile`` (default: {downsample_min_cell_quantile}), ``downsample_max_cell_quantile``
       (default: {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default: {min_gene_total})
       and a normalized variance of at least ``min_gene_normalized_variance`` (default:
       {min_gene_normalized_variance}).

    5. Cross-correlate the high-total genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most ``max_gene_similarity`` (default:
       {max_gene_similarity}) with all other (high-total) genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False)
        s_data = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    # Only the first gene filtering step records the index of each gene in the original data; once recorded,
    # ``track_var`` is cleared so that a later filtering step is not asked to track it again.
    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(
            s_data, name="included", top_level=False, track_var=track_var, var_masks=[f"~{excluded_genes_mask}"]
        )
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(
        i_data, name="high_total", top_level=False, track_var=track_var, var_masks=["high_total_gene"]
    )
    assert results is not None
    ht_data = results[0]

    # The mask is over the genes of the original data; it stays all-False if no gene survives the filters.
    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0)
        )
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix, layout="row_major", symmetric=True
        )
        # A gene's similarity to itself must not count towards its maximal similarity.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data, "downsampled", min_gene_normalized_variance=min_gene_normalized_variance, inplace=False
        )
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                # Map the surviving genes back to their positions in the original data.
                base_index_of_ht_genes = ut.get_v_numpy(ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major"
                )
                assert htvl_gene_ht_gene_similarity_matrix.shape == (htvl_genes_count, ht_genes_count)

                # The per-gene report below is only computed when it will actually be logged.
                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data, "downsampled", sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(top_similarity_of_htvl_genes[htvl_index, :])
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} "
                            + f"({gene_percent:.4g}%), correlated with: "
                            + ", ".join(
                                [
                                    f"{similar_gene_name}: {similar_gene_value:.4g}"
                                    for similar_gene_value, similar_gene_name in reversed(
                                        sorted(zip(top_similar_ht_values, top_similar_ht_names))
                                    )
                                ]
                            ),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names", sorted(adata.var_names[noisy_lonely_genes_mask]))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve too-small metacells based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no particular order. Cells
            with no metacell assignment are given a metacell index of ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with non-zero ``deviants`` (default: {deviants}) as "outliers". This can be the name of a
       per-observation (cell) annotation, or an explicit boolean mask of cells, or ``None`` if there are no deviant
       cells to mark.

    2. Any metacell which has less cells than the ``min_metacell_cells`` is dissolved.

    3. We are trying to create metacells of size ``target_metacell_size``. Compute the sizes of the resulting
       metacells by summing the ``cell_sizes`` (default: {cell_sizes}). If it is ``None``, each has a size of one.
       These parameters are typically identical to these passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any metacell whose total
       size is at least ``target_metacell_size * min_robust_size_factor`` is preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is specified, then any remaining
       metacells whose size is at least ``target_metacell_size * min_convincing_size_factor`` are preserved, given
       they contain at least one gene whose fold factor (log2((actual + 1) / (expected + 1))) is at least
       ``min_convincing_gene_fold_factor`` (default: {min_convincing_gene_fold_factor}). If ``abs_folds``, consider
       the absolute fold factors. That is, we only preserve these smaller metacells if there is at least one gene
       whose expression is significantly different from the mean of the population.

    6. Any remaining metacell is dissolved into "outlier" cells.
    """
    was_dissolved = np.zeros(adata.n_obs, dtype="bool")

    # Work on a private copy so that the fetched candidates annotation is never modified.
    group_of_cells = np.copy(ut.get_o_numpy(adata, candidates, formatter=ut.groups_description))

    deviant_votes = ut.maybe_o_numpy(adata, deviants, formatter=ut.mask_description)
    cell_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)

    if deviant_votes is not None:
        # Deviant cells leave their candidate metacell before any size is evaluated.
        group_of_cells[deviant_votes > 0] = -1

    group_of_cells = ut.compress_indices(group_of_cells)
    candidates_count = np.max(group_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    min_robust_size = None if min_robust_size_factor is None else target_metacell_size * min_robust_size_factor
    ut.log_calc("min_robust_size", min_robust_size)

    min_convincing_size = (
        None if min_convincing_size_factor is None else target_metacell_size * min_convincing_size_factor
    )
    ut.log_calc("min_convincing_size", min_convincing_size)

    any_dissolved = False
    for candidate_index in range(candidates_count):
        member_indices = np.where(group_of_cells == candidate_index)[0]
        is_kept = _keep_candidate(
            adata,
            candidate_index,
            data=data,
            cell_sizes=cell_sizes,
            fraction_of_genes=fraction_of_genes,
            min_metacell_cells=min_metacell_cells,
            min_robust_size=min_robust_size,
            min_convincing_size=min_convincing_size,
            min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
            abs_folds=abs_folds,
            candidates_count=candidates_count,
            candidate_cell_indices=member_indices,
        )
        if is_kept:
            continue

        was_dissolved[member_indices] = True
        group_of_cells[member_indices] = -1
        any_dissolved = True

    # Dissolving leaves gaps in the group indices, so they are compressed again only when needed.
    metacell_of_cells = ut.compress_indices(group_of_cells) if any_dissolved else group_of_cells

    if inplace:
        ut.set_o_data(adata, "dissolved", was_dissolved, formatter=ut.mask_description)
        ut.set_o_data(adata, "metacell", metacell_of_cells, formatter=ut.groups_description)
        return None

    ut.log_return("dissolved", was_dissolved)
    ut.log_return("metacell", metacell_of_cells, formatter=ut.groups_description)

    result_frame = ut.to_pandas_frame(index=adata.obs_names)
    result_frame["dissolved"] = was_dissolved
    result_frame["metacell"] = metacell_of_cells
    return result_frame
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    The total number of UMIs varies from cell to cell, due to both technical effects and natural variance. We often
    want to work only on cells with enough UMIs for meaningful analysis, and sometimes also to drop cells which have
    "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise
    this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must not be ``None``
       and should contain just the excluded genes data for each cell. Exclude all cells whose sum of the excluded
       data divided by the total data is more than the specified threshold. The two must be given (or omitted)
       together.
    """
    # The fraction threshold and the excluded data only make sense as a pair.
    assert (max_excluded_genes_fraction is not None) == (excluded_adata is not None)

    cell_totals = ut.get_o_numpy(adata, what, sum=True)

    proper_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        proper_mask = proper_mask & (cell_totals >= min_cell_total)

    if max_cell_total is not None:
        proper_mask = proper_mask & (cell_totals <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_totals = ut.sum_per(ut.get_vo_proper(excluded_adata, layout="row_major"), per="row")

        # Avoid dividing by zero for empty cells; the copy keeps the fetched totals untouched.
        if np.min(cell_totals) == 0:
            cell_totals = np.copy(cell_totals)
            cell_totals[cell_totals == 0] = 1

        proper_mask = proper_mask & (excluded_totals / cell_totals <= max_excluded_genes_fraction)

    if not inplace:
        ut.log_return("properly_sampled_cell", proper_mask)
        return ut.to_pandas_series(proper_mask, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", proper_mask)
    return None
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the mask as an annotation
    (per-variable or per-observation depending on the type of the combined masks).

    **Raises**

    ``KeyError`` if a mask that must exist is not found (this includes an empty or prefix-only name), and
    ``ValueError`` if per-observation and per-variable masks are mixed, or if no mask at all was found.

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the name has a ``?`` suffix.
       Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix (before the ``~`` prefix, if
       any), then bitwise-OR the mask into the OR mask, otherwise (or if it has a ``&`` prefix), bitwise-AND the mask
       into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None
    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        log_mask_name = mask_name

        # ``startswith`` / ``endswith`` (rather than indexing) keep an empty or prefix-only name from raising an
        # ``IndexError``; such a name falls through to the "unknown mask data" error below.
        is_or = mask_name.startswith("|")
        if is_or or mask_name.startswith("&"):
            mask_name = mask_name[1:]

        invert_mask = mask_name.startswith("~")
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = not mask_name.endswith("?")
        if not must_exist:
            mask_name = mask_name[:-1]

        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # NOTE(review): the ``> 0`` comparison above presumably always yields a boolean array for numeric data, so
        # this guard is expected to trigger only for unusual (non-numeric) annotations; kept as a safety net.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        # All the masks must be of the same kind; the first one found decides which.
        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    if and_mask is not None and or_mask is not None:
        combined_mask = and_mask & or_mask
    elif and_mask is not None:
        combined_mask = and_mask
    elif or_mask is not None:
        combined_mask = or_mask
    else:
        raise ValueError("no masks to combine")

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None