def find_high_relative_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_relative_variance: float = pr.significant_gene_relative_variance,
    window_size: int = pr.relative_variance_window_size,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high relative variance of ``what`` (default: {what}) data.

    The relative variance is the variance / mean of each gene, compared to the other genes with a
    similar expression level. See :py:func:`metacells.utilities.computation.relative_variance_per`
    for details.

    Genes with a high relative variance are good candidates for being selected as "feature genes",
    that is, to be used to compute the similarity between cells. Using the relative variance
    compensates for the bias towards selecting higher-expression genes, whose normalized variance
    can be larger due to random noise alone.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_relative_variance_gene``
            A boolean mask indicating whether each gene was found to have a high relative
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.relative_variance_per` to get the relative
       variance of each gene.

    2. Select the genes whose relative variance is at least ``min_gene_relative_variance``
       (default: {min_gene_relative_variance}).
    """
    matrix = ut.get_vo_proper(adata, what, layout="column_major")
    relative_variances = ut.relative_variance_per(matrix, per="column", window_size=window_size)
    mask = relative_variances >= min_gene_relative_variance

    if not inplace:
        ut.log_return("high_relative_variance_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, "high_relative_variance_gene", mask)
    return None
def find_top_feature_genes(
    adata: AnnData,
    *,
    max_genes: int = pr.max_top_feature_genes,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high ``feature_gene`` value.

    This is applied after computing metacells to pick the "strongest" feature genes. If using the
    direct algorithm (:py:func:`metacells.pipeline.direct.compute_direct_metacells`) then all
    feature genes are equally "strong"; however, if using the divide-and-conquer algorithm
    (:py:func:`metacells.pipeline.divide_and_conquer.divide_and_conquer_pipeline`,
    :py:func:`metacells.pipeline.divide_and_conquer.compute_divide_and_conquer_metacells`)
    then this will pick the genes which were most commonly used as features across all the piles.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``feature_gene`` is a per-variable (gene) annotation counting how many times each gene was
    used as a feature.

    **Returns**

    Variable (Gene) Annotations
        ``top_feature_gene``
            A boolean mask indicating whether each gene was found to be a top feature gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Look for the lowest positive ``feature_gene`` threshold such that at most ``max_genes`` are
       picked as top feature genes. Note we may still pick more than ``max_genes``, for example
       when using the direct algorithm, we always return all feature genes as there's no way to
       distinguish between them using the ``feature_gene`` data.
    """
    feature_counts = ut.get_v_numpy(adata, "feature_gene", formatter=ut.mask_description)
    highest_count = np.max(feature_counts)
    assert highest_count > 0

    # Raise the threshold one step at a time until few enough genes survive (or we
    # hit the maximal count, in which case we accept whatever remains).
    for threshold in range(1, highest_count + 1):
        genes_mask = feature_counts >= threshold
        selected_count = np.sum(genes_mask)
        ut.log_calc(f"threshold: {threshold} selected: {selected_count}")
        if selected_count <= max_genes:
            break

    if not inplace:
        ut.log_return("top_feature_gene", genes_mask)
        return ut.to_pandas_series(genes_mask, index=adata.var_names)

    ut.set_v_data(adata, "top_feature_gene", genes_mask)
    return None
def find_high_normalized_variance_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_normalized_variance: float = pr.significant_gene_normalized_variance,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high normalized variance of ``what`` (default: {what}) data.

    The normalized variance measures the variance / mean of each gene. See
    :py:func:`metacells.utilities.computation.normalized_variance_per` for details.

    Genes with a high normalized variance are "noisy", that is, have significantly different
    expression level in different cells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_normalized_variance_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.normalized_variance_per` to get the
       normalized variance of each gene.

    2. Select the genes whose normalized variance is at least ``min_gene_normalized_variance``
       (default: {min_gene_normalized_variance}).
    """
    matrix = ut.get_vo_proper(adata, what, layout="column_major")
    normalized_variances = ut.normalized_variance_per(matrix, per="column")
    mask = normalized_variances >= min_gene_normalized_variance

    if not inplace:
        ut.log_return("high_normalized_variance_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, "high_normalized_variance_gene", mask)
    return None
def find_high_topN_genes(  # pylint: disable=invalid-name
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    topN: int,  # pylint: disable=invalid-name
    min_gene_topN: int,  # pylint: disable=invalid-name
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high total top-Nth value of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_top<topN>_gene``
            A boolean mask indicating whether each gene was found to have a high top-Nth value.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.top_per` to get the top-Nth UMIs of each gene.

    2. Select the genes whose fraction is at least ``min_gene_topN``.
    """
    matrix = ut.get_vo_proper(adata, what, layout="column_major")
    # The top-Nth value of a column is the value at this rank (clamped to stay valid
    # when there are fewer than topN + 1 observations).
    nth_rank = max(adata.n_obs - topN - 1, 1)
    nth_top_value = ut.rank_per(matrix, per="column", rank=nth_rank)
    mask = nth_top_value >= min_gene_topN

    if not inplace:
        ut.log_return(f"high_top{topN}_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, f"high_top{topN}_gene", mask)
    return None
def find_properly_sampled_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int = pr.properly_sampled_min_gene_total,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect genes with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between genes, the expression of genes
    varies greatly between cells. This is exactly the information we are trying to analyze. We
    often would like to work on genes that have a sufficient level of expression for meaningful
    analysis. Specifically, it doesn't make sense to analyze genes that have zero expression in
    all the cells.

    .. todo::

        Provide additional optional criteria for "properly sampled genes"?

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``properly_sampled_gene``
            A boolean mask indicating whether each gene has a "proper" number of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Exclude all genes whose total data is less than the ``min_gene_total`` (default:
       {min_gene_total}).
    """
    total_of_genes = ut.get_v_numpy(adata, what, sum=True)
    genes_mask = total_of_genes >= min_gene_total

    if inplace:
        ut.set_v_data(adata, "properly_sampled_gene", genes_mask)
        return None

    ut.log_return("properly_sampled_gene", genes_mask)
    # This is a per-gene mask, so index it by the variable (gene) names - not by the
    # observation names as the original code did (all sibling gene-mask functions use
    # ``var_names``, and the docstring above promises the variable names).
    return ut.to_pandas_series(genes_mask, index=adata.var_names)
def find_high_total_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_total: int,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high total number of ``what`` (default: {what}) data.

    This should typically only be applied to downsampled data to ensure that variance in sampling
    depth does not affect the result.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.sum_per` to get the total UMIs of each gene.

    2. Select the genes whose fraction is at least ``min_gene_total``.
    """
    totals = ut.get_v_numpy(adata, what, sum=True)
    mask = totals >= min_gene_total

    if not inplace:
        ut.log_return("high_total_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, "high_total_gene", mask)
    return None
def find_high_fraction_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_fraction: float = pr.significant_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have high fraction of the total ``what`` (default: {what}) data of the cells.

    Genes with too-low expression are typically excluded from computations. In particular,
    genes may have all-zero expression, in which case including them just slows the
    computations (and triggers numeric edge cases).

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``high_fraction_gene``
            A boolean mask indicating whether each gene was found to have a high normalized
            variance.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Use :py:func:`metacells.utilities.computation.fraction_per` to get the fraction of each
       gene.

    2. Select the genes whose fraction is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    matrix = ut.get_vo_proper(adata, what, layout="column_major")
    fractions = ut.fraction_per(matrix, per="column")
    mask = fractions >= min_gene_fraction

    if not inplace:
        ut.log_return("high_fraction_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, "high_fraction_gene", mask)
    return None
def find_named_genes(
    adata: AnnData,
    *,
    names: Optional[Collection[str]] = None,
    patterns: Optional[Collection[Union[str, Pattern]]] = None,
    to: Optional[str] = None,
    invert: bool = False,
) -> Optional[ut.PandasSeries]:
    """
    Find genes by their (case-insensitive) name.

    This creates a mask of all the genes whose name appears in ``names`` or matches any of the
    ``patterns``. If ``invert`` (default: {invert}), invert the resulting mask.

    If ``to`` (default: {to}) is specified, this is stored as a per-variable (gene) annotation
    with that name, and returns ``None``. This is useful to fill gene masks such as
    ``excluded_genes`` (genes which should be excluded from the rest of the processing) and
    ``forbidden_genes`` (genes which must not be chosen as feature genes).

    Otherwise, it returns it as a pandas series (indexed by the variable, that is gene, names).
    """
    # Match by explicit (lower-cased) names, if any were given.
    if names is None:
        by_name = np.zeros(adata.n_vars, dtype="bool")
    else:
        wanted = {name.lower() for name in names}
        by_name = np.array([gene_name.lower() in wanted for gene_name in adata.var_names])

    # Match by regular-expression patterns, if any were given.
    if patterns is None:
        by_pattern = np.zeros(adata.n_vars, dtype="bool")
    else:
        by_pattern = ut.patterns_matches(patterns, adata.var_names)

    mask = by_name | by_pattern
    if invert:
        mask = ~mask

    if to is None:
        ut.log_return("named_genes", mask)
        return ut.to_pandas_series(mask, index=adata.var_names)

    ut.set_v_data(adata, to, mask)
    return None
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they belong
    to based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any
            cell as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as two pandas series (indexed by the observation and
    variable names).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared
    to the mean expression in the candidate metacells. We then mark as deviants some fraction of
    the cells whose expression of these genes was least predictable compared to the mean expression
    in the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene.
       Scale this by each cell's total UMIs to compute the expected number of UMIs for each cell.
       Compute the fold factor log2((actual UMIs + 1) / (expected UMIs + 1)) for each gene for
       each cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default:
       {min_gene_fold_factor}). If ``abs_folds`` (default: {abs_folds}), consider the absolute
       fold factors. Count the number of genes which have a fold factor above this minimum in at
       least one cell. If the fraction of such genes is above ``max_gene_fraction`` (default:
       {max_gene_fraction}), then raise the minimal gene fold factor such that at most this
       fraction of genes remain.

    3. For each remaining gene, rank all the cells where it is expressed above the min fold
       factor. Give an artificial maximum rank to all cells with fold factor 0, that is, below
       the minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell
       has a rank of 1, it means that it has at least one gene whose expression fold factor is
       the worst (highest) across all cells (and is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that
       is, which contain at least one gene whose expression fold factor is high relative to the
       rest of the cells. If the fraction of such cells is higher than ``max_cell_fraction``
       (default: {max_cell_fraction}), reduce the maximal rank such that at most this fraction of
       cells are selected as deviants.
    """
    # ``None`` means "no limit"; map it to 1 (the whole fraction).
    if max_gene_fraction is None:
        max_gene_fraction = 1

    if max_cell_fraction is None:
        max_cell_fraction = 1

    assert min_gene_fold_factor > 0
    # Allow the fraction to be exactly 1 - otherwise the ``None`` ("no limit") case
    # above, which is explicitly permitted by the signature, would always fail here.
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")

    # Step 1: per-cell, per-gene fold factors vs. the candidate metacell's mean.
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        # No fold factor passed the threshold anywhere - nothing is deviant.
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        # Step 2: keep at most ``max_gene_fraction`` of the genes.
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        # Step 3: rank the cells per deviant gene.
        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        # Steps 4-5: vote deviant cells, limited to ``max_cell_fraction``.
        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    # BUGFIX: the min/max quantile defaults were swapped (min got the *max* parameter
    # value and vice versa); each parameter now takes its matching default.
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Return the indices of genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this
    lonely gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to
       the same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total``
       (default: {min_gene_total}) and a normalized variance of at least
       ``min_gene_normalized_variance`` (default: ``min_gene_normalized_variance``).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    # Step 1: sample at most ``max_sampled_cells`` cells.
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False)
        s_data = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    # Track each gene's index in ``adata`` through the slicing below so the final mask
    # can be mapped back to the full data.
    track_var: Optional[str] = "sampled_gene_index"

    # Step 2: drop the excluded genes, if any.
    if excluded_genes_mask is not None:
        results = filter_data(s_data, name="included", top_level=False, track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    # Step 3: downsample the cells to a common total number of UMIs.
    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    # Step 4a: keep only the high-total ("ht") genes.
    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data, name="high_total", top_level=False, track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        # Step 5: cross-correlate the high-total genes.
        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(ht_gene_ht_gene_similarity_frame,
                                                               only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(ht_gene_ht_gene_similarity_matrix,
                                                         layout="row_major", symmetric=True)
        # A gene is trivially perfectly correlated with itself; mask the diagonal out.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        # Step 4b: of the high-total genes, keep the high-variance ("htv") ones.
        htv_mask_series = find_high_normalized_variance_genes(
            ht_data, "downsampled", min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (htv_genes_count, ht_genes_count)

            # Step 6: of those, keep the lonely ("htvl") ones - low maximal similarity.
            max_similarity_of_htv_genes = ut.max_per(htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                # Map the surviving genes back to their indices in the full ``adata``.
                base_index_of_ht_genes = ut.get_v_numpy(ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(htvl_gene_ht_gene_similarity_matrix,
                                                                   layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (htvl_genes_count, ht_genes_count)

                # Purely diagnostic: log each noisy lonely gene and its closest correlates.
                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data, "downsampled", sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(htvl_gene_ht_gene_similarity_matrix,
                                                              10, per="row")
                    for htvl_index, gene_index in enumerate(base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} "
                            + f"({gene_percent:.4g}%), correlated with: "
                            + ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in reversed(
                                    sorted(zip(top_similar_ht_values, top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between cells, the total number of UMIs
    varies from cell to cell. We often would like to work on cells that contain a sufficient
    number of UMIs for meaningful analysis; we sometimes also wish to exclude cells which have
    "too many" UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless
       it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless
       it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata``
       must not be ``None`` and should contain just the excluded genes data for each cell. Exclude
       all cells whose sum of the excluded data divided by the total data is more than the
       specified threshold.
    """
    # These two parameters must be specified together (or both omitted).
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    totals = ut.get_o_numpy(adata, what, sum=True)

    mask = np.full(adata.n_obs, True, dtype="bool")
    if min_cell_total is not None:
        mask = mask & (totals >= min_cell_total)
    if max_cell_total is not None:
        mask = mask & (totals <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_matrix = ut.get_vo_proper(excluded_adata, layout="row_major")
        excluded_totals = ut.sum_per(excluded_matrix, per="row")
        # Guard against division by zero for cells with no UMIs at all (work on a
        # copy so the caller's totals are untouched).
        if np.min(totals) == 0:
            totals = np.copy(totals)
            totals[totals == 0] = 1
        mask = mask & (excluded_totals / totals <= max_excluded_genes_fraction)

    if not inplace:
        ut.log_return("properly_sampled_cell", mask)
        return ut.to_pandas_series(mask, index=adata.obs_names)

    ut.set_o_data(adata, "properly_sampled_cell", mask)
    return None
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the mask as
    an annotation (per-variable or per-observation depending on the type of the combined masks).

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or
       if it has a ``&`` prefix), bitwise-AND the mask into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    # "o" (per-observation) or "v" (per-variable); fixed by the first mask that is found.
    per: Optional[str] = None
    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        # Keep the full decorated name (with prefixes/suffix) for logging.
        log_mask_name = mask_name

        # Parse the combine operator prefix: "|" routes into the OR mask,
        # "&" (or no prefix at all) routes into the AND mask.
        if mask_name[0] == "|":
            is_or = True
            mask_name = mask_name[1:]
        else:
            is_or = False
            if mask_name[0] == "&":
                mask_name = mask_name[1:]

        # Parse the inversion prefix (applies to this mask only, not the final result).
        if mask_name[0] == "~":
            invert_mask = True
            mask_name = mask_name[1:]
        else:
            invert_mask = False

        # Parse the "optional" suffix: a missing "name?" mask is silently skipped.
        if mask_name[-1] == "?":
            must_exist = False
            mask_name = mask_name[:-1]
        else:
            must_exist = True

        # Fetch the mask, looking first in the per-observation and then in the per-variable
        # annotations; the "> 0" comparison coerces numeric data into a boolean mask.
        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(
                adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(
                adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # NOTE(review): the "> 0" comparison above always yields a boolean array, so this
        # check appears unreachable — confirm before relying on it for validation.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        # All masks must agree on being per-observation or per-variable.
        if per is None:
            per = mask_per
        else:
            if mask_per != per:
                raise ValueError(
                    "mixing per-observation and per-variable masks")

        # Accumulate the mask into the OR- or AND-combined result.
        if is_or:
            if or_mask is None:
                or_mask = mask
            else:
                or_mask = or_mask | mask
        else:
            if and_mask is None:
                and_mask = mask
            else:
                and_mask = and_mask & mask

    # Combine the two accumulated masks; at least one must exist.
    if and_mask is not None:
        if or_mask is not None:
            combined_mask = and_mask & or_mask
        else:
            combined_mask = and_mask
    else:
        if or_mask is not None:
            combined_mask = or_mask
        else:
            raise ValueError("no masks to combine")

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)
    return None
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too
    unreliable to be used on cells.

    Find genes which have a high maximal expression in at least one metacell, and a wide range of
    expression across the metacells. Such genes are good candidates for being used as marker genes
    and/or to compute distances between metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level of each gene.

    2. Select the genes whose fold factor (log2 of maximal over minimal value, using the
       ``normalization`` (default: {normalization}) is at least ``min_gene_range_fold`` (default:
       {min_gene_range_fold}).

    3. Select the genes whose maximal expression is at least ``min_gene_fraction`` (default:
       {min_gene_fraction}).
    """
    assert normalization >= 0

    data = ut.get_vo_proper(adata, what, layout="row_major")
    gene_fractions = ut.to_layout(ut.fraction_by(data, by="row"), layout="column_major")

    lowest_fraction_per_gene = ut.min_per(gene_fractions, per="column")
    highest_fraction_per_gene = ut.max_per(gene_fractions, per="column")

    # Genes that are strongly expressed in at least one metacell.
    strong_genes_mask = highest_fraction_per_gene >= min_gene_fraction
    ut.log_calc("high max fraction genes", strong_genes_mask)

    # Genes with a wide (normalized) max-to-min expression range across metacells.
    range_fold_per_gene = np.log2(
        (highest_fraction_per_gene + normalization) / (lowest_fraction_per_gene + normalization)
    )
    wide_genes_mask = range_fold_per_gene >= min_gene_range_fold
    ut.log_calc("high range genes", wide_genes_mask)

    significant_genes_mask = strong_genes_mask & wide_genes_mask

    if inplace:
        ut.set_v_data(adata, "significant_gene", significant_genes_mask)
        return None

    ut.log_return("significant_genes", significant_genes_mask)
    return ut.to_pandas_series(significant_genes_mask, index=adata.var_names)
def compute_subset_distinct_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    prefix: Optional[str] = None,
    scale: Optional[Union[bool, str, ut.NumpyVector]],
    subset: Union[str, ut.NumpyVector],
    normalization: float,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Given a subset of the observations (cells), compute for each gene how distinct its ``what``
    (default: {what}) value is in the subset compared to the overall population.

    This is the area-under-curve of the receiver operating characteristic (AUROC) for the gene,
    that is, the probability that a randomly selected observation (cell) in the subset will have a
    higher value than a randomly selected observation (cell) outside the subset.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``<prefix>_fold``
            Store the ratio of the expression of the gene in the subset as opposed to the rest of
            the population.
        ``<prefix>_auroc``
            Store the distinctiveness of the gene in the subset as opposed to the rest of the
            population.

    If ``prefix`` (default: {prefix}), is specified, this is written to the data. Otherwise this is
    returned as two pandas series (indexed by the gene names).

    **Computation Parameters**

    1. Use the ``subset`` to assign a boolean label to each observation (cell). The ``subset`` can
       be a vector of integer observation names, or a boolean mask, or the string name of a
       per-observation annotation containing the boolean mask.

    2. If ``scale`` is ``False``, use the data as-is. If it is ``True``, divide the data by the
       sum of each observation (cell). If it is a string, it should be the name of a
       per-observation annotation to use. Otherwise, it should be a vector of the scale factor for
       each observation (cell).

    3. Compute the fold ratios using the ``normalization`` (no default!) and the AUROC for each
       gene, for the scaled data based on this mask.
    """
    # Resolve a named per-observation annotation into the actual vector.
    if isinstance(subset, str):
        subset = ut.get_o_numpy(adata, subset)

    # A non-boolean vector is interpreted as cell indices; convert it to a boolean mask.
    if subset.dtype != "bool":
        cell_indices = subset
        subset = np.full(adata.n_obs, False)
        subset[cell_indices] = True

    # Resolve the per-cell scale factors (or None for no scaling).
    scale_of_cells: Optional[ut.NumpyVector]
    if isinstance(scale, bool):
        scale_of_cells = ut.get_o_numpy(adata, what, sum=True) if scale else None
    else:
        scale_of_cells = ut.maybe_o_numpy(adata, scale, formatter=ut.sizes_description)

    # Transpose so each gene becomes a row and each cell a column.
    genes_in_rows = ut.get_vo_proper(adata, what, layout="column_major").transpose()
    fold_of_genes, auroc_of_genes = ut.matrix_rows_folds_and_aurocs(
        genes_in_rows, columns_subset=subset, columns_scale=scale_of_cells, normalization=normalization
    )

    if prefix is not None:
        ut.set_v_data(adata, f"{prefix}_auroc", auroc_of_genes)
        ut.set_v_data(adata, f"{prefix}_fold", fold_of_genes)
        return None

    return (
        ut.to_pandas_series(fold_of_genes, index=adata.var_names),
        ut.to_pandas_series(auroc_of_genes, index=adata.var_names),
    )
def group_obs_annotation(
    adata: AnnData,
    gdata: AnnData,
    *,
    groups: Union[str, ut.Vector],
    name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    method: str = "majority",
    min_value_fraction: float = 0.5,
    conflict: Optional[Any] = None,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Transfer per-observation data from the per-observation (cell) ``adata`` to the
    per-group-of-observations (metacells) ``gdata``.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, and the
    ``gdata`` containing the per-metacells summed data.

    **Returns**

    Observations (Cell) Annotations
        ``<name>``
            The per-group-observation annotation computed based on the per-observation annotation.

    If ``inplace`` (default: {inplace}), this is written to the ``gdata``, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the group observation
    names).

    **Computation Parameters**

    1. Iterate on all the observations (groups, metacells) in ``gdata``.

    2. Consider all the cells whose ``groups`` annotation maps them into this group.

    3. Consider all the ``name`` annotation values of these cells.

    4. Compute an annotation value for the whole group of cells using the ``method``. Supported
       methods are:

       ``unique``
            All the values of all the cells in the group are expected to be the same, use this
            unique value for the whole groups.

       ``majority``
            Use the most common value across all cells in the group as the value for the whole
            group. If this value doesn't have at least ``min_value_fraction`` (default:
            {min_value_fraction}) of the cells, use the ``conflict`` (default: {conflict}) value
            instead.
    """
    cell_groups = ut.get_o_numpy(adata, groups, formatter=ut.groups_description)
    cell_values = ut.get_o_numpy(adata, name, formatter=formatter)
    group_values = np.empty(gdata.n_obs, dtype=cell_values.dtype)

    assert method in ("unique", "majority")

    if method == "unique":
        with ut.timed_step(".unique"):
            # Scatter-assign: each cell writes its value into its group's slot; since all
            # the cells of a group share a single value, the result is that value.
            group_values[cell_groups] = cell_values
    else:
        assert method == "majority"
        with ut.timed_step(".majority"):
            for group in range(gdata.n_obs):
                in_group_mask = cell_groups == group
                group_cells_count = np.sum(in_group_mask)
                assert group_cells_count > 0
                candidate_values, candidate_counts = np.unique(
                    cell_values[in_group_mask], return_counts=True
                )
                winner = np.argmax(candidate_counts)
                if candidate_counts[winner] / group_cells_count >= min_value_fraction:
                    group_values[group] = candidate_values[winner]
                else:
                    # No sufficiently-dominant value; fall back to the conflict marker.
                    group_values[group] = conflict

    if inplace:
        ut.set_o_data(gdata, name, group_values)
        return None

    return ut.to_pandas_series(group_values, index=gdata.obs_names)
def compute_candidate_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "obs_outgoing_weights",
    *,
    target_metacell_size: float,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.candidates_cell_sizes,
    cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    cooldown_phase: float = pr.cooldown_phase,
    min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    min_metacell_cells: Optional[int] = pr.candidates_min_metacell_cells,
    max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    random_seed: int = 0,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Assign observations (cells) to (raw, candidate) metacells based on ``what`` data. (a weighted
    directed graph).

    These candidate metacells typically go through additional vetting (e.g. deviant detection and
    dissolving too-small metacells) to obtain the final metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-observation-per-observation matrix where each row is the outgoing weights
    from each observation to the rest, or just the name of a per-observation-per-observation
    annotation containing such a matrix. Typically this matrix will be sparse for efficient
    processing.

    **Returns**

    Observation (Cell) Annotations
        ``candidate``
            The integer index of the (raw, candidate) metacell each cell belongs to. The metacells
            are in no particular order.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. We are trying to build metacells of ``target_metacell_size``, using the ``cell_sizes``
       (default: {cell_sizes}) to assign a size for each node (cell). This can be a string name of
       a per-observation annotation or a vector of values.

    2. We start with an assignment of cells to ``cell_seeds`` (default: {cell_seeds}). If no seeds
       are provided, we use :py:func:`choose_seeds` using ``min_seed_size_quantile`` (default:
       {min_seed_size_quantile}) and ``max_seed_size_quantile`` (default:
       {max_seed_size_quantile}) to compute them, picking a number of seeds such that the average
       metacell size would match the target.

    3. We optimize the seeds using :py:func:`optimize_partitions` to obtain initial communities by
       maximizing the "stability" of the solution (probability of starting at a random node and
       moving either forward or backward in the graph and staying within the same metacell, divided
       by the probability of staying in the metacell if the edges connected random nodes). We pass
       it the ``cooldown_pass`` (default: {cooldown_pass}) and ``cooldown_node`` (default:
       {cooldown_node}).

    4. If ``min_split_size_factor`` (default: {min_split_size_factor}) is specified, randomly
       split to two each community whose size is at least ``target_metacell_size *
       min_split_size_factor`` and re-optimize the solution (resulting in one additional
       metacell). Every time we re-optimize, we multiply 1 - ``cooldown_pass`` by
       1 - ``cooldown_phase`` (default: {cooldown_phase}).

    5. If ``max_split_min_cut_strength`` (default: {max_split_min_cut_strength}) is specified, and
       the minimal cut of a candidate is lower, split it into two. If one of the partitions is
       smaller than ``min_cut_seed_cells``, then mark the cells in it as outliers, or if
       ``must_complete_cover`` is ``True``, skip the cut altogether.

    6. If ``max_merge_size_factor`` (default: {max_merge_size_factor}) or ``min_metacell_cells``
       (default: {min_metacell_cells}) are specified, make outliers of cells of a community whose
       size is at most ``target_metacell_size * max_merge_size_factor`` or contains less cells and
       re-optimize, which will assign these cells to other metacells (resulting on one less
       metacell). We again apply the ``cooldown_phase`` every time we re-optimize.

    7. Repeat the above steps until all metacells candidates are in the acceptable size range.
    """
    edge_weights = ut.get_oo_proper(adata, what, layout="row_major")
    assert edge_weights.shape[0] == edge_weights.shape[1]
    assert 0.0 < cooldown_pass < 1.0
    assert 0.0 <= cooldown_node <= 1.0
    assert 0.0 < cooldown_phase <= 1.0

    size = edge_weights.shape[0]

    # The optimization needs both layouts of the same weights: row-major for outgoing
    # edges, column-major for incoming edges.
    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)
    assert ut.is_layout(outgoing_edge_weights, "row_major")
    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    # The downstream (compiled) code requires these exact compressed-matrix dtypes.
    assert outgoing_edge_weights.data.dtype == "float32"
    assert outgoing_edge_weights.indices.dtype == "int32"
    assert outgoing_edge_weights.indptr.dtype == "int32"
    assert incoming_edge_weights.data.dtype == "float32"
    assert incoming_edge_weights.indices.dtype == "int32"
    assert incoming_edge_weights.indptr.dtype == "int32"

    # Each node (cell) gets a size; default is a uniform size of 1.
    node_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)
    if node_sizes is None:
        node_sizes = np.full(size, 1.0, dtype="float32")
    else:
        node_sizes = node_sizes.astype("float32")
    ut.log_calc("node_sizes", node_sizes, formatter=ut.sizes_description)

    assert target_metacell_size > 0
    max_metacell_size = None
    min_metacell_size = None

    # Too-large communities (>= target * min_split_size_factor) will be split.
    if min_split_size_factor is not None:
        assert min_split_size_factor > 0
        max_metacell_size = ceil(target_metacell_size * min_split_size_factor) - 1
        ut.log_calc("max_metacell_size", max_metacell_size)

    # Too-small communities (<= target * max_merge_size_factor) will be cancelled.
    if max_merge_size_factor is not None:
        assert max_merge_size_factor > 0
        min_metacell_size = floor(target_metacell_size * max_merge_size_factor) + 1
        ut.log_calc("min_metacell_size", min_metacell_size)

    # Expected number of cells per metacell, used to pick the number of seeds.
    target_metacell_cells = max(
        1.0 if min_metacell_cells is None else float(min_metacell_cells),
        float(target_metacell_size / np.mean(node_sizes)),
    )
    ut.log_calc("target_metacell_cells", target_metacell_cells)

    if min_split_size_factor is not None and max_merge_size_factor is not None:
        # The acceptable size range must be non-empty.
        assert max_merge_size_factor < min_split_size_factor
        assert min_metacell_size is not None
        assert max_metacell_size is not None
        assert min_metacell_size <= max_metacell_size

    # Use the given seeds, or compute fresh ones to cover all the nodes.
    community_of_nodes = ut.maybe_o_numpy(adata, cell_seeds, formatter=ut.groups_description)
    if community_of_nodes is not None:
        assert community_of_nodes.dtype == "int32"
    else:
        target_seeds_count = ceil(size / target_metacell_cells)
        ut.log_calc("target_seeds_count", target_seeds_count)

        community_of_nodes = np.full(size, -1, dtype="int32")
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=target_seeds_count,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    ut.set_o_data(adata, "seed", community_of_nodes, formatter=ut.groups_description)
    # Work on a copy so the stored "seed" annotation is not mutated by the optimization.
    community_of_nodes = community_of_nodes.copy()

    np.random.seed(random_seed)

    cold_temperature = 1 - cooldown_pass

    # Track the best (previous) solution so a worse iteration can be reverted.
    old_score = 1e9
    old_communities = community_of_nodes
    old_small_nodes_count = len(community_of_nodes)
    atomic_candidates: Set[Tuple[int, ...]] = set()
    kept_communities_count = 0
    while True:
        # Optimize the partition, splitting too-large / weakly-cut communities.
        cold_temperature, score = _optimize_split_communities(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            target_metacell_size=target_metacell_size,
            max_metacell_size=max_metacell_size,
            max_split_min_cut_strength=max_split_min_cut_strength,
            min_cut_seed_cells=min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
            cooldown_pass=cooldown_pass,
            cooldown_node=cooldown_node,
            cooldown_phase=cooldown_phase,
            kept_communities_count=kept_communities_count,
            cold_temperature=cold_temperature,
            atomic_candidates=atomic_candidates,
        )

        small_communities, small_nodes_count = _find_small_communities(
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            min_metacell_size=min_metacell_size,
            min_metacell_cells=min_metacell_cells,
        )

        small_communities_count = len(small_communities)
        if small_communities_count < 2:
            # At most one small community left; nothing more to merge.
            break

        # Revert if this iteration did not improve (fewer small nodes, lower score).
        if (old_small_nodes_count, old_score) <= (small_nodes_count, score):
            ut.logger().debug("is not better, revert")
            community_of_nodes = old_communities
            score = old_score
            ut.log_calc("communities", community_of_nodes, formatter=ut.groups_description)
            ut.log_calc("score", score)
            break

        old_score = score
        old_communities = community_of_nodes.copy()
        old_small_nodes_count = small_nodes_count

        # Dissolve the small communities and re-seed their nodes for the next pass.
        kept_communities_count = _cancel_communities(
            community_of_nodes=community_of_nodes, cancelled_communities=small_communities
        )

        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=kept_communities_count + small_communities_count - 1,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    if inplace:
        ut.set_o_data(adata, "candidate", community_of_nodes, formatter=ut.groups_description)
        return None

    if must_complete_cover:
        # A complete cover means every node was assigned to some community.
        assert np.min(community_of_nodes) == 0
    else:
        # Normalize all unassigned (outlier) markers to -1.
        community_of_nodes[community_of_nodes < 0] = -1

    ut.log_return("candidate", community_of_nodes, formatter=ut.groups_description)
    return ut.to_pandas_series(community_of_nodes, index=adata.obs_names)
def filter_data(  # pylint: disable=dangerous-default-value
    adata: AnnData,
    obs_masks: List[str] = [],
    var_masks: List[str] = [],
    *,
    mask_obs: Optional[str] = None,
    mask_var: Optional[str] = None,
    invert_obs: bool = False,
    invert_var: bool = False,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    name: Optional[str] = None,
    top_level: bool = True,
) -> Optional[Tuple[AnnData, ut.PandasSeries, ut.PandasSeries]]:
    """
    Filter (slice) the data based on previously-computed masks.

    For example, it is useful to discard cell-cycle genes, cells which have too few UMIs for
    meaningful analysis, etc. In general, the "best" filter depends on the data set.

    This function makes it easy to combine different pre-computed per-observation (cell) and
    per-variable (gene) boolean mask annotations into a final overall inclusion mask, and slice the
    data accordingly, while tracking the base index of the cells and genes in the filtered data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    An annotated data containing a subset of the observations (cells) and variables (genes).

    If no observations and/or no variables were selected by the filter, returns ``None``.

    If ``name`` is not specified, the returned data will be unnamed. Otherwise, if the name starts
    with a ``.``, it will be appended to the current name (if any). Otherwise, ``name`` is the new
    name.

    If ``mask_obs`` and/or ``mask_var`` are specified, store the mask of the selected data as a
    per-observation and/or per-variable annotation of the full ``adata``.

    If ``track_obs`` and/or ``track_var`` are specified, store the original indices of the selected
    data as a per-observation and/or per-variable annotation of the result data.

    **Computation Parameters**

    1. Combine the masks in ``obs_masks`` and/or ``var_masks`` using
       :py:func:`metacells.tools.mask.combine_masks` passing it ``invert_obs`` and ``invert_var``,
       and ``mask_obs`` and ``mask_var`` as the ``to`` parameter. If either list of masks is empty,
       use the full mask.

    2. If the obtained masks for either the observations or variables is empty, return ``None``.
       Otherwise, return a slice of the full data containing just the observations and variables
       specified by the final masks.
    """
    # Compute the per-observation (cell) inclusion mask.
    if len(obs_masks) == 0:
        obs_mask = np.full(adata.n_obs, True, dtype="bool")
        if mask_obs is not None:
            ut.set_o_data(adata, mask_obs, obs_mask)
    else:
        mask = combine_masks(adata, obs_masks, invert=invert_obs, to=mask_obs)
        if mask is None:
            # combine_masks stored the result under mask_obs; fetch it back.
            assert mask_obs is not None
            obs_mask = ut.get_o_numpy(
                adata, mask_obs, formatter=ut.mask_description) > 0
        else:
            obs_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    # Compute the per-variable (gene) inclusion mask.
    if len(var_masks) == 0:
        var_mask = np.full(adata.n_vars, True, dtype="bool")
        if mask_var is not None:
            # BUGFIX: this used set_o_data, storing a length-n_vars vector as a
            # per-observation annotation; a per-variable mask must use set_v_data.
            ut.set_v_data(adata, mask_var, var_mask)
    else:
        mask = combine_masks(adata, var_masks, invert=invert_var, to=mask_var)
        if mask is None:
            # combine_masks stored the result under mask_var; fetch it back.
            assert mask_var is not None
            var_mask = ut.get_v_numpy(
                adata, mask_var, formatter=ut.mask_description) > 0
        else:
            var_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    # An empty selection on either axis means there is nothing to slice.
    if not np.any(obs_mask) or not np.any(var_mask):
        return None

    fdata = ut.slice(adata, name=name, top_level=top_level,
                     obs=obs_mask, vars=var_mask,
                     track_obs=track_obs, track_var=track_var)

    return (
        fdata,
        ut.to_pandas_series(obs_mask, index=adata.obs_names),
        ut.to_pandas_series(var_mask, index=adata.var_names),
    )