def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve the candidate metacells which are too small, based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no particular order. Cells
            with no metacell assignment are given a metacell index of ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), these are written to the data, and the function returns ``None``. Otherwise
    they are returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with a positive ``deviants`` value (default: {deviants}) as "outliers". This can be the name of
       a per-observation (cell) annotation, or an explicit vector, or ``None`` if there are no deviant cells to mark.

    2. Any metacell which has fewer cells than ``min_metacell_cells`` is dissolved.

    3. We are trying to create metacells of size ``target_metacell_size``. Compute the size of each remaining
       metacell by summing the ``cell_sizes`` (default: {cell_sizes}) of its cells. If it is ``None``, each cell has
       a size of one. These parameters are typically identical to these passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any metacell whose total
       size is at least ``target_metacell_size * min_robust_size_factor`` is preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is ``None``, all the remaining
       metacells are preserved. Otherwise, a remaining metacell is preserved only if its total size is at least
       ``target_metacell_size * min_convincing_size_factor`` and it contains at least one gene whose fold factor
       (log2((actual + 1) / (expected + 1))) is at least ``min_convincing_gene_fold_factor`` (default:
       {min_convincing_gene_fold_factor}). If ``abs_folds``, consider the absolute fold factors. That is, we only
       preserve these smaller metacells if there is at least one gene whose expression is significantly different
       from the mean of the population.

    6. Any other metacell is dissolved into "outlier" cells.
    """
    # Work on a private copy, as candidates are removed from this vector as we go.
    candidate_of_cells = np.copy(ut.get_o_numpy(adata, candidates, formatter=ut.groups_description))
    deviant_votes_of_cells = ut.maybe_o_numpy(adata, deviants, formatter=ut.mask_description)
    cell_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)

    # Deviant cells never count as members of any candidate.
    if deviant_votes_of_cells is not None:
        candidate_of_cells[deviant_votes_of_cells > 0] = -1
    candidate_of_cells = ut.compress_indices(candidate_of_cells)
    candidates_count = np.max(candidate_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    min_robust_size = None if min_robust_size_factor is None else target_metacell_size * min_robust_size_factor
    ut.log_calc("min_robust_size", min_robust_size)

    min_convincing_size = (
        None if min_convincing_size_factor is None else target_metacell_size * min_convincing_size_factor
    )
    ut.log_calc("min_convincing_size", min_convincing_size)

    dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")
    did_dissolve = False
    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]
        is_kept = _keep_candidate(
            adata,
            candidate_index,
            data=data,
            cell_sizes=cell_sizes,
            fraction_of_genes=fraction_of_genes,
            min_metacell_cells=min_metacell_cells,
            min_robust_size=min_robust_size,
            min_convincing_size=min_convincing_size,
            min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
            abs_folds=abs_folds,
            candidates_count=candidates_count,
            candidate_cell_indices=candidate_cell_indices,
        )
        if is_kept:
            continue
        dissolved_of_cells[candidate_cell_indices] = True
        candidate_of_cells[candidate_cell_indices] = -1
        did_dissolve = True

    # Dissolving leaves holes in the indices, so re-compress them; otherwise they are already compact.
    metacell_of_cells = ut.compress_indices(candidate_of_cells) if did_dissolve else candidate_of_cells

    if not inplace:
        ut.log_return("dissolved", dissolved_of_cells)
        ut.log_return("metacell", metacell_of_cells, formatter=ut.groups_description)
        obs_frame = ut.to_pandas_frame(index=adata.obs_names)
        obs_frame["dissolved"] = dissolved_of_cells
        obs_frame["metacell"] = metacell_of_cells
        return obs_frame

    ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)
    ut.set_o_data(adata, "metacell", metacell_of_cells, formatter=ut.groups_description)
    return None
def _compute_elements_knn_graph(
    adata: AnnData,
    elements: str,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    k: int,
    balanced_ranks_factor: float,
    incoming_degree_factor: float,
    outgoing_degree_factor: float,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Compute the K-nearest-neighbors graph of either the observations or the variables.

    ``elements`` must be either ``"obs"`` or ``"var"``; it selects whether the ``what`` similarity data is read as
    per-observation-per-observation or per-variable-per-variable data, and is used as the prefix of the names of the
    stored results. All three factors must be positive.

    The similarity is converted to a dense row-major matrix and then passed through ``_rank_outgoing``,
    ``_balance_ranks``, ``_prune_ranks`` and ``_weigh_edges``, in this order.

    The ``<elements>_balanced_ranks`` and ``<elements>_pruned_ranks`` matrices are always written into ``adata``.
    The ``<elements>_outgoing_weights`` matrix is written into ``adata`` only if ``inplace``, in which case ``None``
    is returned; otherwise it is only logged, and is returned as a pandas data frame whose rows and columns are both
    indexed by the names of the elements.
    """
    assert elements in ("obs", "var")
    is_obs = elements == "obs"

    assert balanced_ranks_factor > 0.0
    assert incoming_degree_factor > 0.0
    assert outgoing_degree_factor > 0.0

    get_data, set_data = (ut.get_oo_proper, ut.set_oo_data) if is_obs else (ut.get_vv_proper, ut.set_vv_data)

    def describe_matrix(matrix: ut.CompressedMatrix) -> str:
        # Describe how many of the matrix elements are actually stored (nonzero).
        return ut.ratio_description(matrix.shape[0] * matrix.shape[1], "element", matrix.nnz, "nonzero")

    def store_matrix(matrix: ut.CompressedMatrix, name: str, when: bool) -> None:
        # Either store the matrix in the data (if ``when``), or just log its description.
        full_name = elements + "_" + name
        if when:
            set_data(adata, full_name, matrix, formatter=describe_matrix)
        elif ut.logging_calc():
            ut.log_calc(full_name, describe_matrix(matrix))

    similarity = ut.to_numpy_matrix(
        ut.to_layout(ut.to_proper_matrix(get_data(adata, what)), "row_major", symmetric=True)
    )
    ut.log_calc("similarity", similarity)

    balanced_ranks = _balance_ranks(_rank_outgoing(similarity), k, balanced_ranks_factor)
    store_matrix(balanced_ranks, "balanced_ranks", True)

    pruned_ranks = _prune_ranks(balanced_ranks, k, incoming_degree_factor, outgoing_degree_factor)
    store_matrix(pruned_ranks, "pruned_ranks", True)

    outgoing_weights = _weigh_edges(pruned_ranks)
    store_matrix(outgoing_weights, "outgoing_weights", inplace)

    if inplace:
        return None

    names = adata.obs_names if is_obs else adata.var_names
    return ut.to_pandas_frame(outgoing_weights, index=names, columns=names)
def _keep_candidate(  # pylint: disable=too-many-branches
    adata: AnnData,
    candidate_index: int,
    *,
    data: ut.ProperMatrix,
    cell_sizes: Optional[ut.NumpyVector],
    fraction_of_genes: ut.NumpyVector,
    min_metacell_cells: int,
    min_robust_size: Optional[float],
    min_convincing_size: Optional[float],
    min_convincing_gene_fold_factor: float,
    abs_folds: bool,
    candidates_count: int,
    candidate_cell_indices: ut.NumpyVector,
) -> bool:
    """
    Decide whether a single candidate metacell should be kept (``True``) or dissolved (``False``).

    The tests are applied in this order, and the first one that matches decides:

    1. Fewer than ``min_metacell_cells`` cells: dissolve ("little").
    2. ``min_robust_size`` is given and the total size reaches it: keep ("robust").
    3. ``min_convincing_size`` is ``None``: keep ("accepted").
    4. The total size is below ``min_convincing_size``: dissolve ("unconvincing").
    5. Otherwise, keep only if at least one gene has a fold factor, ``log2((actual + 1) / (expected + 1))``, of at
       least ``min_convincing_gene_fold_factor`` (using the absolute fold factor if ``abs_folds``).

    The total size is the sum of the ``cell_sizes`` of the candidate's cells, or the number of cells if ``cell_sizes``
    is ``None``. The expected value of each gene is its entry in ``fraction_of_genes`` times the candidate's total.
    The ``adata`` is only used for the gene names in the log messages.
    """
    cells_count = candidate_cell_indices.size
    if cell_sizes is None:
        candidate_total_size = cells_count
    else:
        candidate_total_size = np.sum(cell_sizes[candidate_cell_indices])

    def verdict_message(verdict: str) -> str:
        # Build the log line for this candidate; only invoked when logging is enabled.
        return (
            f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
            f"cells: {cells_count} "
            f"size: {candidate_total_size:g} "
            f"is: {verdict}"
        )

    if cells_count < min_metacell_cells:
        if ut.logging_calc():
            ut.log_calc(verdict_message("little"))
        return False

    if min_robust_size is not None and candidate_total_size >= min_robust_size:
        if ut.logging_calc():
            ut.log_calc(verdict_message("robust"))
        return True

    if min_convincing_size is None:
        if ut.logging_calc():
            ut.log_calc(verdict_message("accepted"))
        return True

    if candidate_total_size < min_convincing_size:
        if ut.logging_calc():
            ut.log_calc(verdict_message("unconvincing"))
        return False

    # This vector starts as the per-gene totals of the candidate, and is converted in-place into the per-gene fold
    # factors, log2((actual + 1) / (expected + 1)).
    folds_of_genes = ut.to_numpy_vector(data[candidate_cell_indices, :].sum(axis=0))
    assert folds_of_genes.size == data.shape[1]
    expected_of_genes = fraction_of_genes * np.sum(folds_of_genes)
    expected_of_genes += 1
    folds_of_genes += 1
    folds_of_genes /= expected_of_genes
    np.log2(folds_of_genes, out=folds_of_genes)

    compared_folds_of_genes = np.abs(folds_of_genes) if abs_folds else folds_of_genes
    convincing_genes_mask = compared_folds_of_genes >= min_convincing_gene_fold_factor
    keep_candidate = bool(np.any(convincing_genes_mask))

    if ut.logging_calc():
        if not keep_candidate:
            ut.log_calc(verdict_message("not convincing"))
        else:
            ut.log_calc(verdict_message("convincing because:"))
            convincing_gene_indices = np.where(convincing_genes_mask)[0]
            folds_and_names = sorted(
                zip(folds_of_genes[convincing_gene_indices], adata.var_names[convincing_gene_indices])
            )
            # List the convincing genes from the highest fold factor to the lowest.
            for fold_factor, name in folds_and_names[::-1]:
                ut.log_calc(f" {name}: {ut.fold_description(fold_factor)}")

    return keep_candidate
def _identify_cells(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    related_gene_indices_of_modules: List[List[int]],
    min_cell_module_total: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    rare_module_of_cells: ut.NumpyVector,
) -> None:
    """
    Associate cells with the gene module they express most strongly.

    For each non-empty module, the "strong" cells are those whose total ``what`` value over the module's genes is at
    least ``min_cell_module_total``. If the number of strong cells is above ``max_cells_of_modules`` or below
    ``min_cells_of_modules``, the module is rejected by clearing its list of gene indices.

    Otherwise, the strength of each cell for the module is its total divided by the median total of the module's
    strong cells. Each strong cell whose strength is at least the highest strength it had in any previous module is
    assigned to this module.

    This modifies both ``related_gene_indices_of_modules`` (the lists of rejected modules are cleared in-place) and
    ``rare_module_of_cells`` (the module index of each assigned cell is written in-place); nothing is returned.
    """
    max_strength_of_cells = np.zeros(adata_of_all_genes_of_all_cells.n_obs)

    ut.log_calc("cells for modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if len(related_gene_indices_of_module) == 0:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            adata_of_related_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.related_genes",
                vars=related_gene_indices_of_module,
                top_level=False,
            )
            total_related_genes_of_all_cells = ut.get_o_numpy(adata_of_related_genes_of_all_cells, what, sum=True)

            mask_of_strong_cells_of_module = total_related_genes_of_all_cells >= min_cell_module_total
            strong_cells_count = np.sum(mask_of_strong_cells_of_module)

            if strong_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too many)"
                    )
                # An empty genes list marks the module as rejected.
                related_gene_indices_of_module.clear()
                continue

            if strong_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too few)"
                    )
                # An empty genes list marks the module as rejected.
                related_gene_indices_of_module.clear()
                continue

            ut.log_calc("strong_cells", mask_of_strong_cells_of_module)

            # Compute the median only for accepted modules. Doing it before the tests above would take the median
            # of an empty selection (giving NaN and a NumPy warning) for a module without any strong cells, even
            # though such a module is rejected as long as ``min_cells_of_modules`` is positive.
            median_strength_of_module = np.median(total_related_genes_of_all_cells[mask_of_strong_cells_of_module])

            strength_of_all_cells = total_related_genes_of_all_cells / median_strength_of_module
            # Only claim the strong cells which are not more strongly associated with a previous module.
            mask_of_strong_cells_of_module &= strength_of_all_cells >= max_strength_of_cells
            max_strength_of_cells[mask_of_strong_cells_of_module] = strength_of_all_cells[
                mask_of_strong_cells_of_module
            ]

            rare_module_of_cells[mask_of_strong_cells_of_module] = module_index
def _compress_modules(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    target_metacell_size: float,
    min_modules_size_factor: float,
    related_gene_indices_of_modules: List[List[int]],
    rare_module_of_cells: ut.NumpyVector,
) -> List[List[int]]:
    """
    Discard the unviable modules and renumber the surviving ones to have consecutive indices.

    Modules with an empty list of gene indices are skipped. A module is discarded if the number of cells assigned to
    it in ``rare_module_of_cells`` is below ``min_cells_of_modules`` or above ``max_cells_of_modules``, or if the
    total ``what`` value of these cells (over all the genes) is below
    ``target_metacell_size * min_modules_size_factor``. The cells of a discarded module are set to ``-1``.

    Each surviving module is given the next free index (in the original order), and ``rare_module_of_cells`` is
    updated in-place accordingly.

    Returns the list of the gene indices of each of the surviving modules, in their new order.
    """
    list_of_rare_gene_indices_of_modules: List[List[int]] = []

    # These two lists are only needed for the final log messages, so they are only filled when logging.
    cell_counts_of_modules: List[int] = []
    list_of_names_of_genes_of_modules: List[List[str]] = []

    min_umis_of_modules = target_metacell_size * min_modules_size_factor
    ut.log_calc("min_umis_of_modules", min_umis_of_modules)

    total_all_genes_of_all_cells = ut.get_o_numpy(adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("compress modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if len(gene_indices_of_module) == 0:
            continue

        with ut.log_step(
            "- module",
            module_index,
            formatter=lambda module_index: ut.progress_description(modules_count, module_index, "module"),
        ):
            module_cells_mask = rare_module_of_cells == module_index
            module_cells_count = np.sum(module_cells_mask)
            module_umis_count = np.sum(total_all_genes_of_all_cells[module_cells_mask])

            if module_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            if module_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too many)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("cells", module_cells_count)

            if module_umis_count < min_umis_of_modules:
                if ut.logging_calc():
                    ut.log_calc("UMIs", str(module_umis_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("UMIs", module_umis_count)

            # The new index is always at most the old one, so renumbering the cells of this module can not collide
            # with the (higher) indices of the modules which were not processed yet.
            next_module_index = len(list_of_rare_gene_indices_of_modules)
            if module_index != next_module_index:
                ut.log_calc("is reindexed to", next_module_index)
                rare_module_of_cells[module_cells_mask] = next_module_index

            list_of_rare_gene_indices_of_modules.append(gene_indices_of_module)

            if ut.logging_calc():
                cell_counts_of_modules.append(module_cells_count)
                list_of_names_of_genes_of_modules.append(
                    sorted(adata_of_all_genes_of_all_cells.var_names[gene_indices_of_module])
                )

    if ut.logging_calc():
        ut.log_calc("final modules:")
        for module_index, (module_cells_count, module_gene_names) in enumerate(
            zip(cell_counts_of_modules, list_of_names_of_genes_of_modules)
        ):
            ut.log_calc(f"- module: {module_index} cells: {module_cells_count} genes: {module_gene_names}")

    return list_of_rare_gene_indices_of_modules
def find_rare_gene_modules(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    max_gene_cell_fraction: float = pr.rare_max_gene_cell_fraction,
    min_gene_maximum: int = pr.rare_min_gene_maximum,
    genes_similarity_method: str = pr.rare_genes_similarity_method,
    genes_cluster_method: str = pr.rare_genes_cluster_method,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    min_genes_of_modules: int = pr.rare_min_genes_of_modules,
    min_cells_of_modules: int = pr.rare_min_cells_of_modules,
    target_pile_size: int = pr.min_target_pile_size,
    max_cells_factor_of_random_pile: float = pr.rare_max_cells_factor_of_random_pile,
    target_metacell_size: float = pr.target_metacell_size,
    min_modules_size_factor: float = pr.rare_min_modules_size_factor,
    min_module_correlation: float = pr.rare_min_module_correlation,
    min_related_gene_fold_factor: float = pr.rare_min_related_gene_fold_factor,
    max_related_gene_increase_factor: float = pr.rare_max_related_gene_increase_factor,
    min_cell_module_total: int = pr.rare_min_cell_module_total,
    reproducible: bool = pr.reproducible,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Detect rare gene modules based on ``what`` (default: {what}) data.

    Rare gene modules include genes which are weakly and rarely expressed, yet are highly correlated with each other,
    allowing for robust detection. Global analysis algorithms (such as metacells) tend to ignore or at least discount
    such genes.

    It is therefore useful to explicitly identify, in a pre-processing step, the few cells which express such rare
    gene modules. Once identified, these cells can be exempt from the global algorithm, or the global algorithm can be
    tweaked in some way to pay extra attention to them.

    If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but reproducible algorithm
    will be used to compute pearson correlations.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cells_rare_gene_module``
            The index of the rare gene module each cell expresses the most, or ``-1`` in the common case it does not
            express any rare gene module.

        ``rare_cell``
            A boolean mask for the (few) cells that express a rare gene module.

    Variable (Gene) Annotations
        ``rare_gene_module_<N>``
            A boolean mask for the genes in the gene module with index ``N``.

        ``rare_gene``
            A boolean mask for the genes in any of the rare gene modules.

    If ``inplace``, these are written to the data, and the function returns ``None``. Otherwise they are returned as
    a tuple containing two data frames.

    **Computation Parameters**

    1. Pick as candidates all genes that are expressed in at most ``max_gene_cell_fraction`` (default:
       {max_gene_cell_fraction}) of the cells, and whose maximal value in a cell is at least ``min_gene_maximum``
       (default: {min_gene_maximum}), as long as they do not match the ``forbidden_gene_names`` or the
       ``forbidden_gene_patterns``.

    2. Compute the similarity between the genes using
       :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the ``genes_similarity_method``
       (default: {genes_similarity_method}).

    3. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method`` (default:
       {genes_cluster_method}).

    4. Identify gene modules in the hierarchical clustering which contain at least ``min_genes_of_modules`` genes
       (default: {min_genes_of_modules}), with an average gene-gene cross-correlation of at least
       ``min_module_correlation`` (default: {min_module_correlation}).

    5. Consider the cells expressing any of the genes in the gene module. If the expected number of such cells in
       each random pile of size ``target_pile_size`` (default: {target_pile_size}), whose total number of UMIs of the
       rare gene module is at least ``min_cell_module_total`` (default: {min_cell_module_total}), is more than the
       ``max_cells_factor_of_random_pile`` (default: {max_cells_factor_of_random_pile}) as a fraction of the mean
       metacells size, then discard the rare gene module as not that rare after all.

    6. Add to the gene module all genes whose fraction in cells expressing any of the genes in the rare gene module is
       at least 2^``min_related_gene_fold_factor`` (default: {min_related_gene_fold_factor}) times their fraction in
       the rest of the population, as long as their maximal value in one of the expressing cells is at least
       ``min_gene_maximum``, as long as this doesn't add more than ``max_related_gene_increase_factor`` times the
       original number of cells to the rare gene module, and as long as they do not match the
       ``forbidden_gene_names`` or the ``forbidden_gene_patterns``. If a gene is above the threshold for multiple
       gene modules, associate it with the gene module for which its fold factor is higher.

    7. Associate cells with the rare gene module if they contain at least ``min_cell_module_total`` (default:
       {min_cell_module_total}) UMIs of the expanded rare gene module. If a cell meets the above threshold for
       several rare gene modules, it is associated with the one for which it contains more UMIs.

    8. Discard modules which have less than ``min_cells_of_modules`` (default: {min_cells_of_modules}) cells or whose
       total UMIs are less than the ``target_metacell_size`` (default: {target_metacell_size}) times the
       ``min_modules_size_factor`` (default: {min_modules_size_factor}).
    """
    assert min_cells_of_modules > 0
    assert min_genes_of_modules > 0

    # The mean number of cells needed to reach the target metacell size.
    mean_umis_per_cell = np.sum(ut.get_v_numpy(adata, what, sum=True)) / adata.n_obs
    mean_metacells_size = target_metacell_size / mean_umis_per_cell
    ut.log_calc("mean_metacells_size", mean_metacells_size)

    max_cells_of_random_pile = mean_metacells_size * max_cells_factor_of_random_pile
    ut.log_calc("max_cells_of_random_pile", max_cells_of_random_pile)

    forbidden_genes_mask = find_named_genes(adata, names=forbidden_gene_names, patterns=forbidden_gene_patterns)
    assert forbidden_genes_mask is not None
    allowed_genes_mask = ~forbidden_genes_mask.values
    ut.log_calc("allowed_genes_mask", allowed_genes_mask)

    # By default no cell belongs to any rare gene module.
    rare_module_of_cells = np.full(adata.n_obs, -1, dtype="int32")

    def final_results(
        gene_indices_of_modules: List[List[int]],
    ) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
        # Package the (possibly empty) modules into the results of the function.
        return _results(
            adata=adata,
            rare_module_of_cells=rare_module_of_cells,
            list_of_rare_gene_indices_of_modules=gene_indices_of_modules,
            inplace=inplace,
        )

    candidates = _pick_candidates(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        max_gene_cell_fraction=max_gene_cell_fraction,
        min_gene_maximum=min_gene_maximum,
        min_genes_of_modules=min_genes_of_modules,
        allowed_genes_mask=allowed_genes_mask,
    )
    if candidates is None:
        # Without candidate genes, report that there are no rare gene modules.
        return final_results([])
    candidate_data, candidate_genes_indices = candidates

    similarities_between_candidate_genes = _genes_similarity(
        candidate_data=candidate_data,
        what=what,
        method=genes_similarity_method,
        reproducible=reproducible,
    )

    linkage = _cluster_genes(
        similarities_between_candidate_genes=similarities_between_candidate_genes,
        genes_cluster_method=genes_cluster_method,
    )

    rare_gene_indices_of_modules = _identify_genes(
        candidate_genes_indices=candidate_genes_indices,
        similarities_between_candidate_genes=similarities_between_candidate_genes,
        linkage=linkage,
        min_module_correlation=min_module_correlation,
    )

    # Scale the limit from a single random pile to the number of piles in the whole data.
    max_cells_of_modules = int(max_cells_of_random_pile * adata.n_obs / target_pile_size)
    ut.log_calc("max_cells_of_modules", max_cells_of_modules)

    related_gene_indices_of_modules = _related_genes(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        rare_gene_indices_of_modules=rare_gene_indices_of_modules,
        allowed_genes_mask=allowed_genes_mask,
        min_genes_of_modules=min_genes_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        min_gene_maximum=min_gene_maximum,
        min_related_gene_fold_factor=min_related_gene_fold_factor,
        max_related_gene_increase_factor=max_related_gene_increase_factor,
    )

    _identify_cells(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        min_cell_module_total=min_cell_module_total,
        rare_module_of_cells=rare_module_of_cells,
    )

    compressed_gene_indices_of_modules = _compress_modules(
        adata_of_all_genes_of_all_cells=adata,
        what=what,
        min_cells_of_modules=min_cells_of_modules,
        max_cells_of_modules=max_cells_of_modules,
        target_metacell_size=target_metacell_size,
        min_modules_size_factor=min_modules_size_factor,
        related_gene_indices_of_modules=related_gene_indices_of_modules,
        rare_module_of_cells=rare_module_of_cells,
    )

    return final_results(compressed_gene_indices_of_modules)
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    """
    Expand each rare gene module with additional genes which are related to it.

    Modules with fewer than ``min_genes_of_modules`` genes are ignored. A module is also dropped if the number of
    cells with a positive total of the module's genes is above ``max_cells_of_modules`` or below
    ``min_cells_of_modules``.

    A gene is a candidate for joining a module if it is allowed by ``allowed_genes_mask``, its maximal value in the
    module's expressing cells is at least ``min_gene_maximum``, and its fraction in the expressing cells is at least
    2^``min_related_gene_fold_factor`` times its fraction in all the other cells. A candidate which belongs to
    another (large enough) rare gene module is rejected. Any other candidate is added unless this makes the number of
    cells whose module total is at least ``min_cell_module_total`` exceed ``max_related_gene_increase_factor`` times
    the number of such cells for the original genes of the module.

    Returns one list of gene indices (the original genes followed by the added ones) for each module which was not
    dropped. The index used for a module in the step log messages and in the names of the sliced data counts the
    dropped modules as well, so it may be higher than the position of the module in the returned list.
    """
    var_names = adata_of_all_genes_of_all_cells.var_names
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    # All the genes which are in some large enough module; these are never added to another module.
    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(rare_gene_indices_of_module)

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue

        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc("rare_gene_names", sorted(var_names[rare_gene_indices_of_module]))

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )

            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True
            )

            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0
            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )

            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module, what, sum=True
            )

            data = ut.get_vo_proper(adata_of_all_genes_of_expressed_cells_of_module, what, layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data, per="column")

            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes - total_expressed_cells_umis_of_all_genes
            )

            # Use the vectorized sum rather than the builtin, which iterates over the elements one by one.
            expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / np.sum(
                total_expressed_cells_umis_of_all_genes
            )
            background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / np.sum(
                total_background_cells_umis_of_all_genes
            )

            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (
                    expressed_cells_fraction_of_all_genes
                    >= background_cells_fraction_of_all_genes * (2**min_related_gene_fold_factor)
                )
            )

            related_gene_indices = np.where(mask_of_related_genes)[0]
            # The original genes of the module are required to pass the tests for the related genes.
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module,
            )
            total_base_genes_of_all_cells = ut.get_o_numpy(base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc("candidate_gene_names", sorted(var_names[related_gene_indices]))
                # Despite the name of this log entry, the logged value is the number of strong base cells.
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            # Build the set once, for a fast membership test of each of the candidate genes.
            rare_gene_indices_set_of_module = set(rare_gene_indices_of_module)
            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                if gene_index in rare_gene_indices_set_of_module:
                    # Already in the module (it was copied above).
                    continue

                gene_name = var_names[gene_index]
                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(f"- candidate gene {gene_name} belongs to another module")
                    continue

                related_gene_of_all_cells_adata = ut.slice(
                    adata_of_all_genes_of_all_cells,
                    name=f".{gene_name}",
                    vars=np.array([gene_index]),
                )
                assert related_gene_of_all_cells_adata.n_vars == 1
                total_related_genes_of_all_cells = ut.get_o_numpy(related_gene_of_all_cells_adata, what, sum=True)
                total_related_genes_of_all_cells += total_base_genes_of_all_cells

                mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total
                count_of_strong_related_cells = np.sum(mask_of_strong_related_cells)
                ut.log_calc(
                    f"- candidate gene {gene_name} "
                    f"strong cells: {count_of_strong_related_cells} "
                    f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}"
                )
                # Reject a gene which would make too many additional cells count as strong for the module.
                if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells:
                    continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(related_gene_indices_of_module)

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at
    the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
    size grows. Above O(10,000) you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation
        matrix, and only then extracts the sparse graph out of it. We actually need two copies where
        each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
        O(100,000,000,000). In addition, the implementation is serial for the graph clustering
        phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation
        phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
        phase. This might increase the "reasonable" size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to
        O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
        scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
        very small number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables
    are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Returns the feature data (the result of
    :py:func:`metacells.pipeline.feature.extract_feature_data`), and sets the following annotations
    in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a
            similar expression level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant). This will be zero for non-"feature" genes.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells. This is only set if the K-Nearest-Neighbors graph was actually
            computed (see below).

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment ("outliers") are given a metacell
            index of ``-1``.

        ``outlier``
            A boolean mask of the cells contained in no metacell.

    Observation-Observation (Cell-Cell) Annotations
        ``obs_similarity``, ``obs_outgoing_weights``
            Copied from the feature data, only if the K-Nearest-Neighbors graph was actually
            computed (see below).

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data
       from the clean data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}),
       ``feature_min_gene_top3`` (default: {feature_min_gene_top3}),
       ``feature_min_gene_relative_variance`` (default: {feature_min_gene_relative_variance}),
       ``feature_gene_names`` (default: {feature_gene_names}),
       ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}),
       ``forbidden_gene_patterns`` (default: {forbidden_gene_patterns})
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. If this yields no feature data, raise a ``ValueError``.

    2. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}),
       ``max_cell_size_factor`` (default: {max_cell_size_factor})
       and
       ``cell_sizes`` (default: {cell_sizes})
       to get the effective cell sizes to use. If this gives a maximal cell size, the
       ``target_metacell_size`` (default: {target_metacell_size}) is increased (if needed) to be at
       least this maximal cell size times each of the specified ``candidates_min_metacell_cells``
       and ``dissolve_min_metacell_cells``.

    3. Take the ``downsampled`` feature data, and if the
       ``cells_similarity_value_normalization`` (default: {cells_similarity_value_normalization})
       is positive, add it to the data.

    4. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    5. If ``knn_k`` (default: {knn_k}) is not specified, then it is chosen to be the median number
       of cells required to reach the target metacell size, but at least ``min_knn_k`` (default:
       {min_knn_k}). If the resulting ``knn_k`` is zero, or is at least the number of cells, then
       all the cells are placed in a single candidate metacell and the next three steps are
       skipped.

    6. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
       similarity between each pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}). This is asked to be
       reproducible unless the ``random_seed`` is zero.

    7. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a
       K-Nearest-Neighbors graph, using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).

    8. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
       the candidate metacells, using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       ``candidates_max_split_min_cut_strength`` (default: {candidates_max_split_min_cut_strength}),
       ``candidates_min_cut_seed_cells`` (default: {candidates_min_cut_seed_cells}),
       ``must_complete_cover`` (default: {must_complete_cover}),
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
       metacells, using the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    10. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
        :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
        metacells, using the same
        ``target_metacell_size`` (default: {target_metacell_size}),
        and the effective cell sizes and the
        ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
        ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
        ``dissolve_min_convincing_gene_fold_factor`` (default:
        {dissolve_min_convincing_gene_fold_factor})
        and
        ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).

        If ``must_complete_cover``, then instead the candidate metacells are used as-is as the
        final metacells, and the deviant votes and dissolved annotations are set to all-zero.
    """
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    # Note this rebinds ``max_cell_size`` to the value returned by the helper.
    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    # Make the target size large enough to hold the minimal number of maximal-sized cells.
    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    # Work on a dense copy so the in-place normalization below does not modify the feature data.
    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    if knn_k is None:
        # Without explicit sizes, every cell counts as one.
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    if knn_k == 0:
        # Degenerate graph: place all the cells in a single candidate metacell.
        ut.log_calc("knn_k: 0 (too small, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
    elif knn_k >= fdata.n_obs:
        # Fewer cells than neighbors: again, place all the cells in a single candidate metacell.
        ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    else:
        ut.log_calc("knn_k", knn_k)

        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        # The single-metacell branches above only set "candidate" in the feature data, so the
        # graph and seed annotations are copied to the full data only in this branch.
        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        # Every cell must have been given a candidate, which then becomes its final metacell;
        # there are no deviants and nothing is dissolved.
        assert np.min(candidate_of_cells) == 0

        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)
        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)
        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

    # Outliers are the cells that ended up with a negative metacell index.
    metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

    outlier_of_cells = metacell_of_cells < 0
    ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask (as a pandas series indexed by
    the observation or variable names). Otherwise, sets the mask as an annotation (per-variable or
    per-observation depending on the type of the combined masks) and returns ``None``.

    Raises ``KeyError`` if a mask which must exist is missing, and ``ValueError`` if the masks mix
    per-observation and per-variable data or if there are no masks left to combine.

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or
       if it has a ``&`` prefix), bitwise-AND the mask into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None
    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        log_mask_name = mask_name

        # Use ``startswith``/``endswith`` rather than indexing, so an empty (or prefix-only) name
        # is reported below as an unknown mask instead of raising a bare ``IndexError`` here.
        is_or = mask_name.startswith("|")
        if is_or:
            mask_name = mask_name[1:]
        elif mask_name.startswith("&"):
            mask_name = mask_name[1:]

        invert_mask = mask_name.startswith("~")
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = not mask_name.endswith("?")
        if not must_exist:
            mask_name = mask_name[:-1]

        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        # The ``> 0`` comparison above already produces a boolean array, so this is a purely
        # defensive check.
        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else or_mask | mask
        else:
            and_mask = mask if and_mask is None else and_mask & mask

    if and_mask is None and or_mask is None:
        # Only possible if every mask was optional and missing.
        raise ValueError("no masks to combine")

    if and_mask is None:
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None
def compute_significant_projected_fold_factors(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    total_umis: Optional[ut.Vector],
    projected: Union[str, ut.Matrix] = "projected",
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    min_gene_fold_factor: float = pr.project_max_projection_fold_factor,
    min_entry_fold_factor: float = pr.min_entry_project_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Compute the significant projected fold factors of genes for each query metacell.

    For each metacell of the query, this computes the fold factors between the actual query UMIs
    and the UMIs of the projection of the metacell onto the atlas (see
    :py:func:`metacells.tools.project.project_query_onto_atlas`). The resulting
    per-metacell-per-gene matrix is then made sparse by setting too-low values to zero.

    Ideally, this matrix should be "very" sparse. If it contains "too many" non-zero values, more
    genes need to be ignored by the projection, or somehow corrected for batch effects prior to
    computing the projection.

    **Input**

    Annotated ``adata``, where the observations are query metacells and the variables are genes,
    where ``what`` (default: {what}) is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix. In addition, the
    ``projected`` (default: {projected}) UMIs of each query metacell onto the atlas. The
    ``total_umis`` are used as the per-metacell sums when converting both matrices to fractions.

    **Returns**

    Sets the following in ``adata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``projected_fold``
            For each gene and query metacell, the fold factor of this gene between the query and
            its projection (unless the value is too low to be of interest, in which case it will
            be zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's fold factor
       log2((actual fraction + ``fold_normalization``) / (expected fraction +
       ``fold_normalization``)), similarly to
       :py:func:`metacells.tools.project.project_query_onto_atlas` (the default
       ``fold_normalization`` is {fold_normalization}).

    2. Set the fold factor to zero for every entry where the sum of the UMIs in the query metacell
       and in the projected image is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. If the maximal fold factor for a gene (across all metacells) is below
       ``min_gene_fold_factor`` (default: {min_gene_fold_factor}), then set all the gene's fold
       factors to zero (too low to be of interest).

    4. Otherwise, for any metacell whose fold factor for the gene is less than
       ``min_entry_fold_factor`` (default: {min_entry_fold_factor}), set the fold factor to zero
       (too low to be of interest). If ``abs_folds`` (default: {abs_folds}), consider the absolute
       fold factors.
    """
    assert 0 <= min_entry_fold_factor <= min_gene_fold_factor
    assert fold_normalization >= 0

    query_umis = ut.get_vo_proper(adata, what, layout="row_major")
    image_umis = ut.get_vo_proper(adata, projected, layout="row_major")

    # Both matrices are scaled by the same per-metacell totals.
    query_fractions = ut.fraction_by(query_umis, by="row", sums=total_umis)
    image_fractions = ut.fraction_by(image_umis, by="row", sums=total_umis)
    query_fractions += fold_normalization  # type: ignore
    image_fractions += fold_normalization  # type: ignore

    # Take the log in place, reusing the ratio's storage.
    folds = query_fractions / image_fractions  # type: ignore
    np.log2(folds, out=folds)

    # Entries backed by too few UMIs (query plus image) are not trusted.
    umis_of_entries = ut.to_numpy_matrix(query_umis + image_umis)  # type: ignore
    is_insignificant = umis_of_entries < min_significant_gene_value
    ut.log_calc("insignificant entries", is_insignificant)
    folds[is_insignificant] = 0.0

    ut.set_vo_data(
        adata,
        "projected_fold",
        significant_folds_matrix(folds, min_gene_fold_factor, min_entry_fold_factor, abs_folds),
    )
def compute_inner_fold_factors(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    min_gene_inner_fold_factor: float = pr.min_gene_inner_fold_factor,
    min_entry_inner_fold_factor: float = pr.min_entry_inner_fold_factor,
    inner_abs_folds: bool = pr.inner_abs_folds,
) -> None:
    """
    Compute the inner fold factors of genes within each metacell.

    This computes, for each cell of the metacell, the same fold factors that are used to detect
    deviant cells (see :py:func:`metacells.tools.deviants.find_deviant_cells`), and keeps the
    maximal fold for each gene in the metacell. The result per-metacell-per-gene matrix is then
    made sparse by discarding too-low values (setting them to zero). Ideally, this matrix should be
    "very" sparse. If it contains "too many" non-zero values, this indicates the metacells contains
    "too much" variability. This may be due to actual biology (e.g. immune cells or olfactory
    nerves which are all similar except for each one expressing one different gene), due to batch
    effects (similar cells in distinct batches differing in some genes due to technical issues),
    due to low data quality (the overall noise level is so high that this is simply the best the
    algorithm can do), or worse - a combination of the above.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same
    genes as ``adata``. The ``group`` (default: {group}) gives the group each cell belongs to.

    **Returns**

    Sets the following in ``gdata``:

    Per-Variable Per-Observation (Gene-Cell) Annotations

        ``inner_fold``
            For each gene and group, the maximal fold factor of this gene in any cell contained in
            the group (unless the value is too low to be of interest, in which case it will be
            zero).

    **Computation Parameters**

    1. For each group (metacell), for each gene, compute the gene's maximal (in all the cells of
       the group) fold factor log2((actual UMIs + 1) / (expected UMIs + 1)), similarly to
       :py:func:`metacells.tools.deviants.find_deviant_cells`.

    2. If the maximal fold factor for a gene (across all metacells) is below
       ``min_gene_inner_fold_factor`` (default: {min_gene_inner_fold_factor}), then set all the
       gene's fold factors to zero (too low to be of interest).

    3. Otherwise, for any metacell whose fold factor for the gene is less than
       ``min_entry_inner_fold_factor`` (default: {min_entry_inner_fold_factor}), set the fold
       factor to zero (too low to be of interest).

    If ``inner_abs_folds`` (default: {inner_abs_folds}), both of the above thresholds are compared
    against the absolute fold factors.
    """
    assert 0 <= min_entry_inner_fold_factor <= min_gene_inner_fold_factor

    cells_data = ut.get_vo_proper(adata, what, layout="row_major")
    metacells_data = ut.get_vo_proper(gdata, what, layout="row_major")
    group_of_cells = ut.get_o_numpy(adata, group, formatter=ut.groups_description)
    total_umis_per_cell = ut.sum_per(cells_data, per="row")
    total_umis_per_metacell = ut.sum_per(metacells_data, per="row")

    # Bind the shared inputs so the parallel map only needs to pass the metacell index.
    @ut.timed_call("compute_metacell_inner_folds")
    def _compute_single_metacell_inner_folds(metacell_index: int) -> ut.NumpyVector:
        return _compute_metacell_inner_folds(
            metacell_index=metacell_index,
            cells_data=cells_data,
            metacells_data=metacells_data,
            group_of_cells=group_of_cells,
            total_umis_per_cell=total_umis_per_cell,
            total_umis_per_metacell=total_umis_per_metacell,
        )

    results = list(ut.parallel_map(_compute_single_metacell_inner_folds, gdata.n_obs))
    dense_inner_folds_by_row = np.array(results)

    # The per-gene filtering below works on columns, so switch layout for it.
    dense_inner_folds_by_column = ut.to_layout(dense_inner_folds_by_row, "column_major")
    if inner_abs_folds:
        # A separate array, so zeroing entries below does not affect the values compared.
        comparable_dense_inner_folds_by_column = np.abs(dense_inner_folds_by_column)
    else:
        comparable_dense_inner_folds_by_column = dense_inner_folds_by_column

    max_fold_per_gene = ut.max_per(comparable_dense_inner_folds_by_column, per="column")
    significant_genes_mask = max_fold_per_gene >= min_gene_inner_fold_factor
    ut.log_calc("significant_genes_mask", significant_genes_mask)

    # First drop whole genes, then drop the individual too-low entries of the remaining genes.
    dense_inner_folds_by_column[:, ~significant_genes_mask] = 0
    dense_inner_folds_by_column[comparable_dense_inner_folds_by_column < min_entry_inner_fold_factor] = 0

    dense_inner_folds_by_row = ut.to_layout(dense_inner_folds_by_column, layout="row_major")
    sparse_inner_folds = sparse.csr_matrix(dense_inner_folds_by_row)
    ut.set_vo_data(gdata, "inner_fold", sparse_inner_folds)
def compute_type_compatible_sizes(
    adatas: List[AnnData],
    *,
    size: str = "grouped",
    kind: str = "type",
) -> None:
    """
    Given multiple annotated data of groups, compute a "compatible" size for each one to allow for
    consistent inner normalized variance comparison.

    Since the inner normalized variance quality measure is sensitive to the group (metacell) sizes,
    it is useful to artificially shrink the groups so the sizes will be similar between the
    compared data sets. Assuming each group (metacell) has a type annotation, for each such type,
    we give each one a "compatible" size (less than or equal to its actual size) so that using this
    reduced size will give us comparable measures between all the data sets.

    The "compatible" sizes are chosen such that the density distributions of the sizes in all data
    sets would be as similar to each other as possible.

    .. note::

        This is only effective if the groups are "similar" in size. Using this to compare very
        coarse grouping (few thousands of cells) with fine-grained ones (few dozens of cells) will
        still result in very different results.

    **Input**

    Several annotated ``adatas`` where each observation is a group. Should contain per-observation
    ``size`` annotation (default: {size}) and ``kind`` annotation (default: {kind}).

    **Returns**

    Sets the following in each ``adata``:

    Per-Observation (group) Annotations:

        ``compatible_size``
            The number of grouped cells in the group to use for computing excess R^2 and inner
            normalized variance.

    **Computation**

    If there is only a single data set, the "compatible" size of each group is simply its size.
    Otherwise:

    1. For each type, sort the groups (metacells) in increasing number of grouped observations
       (cells).

    2. Consider the maximal quantile (rank) of the next smallest group (metacell) in each data set.

    3. Compute the minimal number of grouped observations in all the metacells whose quantile is up
       to this maximal quantile.

    4. Use this as the "compatible" size for all these groups, and remove them from consideration.

    5. Loop until all groups are assigned a "compatible" size.
    """
    assert len(adatas) > 0

    if len(adatas) == 1:
        ut.set_o_data(
            adatas[0], "compatible_size", ut.get_o_numpy(adatas[0], size, formatter=ut.sizes_description)
        )
        return

    group_sizes_of_data = [ut.get_o_numpy(adata, size, formatter=ut.sizes_description) for adata in adatas]
    group_types_of_data = [ut.get_o_numpy(adata, kind) for adata in adatas]

    unique_types: Set[Any] = set()
    for group_types in group_types_of_data:
        unique_types.update(group_types)

    compatible_size_of_data = [np.full(adata.n_obs, -1) for adata in adatas]

    # The size order of the groups does not depend on the type, so compute it just once.
    size_order_of_data = [np.argsort(group_sizes) for group_sizes in group_sizes_of_data]

    groups_count_of_data: List[int] = []
    for type_index, group_type in enumerate(sorted(unique_types)):
        with ut.log_step(f"- {group_type}", ut.progress_description(len(unique_types), type_index, "type")):
            # The indices of the groups of this type, in increasing size order. The type mask must
            # be taken in the sorted order; masking the sorted indices by the types in the original
            # order would pick groups of the wrong types.
            sorted_group_indices_of_data = [
                size_order[group_types[size_order] == group_type]
                for size_order, group_types in zip(size_order_of_data, group_types_of_data)
            ]

            groups_count_of_data = [len(sorted_group_indices) for sorted_group_indices in sorted_group_indices_of_data]

            ut.log_calc("group_counts", groups_count_of_data)

            # Restrict a per-data-set list to the data sets which contain groups of this type.
            def _for_each(value_of_data: List[T]) -> List[T]:
                return [value for groups_count, value in zip(groups_count_of_data, value_of_data) if groups_count > 0]

            groups_count_of_each = _for_each(groups_count_of_data)

            if len(groups_count_of_each) == 0:
                continue

            sorted_group_indices_of_each = _for_each(sorted_group_indices_of_data)
            group_sizes_of_each = _for_each(group_sizes_of_data)
            compatible_size_of_each = _for_each(compatible_size_of_data)

            if len(groups_count_of_each) == 1:
                # Nothing to be compatible with, so use the actual sizes. The general loop below
                # would assign exactly the same values, one group at a time.
                compatible_size_of_each[0][sorted_group_indices_of_each[0]] = group_sizes_of_each[0][
                    sorted_group_indices_of_each[0]
                ]
                continue

            # The quantile (in the range (0, 1]) of each group within its data set.
            group_quantile_of_each = [
                (np.arange(len(sorted_group_indices)) + 1) / len(sorted_group_indices)
                for sorted_group_indices in sorted_group_indices_of_each
            ]
            next_position_of_each = np.full(len(group_quantile_of_each), 0)

            while True:
                next_quantile_of_each = [
                    group_quantile[next_position]
                    for group_quantile, next_position in zip(group_quantile_of_each, next_position_of_each)
                ]
                next_quantile = max(next_quantile_of_each)

                # Advance every data set to cover all its groups up to the chosen quantile; each
                # data set advances by at least one group.
                last_position_of_each = next_position_of_each.copy()
                next_position_of_each[:] = [
                    np.sum(group_quantile <= next_quantile) for group_quantile in group_quantile_of_each
                ]

                positions_of_each = [
                    range(last_position, next_position)
                    for last_position, next_position in zip(last_position_of_each, next_position_of_each)
                ]

                sizes_of_each = [
                    group_sizes[sorted_group_indices[positions]]
                    for group_sizes, sorted_group_indices, positions in zip(
                        group_sizes_of_each, sorted_group_indices_of_each, positions_of_each
                    )
                ]

                min_size = min(np.min(sizes) for sizes in sizes_of_each)

                for sorted_group_indices, positions, compatible_size in zip(
                    sorted_group_indices_of_each, positions_of_each, compatible_size_of_each
                ):
                    compatible_size[sorted_group_indices[positions]] = min_size

                is_done_of_each = [
                    next_position == groups_count
                    for next_position, groups_count in zip(next_position_of_each, groups_count_of_each)
                ]
                if all(is_done_of_each):
                    break

                # The last quantile of every data set is 1, so they must all finish together.
                assert not any(is_done_of_each)

    for adata, compatible_size in zip(adatas, compatible_size_of_data):
        assert np.min(compatible_size) > 0
        ut.set_o_data(adata, "compatible_size", compatible_size)
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    """
    Fill row ``group_index`` of the two per-gene-per-group output matrices.

    The cells of the group are the ones whose entry in ``group_of_cells`` equals ``group_index``.
    If there are fewer than two such cells, nothing is written and the function returns.

    If ``compatible_size`` is not ``None`` and is smaller than the number of cells in the group,
    the global numpy random generator is seeded with ``random_seed`` and exactly
    ``compatible_size`` of the cells are picked (without replacement).

    The data of the chosen cells (``cells_data`` must be in ``row_major`` layout) is downsampled
    per cell to a common number of samples: the ``downsample_min_cell_quantile`` quantile of the
    total per cell, raised to at least ``downsample_min_samples``, and capped by the
    ``downsample_max_cell_quantile`` quantile of the total per cell.

    The variance and the normalized variance of each gene are computed from the downsampled data.
    Genes whose downsampled total is below ``min_gene_total`` are assigned ``None`` (presumably
    stored as NaN, assuming the per-gene results are floating point arrays). The results are
    stored in row ``group_index`` of ``variance_per_gene_per_group`` and of
    ``normalized_variance_per_gene_per_group``.
    """
    members = np.where(group_of_cells == group_index)[0]
    members_count = len(members)
    if members_count < 2:
        return

    if compatible_size is None:
        ut.log_calc(" cells", members_count)
    else:
        assert 0 < compatible_size <= members_count
        if compatible_size < members_count:
            # Seed the global generator right before the choice so the subset is reproducible.
            np.random.seed(random_seed)
            if ut.logging_calc():
                ratio_text = ut.ratio_description(
                    len(members), "cell", compatible_size, "compatible"
                )
                ut.log_calc(" cells: " + ratio_text)
            members = np.random.choice(
                members, size=compatible_size, replace=False
            )
            assert len(members) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    members_data = cells_data[members, :]
    total_per_member = ut.sum_per(members_data, per="row")

    # The target is the low quantile (but no less than the minimum), capped by the high quantile.
    floor_samples = max(
        downsample_min_samples,
        np.quantile(total_per_member, downsample_min_cell_quantile),
    )
    ceiling_samples = np.quantile(total_per_member, downsample_max_cell_quantile)
    samples = int(round(min(floor_samples, ceiling_samples)))
    if ut.logging_calc():
        ut.log_calc(f" samples: {samples}")

    downsampled_rows = ut.downsample_matrix(
        members_data, per="row", samples=samples, random_seed=random_seed
    )

    # The per-gene computations below work on columns, so switch the layout once.
    downsampled_columns = ut.to_layout(downsampled_rows, layout="column_major")
    gene_totals = ut.sum_per(downsampled_columns, per="column")
    excluded_genes = gene_totals < min_gene_total
    if ut.logging_calc():
        included_genes_count = len(excluded_genes) - np.sum(excluded_genes)
        ut.log_calc(f" included genes: {included_genes_count}")

    gene_variance = ut.variance_per(downsampled_columns, per="column")
    gene_normalized_variance = ut.normalized_variance_per(
        downsampled_columns, per="column"
    )

    # Genes with too little data get no value rather than an unreliable one.
    gene_variance[excluded_genes] = None
    gene_normalized_variance[excluded_genes] = None

    variance_per_gene_per_group[group_index, :] = gene_variance
    normalized_variance_per_gene_per_group[group_index, :] = gene_normalized_variance
def find_metacells_significant_genes(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_gene_range_fold: float = pr.min_significant_metacells_gene_range_fold_factor,
    normalization: float = pr.metacells_gene_range_normalization,
    min_gene_fraction: float = pr.min_significant_metacells_gene_fraction,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Find genes which have a significant signal in metacells data. This computation is too
    unreliable to be used on cells.

    Find genes which have a high maximal expression in at least one metacell, and a wide range of
    expression across the metacells. Such genes are good candidates for being used as marker genes
    and/or to compute distances between metacells.

    **Input**

    Annotated ``adata`` (expected to hold metacells data), where the variables are genes, and
    where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``significant_gene``
            A boolean mask indicating whether each gene was found to be significant.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. Compute the minimal and maximal expression level (fraction out of each observation's total)
       of each gene.

    2. Select the genes whose fold factor, that is, log2 of the maximal over the minimal value,
       after adding the ``normalization`` (default: {normalization}) to both, is at least
       ``min_gene_range_fold`` (default: {min_gene_range_fold}).

    3. Select the genes whose maximal expression (before adding the normalization) is at least
       ``min_gene_fraction`` (default: {min_gene_fraction}).

    A gene is significant only if it was selected by both of the last two steps.
    """
    assert normalization >= 0

    values = ut.get_vo_proper(adata, what, layout="row_major")
    fraction_per_gene = ut.to_layout(
        ut.fraction_by(values, by="row"), layout="column_major"
    )
    lowest_of_genes = ut.min_per(fraction_per_gene, per="column")
    highest_of_genes = ut.max_per(fraction_per_gene, per="column")

    # This mask must be taken now, as the maxima are overwritten in-place below.
    is_high_max_gene = highest_of_genes >= min_gene_fraction
    ut.log_calc("high max fraction genes", is_high_max_gene)

    # Reuse the buffer of the maxima for the ratio, and then for its log2, instead of
    # allocating additional per-gene arrays.
    lowest_of_genes += normalization
    highest_of_genes += normalization
    highest_of_genes /= lowest_of_genes
    fold_of_genes = np.log2(highest_of_genes, out=highest_of_genes)

    is_high_range_gene = fold_of_genes >= min_gene_range_fold
    ut.log_calc("high range genes", is_high_range_gene)

    is_significant_gene = is_high_max_gene & is_high_range_gene

    if not inplace:
        ut.log_return("significant_genes", is_significant_gene)
        return ut.to_pandas_series(is_significant_gene, index=adata.var_names)

    ut.set_v_data(adata, "significant_gene", is_significant_gene)
    return None