def _filter_genes(
    *,
    cells_count: int,
    genes_count: int,
    fold_factors: ut.CompressedMatrix,
    min_gene_fold_factor: float,
    max_gene_fraction: Optional[float] = None,
) -> ut.NumpyVector:
    """
    Return the indices of the "deviant" genes: genes whose maximal fold factor
    (across all cells) is at least ``min_gene_fold_factor``.

    If ``max_gene_fraction`` is given and too many genes qualify, the threshold is
    raised to the matching quantile of the per-gene maximal fold factors, and the
    ``fold_factors`` sparse matrix is pruned in place of entries below the raised
    threshold.
    """
    ut.timed_parameters(cells=cells_count, genes=genes_count, fold_factors=fold_factors.nnz)

    gene_max_folds = ut.max_per(fold_factors, per="column")
    assert gene_max_folds.size == genes_count

    deviant_genes_mask = gene_max_folds >= min_gene_fold_factor

    if max_gene_fraction is not None:
        deviant_fraction = np.sum(deviant_genes_mask) / genes_count
        if deviant_fraction > max_gene_fraction:
            if ut.logging_calc():
                ut.log_calc("candidate_deviant_genes", deviant_genes_mask)

            # Raise the threshold so that at most ``max_gene_fraction`` of the
            # genes remain above it.
            quantile_fold = np.quantile(gene_max_folds, 1 - max_gene_fraction)
            assert quantile_fold is not None
            ut.log_calc("quantile_gene_fold_factor", quantile_fold)

            if quantile_fold > min_gene_fold_factor:
                min_gene_fold_factor = quantile_fold
                deviant_genes_mask = gene_max_folds >= min_gene_fold_factor

                # Drop the now-sub-threshold entries from the sparse matrix.
                fold_factors.data[fold_factors.data < min_gene_fold_factor] = 0
                ut.eliminate_zeros(fold_factors)

    if ut.logging_calc():
        ut.log_calc("deviant_genes", deviant_genes_mask)

    return np.where(deviant_genes_mask)[0]
def store_matrix(matrix: ut.CompressedMatrix, name: str, when: bool) -> None:
    # Store the matrix under "<elements>_<name>" when requested; otherwise only
    # log its density ("N element M nonzero") if calc-logging is enabled.
    # NOTE(review): relies on ``adata``, ``elements`` and ``set_data`` from the
    # enclosing scope.
    def describe(described: ut.CompressedMatrix) -> str:
        return ut.ratio_description(
            described.shape[0] * described.shape[1], "element", described.nnz, "nonzero")

    if when:
        set_data(adata, f"{elements}_{name}", matrix, formatter=describe)
    elif ut.logging_calc():
        ut.log_calc(f"{elements}_{name}", describe(matrix))
def _filter_cells(
    *,
    cells_count: int,
    genes_count: int,
    deviant_genes_fold_ranks: ut.NumpyMatrix,
    deviant_gene_indices: ut.NumpyVector,
    max_cell_fraction: Optional[float],
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    """
    Count, per cell and per gene, the "deviant votes": entries of
    ``deviant_genes_fold_ranks`` below a threshold rank.

    A cell is a deviant candidate if its best (minimal) fold rank across the
    deviant genes is below the threshold. If ``max_cell_fraction`` is given and
    too many cells qualify, the threshold is lowered to the matching quantile of
    the per-cell minimal fold ranks.

    Returns a tuple of (votes per cell, votes per gene), where the per-gene votes
    are scattered back to the full ``genes_count``-sized vector via
    ``deviant_gene_indices``.
    """
    min_fold_ranks_of_cells = np.min(deviant_genes_fold_ranks, axis=1)
    assert min_fold_ranks_of_cells.size == cells_count

    threshold_cells_fold_rank = cells_count

    mask_of_deviant_cells = min_fold_ranks_of_cells < threshold_cells_fold_rank
    # np.sum (not builtin sum) - C-speed reduction over the boolean mask.
    deviants_cells_count = np.sum(mask_of_deviant_cells)
    deviant_cell_fraction = deviants_cells_count / cells_count

    if ut.logging_calc():
        ut.log_calc("deviant_cells", mask_of_deviant_cells)

    if max_cell_fraction is not None and deviant_cell_fraction > max_cell_fraction:
        # Lower the threshold so at most ``max_cell_fraction`` of cells qualify.
        quantile_cells_fold_rank = np.quantile(min_fold_ranks_of_cells, max_cell_fraction)
        assert quantile_cells_fold_rank is not None
        ut.log_calc("quantile_cells_fold_rank", quantile_cells_fold_rank)

        if quantile_cells_fold_rank < threshold_cells_fold_rank:
            threshold_cells_fold_rank = quantile_cells_fold_rank

    ut.log_calc("threshold_cells_fold_rank", threshold_cells_fold_rank)
    deviant_votes = deviant_genes_fold_ranks < threshold_cells_fold_rank

    votes_of_deviant_cells = ut.sum_per(ut.to_layout(deviant_votes, "row_major"), per="row")
    assert votes_of_deviant_cells.size == cells_count

    votes_of_deviant_genes = ut.sum_per(deviant_votes, per="column")
    assert votes_of_deviant_genes.size == deviant_gene_indices.size

    # Scatter the per-deviant-gene votes back into a full-genes-sized vector.
    votes_of_all_genes = np.zeros(genes_count, dtype="int32")
    votes_of_all_genes[deviant_gene_indices] = votes_of_deviant_genes

    return votes_of_deviant_cells, votes_of_all_genes
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    # FIX: the min/max quantile defaults were swapped (min defaulted to the
    # *max* parameter value and vice versa), inverting the downsampling range.
    downsample_min_cell_quantile: float = pr.noisy_lonely_downsample_min_cell_quantile,
    downsample_max_cell_quantile: float = pr.noisy_lonely_downsample_max_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Return the indices of genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this
    lonely gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total``
       (default: {min_gene_total}) and a normalized variance of at least
       ``min_gene_normalized_variance`` (default: {min_gene_normalized_variance}).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    # Step 1: optionally sample a random subset of the cells.
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells, replace=False)
        s_data = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    # Step 2: optionally drop the excluded genes.
    if excluded_genes_mask is not None:
        results = filter_data(s_data, name="included", top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    # Step 3: downsample all cells to a comparable total number of UMIs.
    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    # Step 4 (first half): keep only high-total ("ht") genes.
    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data, name="high_total", top_level=False,
                          track_var=track_var, var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        # Step 5: cross-correlate the high-total genes.
        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix, layout="row_major", symmetric=True)
        # A gene must not count as similar to itself.
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        # Step 4 (second half): keep high-total high-variance ("htv") genes.
        htv_mask_series = find_high_normalized_variance_genes(
            ht_data, "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")

            # Step 6: "lonely" = maximal similarity below the threshold ("htvl").
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = \
                    htv_gene_ht_gene_similarity_matrix[htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == \
                    (htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    # Log, per detected gene, its UMI share and most-correlated genes.
                    i_gene_totals = ut.get_v_numpy(i_data, "downsampled", sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} "
                            + f"({gene_percent:.4g}%), correlated with: "
                            + ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in reversed(
                                    sorted(zip(top_similar_ht_values,
                                               top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
def _keep_candidate(  # pylint: disable=too-many-branches
    adata: AnnData,
    candidate_index: int,
    *,
    data: ut.ProperMatrix,
    cell_sizes: Optional[ut.NumpyVector],
    fraction_of_genes: ut.NumpyVector,
    min_metacell_cells: int,
    min_robust_size: Optional[float],
    min_convincing_size: Optional[float],
    min_convincing_gene_fold_factor: float,
    abs_folds: bool,
    candidates_count: int,
    candidate_cell_indices: ut.NumpyVector,
) -> bool:
    """
    Decide whether a candidate metacell should be kept.

    A candidate is kept if it is "robust" (total size at least ``min_robust_size``),
    or "convincing" (total size at least ``min_convincing_size`` and at least one
    gene whose fold factor vs. the expected expression reaches
    ``min_convincing_gene_fold_factor``; with ``abs_folds``, the absolute fold is
    used). Candidates with fewer than ``min_metacell_cells`` cells are always
    rejected. Returns ``True`` to keep the candidate.
    """
    genes_count = data.shape[1]

    if cell_sizes is None:
        candidate_total_size = candidate_cell_indices.size
    else:
        candidate_total_size = np.sum(cell_sizes[candidate_cell_indices])

    def _log_verdict(verdict: str) -> None:
        # Single logging format shared by every decision path.
        ut.log_calc(
            f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
            f"cells: {candidate_cell_indices.size} "
            f"size: {candidate_total_size:g} "
            f"is: {verdict}")

    if candidate_cell_indices.size < min_metacell_cells:
        if ut.logging_calc():
            _log_verdict("little")
        return False

    if min_robust_size is not None and candidate_total_size >= min_robust_size:
        if ut.logging_calc():
            _log_verdict("robust")
        return True

    if min_convincing_size is None:
        if ut.logging_calc():
            _log_verdict("accepted")
        return True

    if candidate_total_size < min_convincing_size:
        if ut.logging_calc():
            _log_verdict("unconvincing")
        return False

    candidate_data = data[candidate_cell_indices, :]
    candidate_data_of_genes = ut.to_numpy_vector(candidate_data.sum(axis=0))
    assert candidate_data_of_genes.size == genes_count
    candidate_total = np.sum(candidate_data_of_genes)
    candidate_expected_of_genes = fraction_of_genes * candidate_total

    # FIX: compute log2((observed + 1) / (expected + 1)) without in-place
    # arithmetic; the previous in-place `+= 1` / `/=` / log2(out=...) sequence
    # raises a numpy casting error when the per-gene sums have an integer dtype.
    fold_of_genes = np.log2(
        (candidate_data_of_genes + 1.0) / (candidate_expected_of_genes + 1.0))

    if abs_folds:
        convincing_genes_mask = np.abs(fold_of_genes) >= min_convincing_gene_fold_factor
    else:
        convincing_genes_mask = fold_of_genes >= min_convincing_gene_fold_factor
    keep_candidate = bool(np.any(convincing_genes_mask))

    if ut.logging_calc():
        convincing_gene_indices = np.where(convincing_genes_mask)[0]
        if keep_candidate:
            _log_verdict("convincing because:")
            # Report convincing genes from the strongest fold factor down.
            for fold_factor, name in reversed(
                    sorted(zip(fold_of_genes[convincing_gene_indices],
                               adata.var_names[convincing_gene_indices]))):
                ut.log_calc(f" {name}: {ut.fold_description(fold_factor)}")
        else:
            _log_verdict("not convincing")

    return keep_candidate
def _compress_modules(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    target_metacell_size: float,
    min_modules_size_factor: float,
    related_gene_indices_of_modules: List[List[int]],
    rare_module_of_cells: ut.NumpyVector,
) -> List[List[int]]:
    """
    Drop modules whose cell count or total UMIs fall outside the allowed range,
    and compact the surviving modules into consecutive indices.

    Cells of dropped modules are reset to -1 in ``rare_module_of_cells``;
    cells of surviving modules are renumbered in place. Returns the gene index
    lists of the surviving modules (in their new order).
    """
    list_of_rare_gene_indices_of_modules: List[List[int]] = []
    list_of_names_of_genes_of_modules: List[List[str]] = []

    min_umis_of_modules = target_metacell_size * min_modules_size_factor
    ut.log_calc("min_umis_of_modules", min_umis_of_modules)

    total_all_genes_of_all_cells = ut.get_o_numpy(
        adata_of_all_genes_of_all_cells, what, sum=True)

    cell_counts_of_modules: List[int] = []

    ut.log_calc("compress modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, gene_indices_of_module in enumerate(related_gene_indices_of_modules):
        if len(gene_indices_of_module) == 0:
            continue
        with ut.log_step(
                "- module",
                module_index,
                formatter=lambda module_index: ut.progress_description(
                    modules_count, module_index, "module"),
        ):
            module_cells_mask = rare_module_of_cells == module_index
            module_cells_count = np.sum(module_cells_mask)
            module_umis_count = np.sum(total_all_genes_of_all_cells[module_cells_mask])

            if module_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            if module_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc("cells", str(module_cells_count) + " (too many)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("cells", module_cells_count)

            if module_umis_count < min_umis_of_modules:
                if ut.logging_calc():
                    ut.log_calc("UMIs", str(module_umis_count) + " (too few)")
                rare_module_of_cells[module_cells_mask] = -1
                continue

            ut.log_calc("UMIs", module_umis_count)

            # Compact surviving modules to consecutive indices.
            # NOTE: the old dead `next_module_index += 1` was removed - the value
            # is recomputed from the list length on every iteration anyway.
            next_module_index = len(list_of_rare_gene_indices_of_modules)
            if module_index != next_module_index:
                ut.log_calc("is reindexed to", next_module_index)
                rare_module_of_cells[module_cells_mask] = next_module_index
                module_index = next_module_index

            list_of_rare_gene_indices_of_modules.append(gene_indices_of_module)
            if ut.logging_calc():
                cell_counts_of_modules.append(np.sum(module_cells_mask))
                list_of_names_of_genes_of_modules.append(
                    sorted(adata_of_all_genes_of_all_cells.var_names[gene_indices_of_module]))

    if ut.logging_calc():
        ut.log_calc("final modules:")
        for module_index, (module_cells_count, module_gene_names) in enumerate(
                zip(cell_counts_of_modules, list_of_names_of_genes_of_modules)):
            ut.log_calc(
                f"- module: {module_index} cells: {module_cells_count} genes: {module_gene_names}"
            )

    return list_of_rare_gene_indices_of_modules
def _identify_cells(
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    related_gene_indices_of_modules: List[List[int]],
    min_cell_module_total: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    rare_module_of_cells: ut.NumpyVector,
) -> None:
    """
    Assign cells to rare-gene modules (in place, into ``rare_module_of_cells``).

    A cell is "strong" for a module if its total UMIs of the module's related
    genes reach ``min_cell_module_total``. Modules with too few or too many
    strong cells are discarded (their gene list is cleared). A cell that is
    strong for several modules is assigned to the one where its strength
    (relative to the module's median strong-cell total) is highest.
    """
    max_strength_of_cells = np.zeros(adata_of_all_genes_of_all_cells.n_obs)

    ut.log_calc("cells for modules:")
    modules_count = len(related_gene_indices_of_modules)
    for module_index, related_gene_indices_of_module in enumerate(
            related_gene_indices_of_modules):
        if len(related_gene_indices_of_module) == 0:
            continue
        with ut.log_step(
                "- module",
                module_index,
                formatter=lambda module_index: ut.progress_description(
                    modules_count, module_index, "module"),
        ):
            adata_of_related_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.related_genes",
                vars=related_gene_indices_of_module,
                top_level=False,
            )
            total_related_genes_of_all_cells = ut.get_o_numpy(
                adata_of_related_genes_of_all_cells, what, sum=True)
            mask_of_strong_cells_of_module = \
                total_related_genes_of_all_cells >= min_cell_module_total

            strong_cells_count = np.sum(mask_of_strong_cells_of_module)

            if strong_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "strong_cells",
                        ut.mask_description(mask_of_strong_cells_of_module) + " (too many)")
                related_gene_indices_of_module.clear()
                continue

            if strong_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "strong_cells",
                        ut.mask_description(mask_of_strong_cells_of_module) + " (too few)")
                related_gene_indices_of_module.clear()
                continue

            ut.log_calc("strong_cells", mask_of_strong_cells_of_module)

            # FIX: compute the median only after the count checks; previously it
            # was computed first, so an empty strong-cells mask produced a NaN
            # median and a RuntimeWarning for modules that were then discarded.
            median_strength_of_module = np.median(
                total_related_genes_of_all_cells[mask_of_strong_cells_of_module])

            strength_of_all_cells = \
                total_related_genes_of_all_cells / median_strength_of_module
            # Only claim cells for which this module is at least as strong as
            # any previously-seen module.
            mask_of_strong_cells_of_module &= \
                strength_of_all_cells >= max_strength_of_cells
            max_strength_of_cells[mask_of_strong_cells_of_module] = \
                strength_of_all_cells[mask_of_strong_cells_of_module]

            rare_module_of_cells[mask_of_strong_cells_of_module] = module_index
def _related_genes(  # pylint: disable=too-many-statements,too-many-branches
    *,
    adata_of_all_genes_of_all_cells: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    rare_gene_indices_of_modules: List[List[int]],
    allowed_genes_mask: ut.NumpyVector,
    min_genes_of_modules: int,
    min_gene_maximum: int,
    min_cells_of_modules: int,
    max_cells_of_modules: int,
    min_cell_module_total: int,
    min_related_gene_fold_factor: float,
    max_related_gene_increase_factor: float,
) -> List[List[int]]:
    """
    Expand each rare-gene module with additional "related" genes.

    A gene is related to a module if it is allowed, sufficiently expressed in
    the module's expressed cells (``min_gene_maximum``), and enriched in those
    cells by at least ``2**min_related_gene_fold_factor`` vs. the background.
    A related gene is rejected if it already belongs to another module, or if
    adding it would grow the module's strong-cells count by more than
    ``max_related_gene_increase_factor``. Returns the per-module gene index
    lists (modules with fewer than ``min_genes_of_modules`` genes are skipped).
    """
    total_all_cells_umis_of_all_genes = ut.get_v_numpy(
        adata_of_all_genes_of_all_cells, what, sum=True)

    ut.log_calc("genes for modules:")
    modules_count = 0
    related_gene_indices_of_modules: List[List[int]] = []

    # All genes claimed by any (large-enough) module.
    rare_gene_indices_of_any: Set[int] = set()
    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) >= min_genes_of_modules:
            rare_gene_indices_of_any.update(list(rare_gene_indices_of_module))

    for rare_gene_indices_of_module in rare_gene_indices_of_modules:
        if len(rare_gene_indices_of_module) < min_genes_of_modules:
            continue
        module_index = modules_count
        modules_count += 1

        with ut.log_step("- module", module_index):
            ut.log_calc(
                "rare_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[rare_gene_indices_of_module]))

            adata_of_module_genes_of_all_cells = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_gene",
                vars=rare_gene_indices_of_module,
                top_level=False,
            )
            total_module_genes_umis_of_all_cells = ut.get_o_numpy(
                adata_of_module_genes_of_all_cells, what, sum=True)
            mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0

            expressed_cells_count = np.sum(mask_of_expressed_cells)

            if expressed_cells_count > max_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) + " (too many)")
                continue

            if expressed_cells_count < min_cells_of_modules:
                if ut.logging_calc():
                    ut.log_calc(
                        "expressed_cells",
                        ut.mask_description(mask_of_expressed_cells) + " (too few)")
                continue

            ut.log_calc("expressed_cells", mask_of_expressed_cells)

            adata_of_all_genes_of_expressed_cells_of_module = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.rare_cell",
                obs=mask_of_expressed_cells,
                top_level=False,
            )
            total_expressed_cells_umis_of_all_genes = ut.get_v_numpy(
                adata_of_all_genes_of_expressed_cells_of_module, what, sum=True)

            data = ut.get_vo_proper(
                adata_of_all_genes_of_expressed_cells_of_module, what,
                layout="column_major")
            max_expressed_cells_umis_of_all_genes = ut.max_per(data, per="column")

            total_background_cells_umis_of_all_genes = (
                total_all_cells_umis_of_all_genes - total_expressed_cells_umis_of_all_genes)

            # np.sum (not builtin sum) - C-speed reduction over numpy vectors.
            expressed_cells_fraction_of_all_genes = (
                total_expressed_cells_umis_of_all_genes
                / np.sum(total_expressed_cells_umis_of_all_genes))
            background_cells_fraction_of_all_genes = (
                total_background_cells_umis_of_all_genes
                / np.sum(total_background_cells_umis_of_all_genes))

            mask_of_related_genes = (
                allowed_genes_mask
                & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum)
                & (expressed_cells_fraction_of_all_genes
                   >= background_cells_fraction_of_all_genes
                   * (2**min_related_gene_fold_factor)))

            related_gene_indices = np.where(mask_of_related_genes)[0]
            assert np.all(mask_of_related_genes[rare_gene_indices_of_module])

            base_genes_of_all_cells_adata = ut.slice(
                adata_of_all_genes_of_all_cells,
                name=f".module{module_index}.base",
                vars=rare_gene_indices_of_module)
            total_base_genes_of_all_cells = ut.get_o_numpy(
                base_genes_of_all_cells_adata, what, sum=True)
            mask_of_strong_base_cells = \
                total_base_genes_of_all_cells >= min_cell_module_total
            count_of_strong_base_cells = np.sum(mask_of_strong_base_cells)

            if ut.logging_calc():
                ut.log_calc(
                    "candidate_gene_names",
                    sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices]))
                ut.log_calc("base_strong_genes", count_of_strong_base_cells)

            related_gene_indices_of_module = list(rare_gene_indices_of_module)
            for gene_index in related_gene_indices:
                if gene_index in rare_gene_indices_of_module:
                    continue

                if gene_index in rare_gene_indices_of_any:
                    ut.log_calc(
                        f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                        f"belongs to another module")
                    continue

                # NOTE: a redundant re-check of module membership was removed
                # here - the first guard above already skips those genes.
                related_gene_of_all_cells_adata = ut.slice(
                    adata_of_all_genes_of_all_cells,
                    name=f".{adata_of_all_genes_of_all_cells.var_names[gene_index]}",
                    vars=np.array([gene_index]),
                )
                assert related_gene_of_all_cells_adata.n_vars == 1
                total_related_genes_of_all_cells = ut.get_o_numpy(
                    related_gene_of_all_cells_adata, what, sum=True)
                total_related_genes_of_all_cells += total_base_genes_of_all_cells
                mask_of_strong_related_cells = \
                    total_related_genes_of_all_cells >= min_cell_module_total
                count_of_strong_related_cells = np.sum(mask_of_strong_related_cells)
                ut.log_calc(
                    f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} "
                    f"strong cells: {count_of_strong_related_cells} "
                    f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}")
                # Reject genes that would inflate the module too much.
                if count_of_strong_related_cells > \
                        max_related_gene_increase_factor * count_of_strong_base_cells:
                    continue

                related_gene_indices_of_module.append(gene_index)

            related_gene_indices_of_modules.append(related_gene_indices_of_module)

    if ut.logging_calc():
        ut.log_calc("related genes for modules:")
        for module_index, related_gene_indices_of_module in enumerate(
                related_gene_indices_of_modules):
            ut.log_calc(
                f"- module {module_index} related_gene_names",
                sorted(adata_of_all_genes_of_all_cells.var_names[related_gene_indices_of_module]),
            )

    return related_gene_indices_of_modules
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the
    mask as an annotation (per-variable or per-observation depending on the type of the
    combined masks).

    **Computation Parameters**

    1. For each of the mask in ``masks``, fetch it. Silently ignore missing masks if the
       name has a ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the
       name has a ``|`` prefix (before the ``~`` prefix, if any), then bitwise-OR the
       mask into the OR mask, otherwise (or if it has a ``&`` prefix), bitwise-AND the
       mask into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None
    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        # Remember the decorated name for logging before stripping the markers.
        log_mask_name = mask_name

        # Strip the combination markers: "|" (OR), "&" (AND, the default),
        # then "~" (invert), then a trailing "?" (optional).
        is_or = mask_name[0] == "|"
        if is_or:
            mask_name = mask_name[1:]
        if mask_name[0] == "&":
            mask_name = mask_name[1:]

        invert_mask = mask_name[0] == "~"
        if invert_mask:
            mask_name = mask_name[1:]

        must_exist = mask_name[-1] != "?"
        if not must_exist:
            mask_name = mask_name[:-1]

        # Fetch the mask as a boolean vector, per-observation or per-variable.
        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        elif mask_per != per:
            raise ValueError("mixing per-observation and per-variable masks")

        if is_or:
            or_mask = mask if or_mask is None else (or_mask | mask)
        else:
            and_mask = mask if and_mask is None else (and_mask & mask)

    # Fold the AND and OR accumulators into a single combined mask.
    if and_mask is None and or_mask is None:
        raise ValueError("no masks to combine")
    if and_mask is None:
        combined_mask = or_mask
    elif or_mask is None:
        combined_mask = and_mask
    else:
        combined_mask = and_mask & or_mask

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)
    return None
def _collect_group_data(
    group_index: int,
    *,
    group_of_cells: ut.NumpyVector,
    cells_data: ut.ProperMatrix,
    compatible_size: Optional[int],
    downsample_min_samples: int,
    downsample_min_cell_quantile: float,
    downsample_max_cell_quantile: float,
    min_gene_total: int,
    random_seed: int,
    variance_per_gene_per_group: ut.NumpyMatrix,
    normalized_variance_per_gene_per_group: ut.NumpyMatrix,
) -> None:
    """
    Fill one row of the per-group variance matrices for ``group_index``.

    Optionally subsamples the group's cells to ``compatible_size``, downsamples
    each cell to a common number of samples, then writes the per-gene variance
    and normalized variance (NaN for genes below ``min_gene_total``) into the
    ``group_index`` row of the two output matrices. Groups of fewer than two
    cells are skipped.
    """
    member_indices = np.where(group_of_cells == group_index)[0]
    member_count = len(member_indices)
    if member_count < 2:
        return

    if compatible_size is None:
        ut.log_calc(" cells", member_count)
    else:
        assert 0 < compatible_size <= member_count
        if compatible_size < member_count:
            # Pick a reproducible random subset of the group's cells.
            np.random.seed(random_seed)
            if ut.logging_calc():
                ut.log_calc(" cells: " + ut.ratio_description(
                    len(member_indices), "cell", compatible_size, "compatible"))
            member_indices = np.random.choice(
                member_indices, size=compatible_size, replace=False)
            assert len(member_indices) == compatible_size

    assert ut.is_layout(cells_data, "row_major")
    group_data = cells_data[member_indices, :]

    # Choose a common per-cell sample count bounded by the cell-total quantiles.
    total_per_cell = ut.sum_per(group_data, per="row")
    low_total = np.quantile(total_per_cell, downsample_min_cell_quantile)
    high_total = np.quantile(total_per_cell, downsample_max_cell_quantile)
    samples = int(round(min(max(downsample_min_samples, low_total), high_total)))
    if ut.logging_calc():
        ut.log_calc(f" samples: {samples}")

    downsampled_data = ut.downsample_matrix(
        group_data, per="row", samples=samples, random_seed=random_seed)
    downsampled_data = ut.to_layout(downsampled_data, layout="column_major")

    total_per_gene = ut.sum_per(downsampled_data, per="column")
    too_small_genes = total_per_gene < min_gene_total
    if ut.logging_calc():
        included_genes_count = np.sum(~too_small_genes)
        ut.log_calc(f" included genes: {included_genes_count}")

    variance_per_gene = ut.variance_per(downsampled_data, per="column")
    normalized_variance_per_gene = ut.normalized_variance_per(
        downsampled_data, per="column")

    # Mark genes with too few UMIs as NaN so they are ignored downstream.
    variance_per_gene[too_small_genes] = np.nan
    normalized_variance_per_gene[too_small_genes] = np.nan

    variance_per_gene_per_group[group_index, :] = variance_per_gene
    normalized_variance_per_gene_per_group[group_index, :] = normalized_variance_per_gene