def _pick_candidates( *, adata_of_all_genes_of_all_cells: AnnData, what: Union[str, ut.Matrix] = "__x__", max_gene_cell_fraction: float, min_gene_maximum: int, min_genes_of_modules: int, allowed_genes_mask: ut.NumpyVector, ) -> Optional[Tuple[AnnData, ut.NumpyVector]]: data = ut.get_vo_proper(adata_of_all_genes_of_all_cells, what, layout="column_major") nnz_cells_of_genes = ut.nnz_per(data, per="column") nnz_cell_fraction_of_genes = nnz_cells_of_genes / adata_of_all_genes_of_all_cells.n_obs nnz_cell_fraction_mask_of_genes = nnz_cell_fraction_of_genes <= max_gene_cell_fraction max_umis_of_genes = ut.max_per(data, per="column") max_umis_mask_of_genes = max_umis_of_genes >= min_gene_maximum candidates_mask_of_genes = max_umis_mask_of_genes & nnz_cell_fraction_mask_of_genes & allowed_genes_mask ut.log_calc("candidate_genes", candidates_mask_of_genes) candidate_genes_indices = np.where(candidates_mask_of_genes)[0] candidate_genes_count = candidate_genes_indices.size if candidate_genes_count < min_genes_of_modules: return None candidate_data = ut.slice(adata_of_all_genes_of_all_cells, name=".candidate_genes", vars=candidate_genes_indices, top_level=False) return candidate_data, candidate_genes_indices
def split_group(group_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]: group_cells_mask = group_of_cells == group_index assert np.any(group_cells_mask) name = f".{group}-{group_index}/{groups_count}" gdata = ut.slice(adata, name=name, top_level=False, obs=group_cells_mask, track_obs="complete_cell_index") target_metacell_size = (gdata.n_obs + 1) // 2 compute_direct_metacells( gdata, what, feature_downsample_min_samples=feature_downsample_min_samples, feature_downsample_min_cell_quantile= feature_downsample_min_cell_quantile, feature_downsample_max_cell_quantile= feature_downsample_max_cell_quantile, feature_min_gene_total=feature_min_gene_total, feature_min_gene_top3=feature_min_gene_top3, feature_min_gene_relative_variance= feature_min_gene_relative_variance, forbidden_gene_names=forbidden_gene_names, forbidden_gene_patterns=forbidden_gene_patterns, cells_similarity_value_normalization= cells_similarity_value_normalization, cells_similarity_log_data=cells_similarity_log_data, cells_similarity_method=cells_similarity_method, target_metacell_size=target_metacell_size, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=None, knn_k=target_metacell_size, min_knn_k=target_metacell_size, knn_balanced_ranks_factor=knn_balanced_ranks_factor, knn_incoming_degree_factor=knn_incoming_degree_factor, knn_outgoing_degree_factor=knn_outgoing_degree_factor, min_seed_size_quantile=min_seed_size_quantile, max_seed_size_quantile=max_seed_size_quantile, candidates_cooldown_pass=candidates_cooldown_pass, candidates_cooldown_node=candidates_cooldown_node, candidates_min_split_size_factor=None, candidates_max_merge_size_factor=None, candidates_min_metacell_cells=1, must_complete_cover=True, random_seed=random_seed, ) direct_groups = ut.get_o_numpy(gdata, "metacell") zero_count = np.sum(direct_groups == 0) one_count = np.sum(direct_groups == 1) ut.log_calc(f"group: {group_index} size: {len(direct_groups)} " f"split into: {zero_count} + {one_count}") assert zero_count + one_count == len(direct_groups) assert zero_count > 0 assert one_count > 0 return (group_cells_mask, group_index + groups_count * direct_groups)
def renormalize_query_by_atlas( # pylint: disable=too-many-statements,too-many-branches what: str = "__x__", *, adata: AnnData, qdata: AnnData, var_annotations: Dict[str, Any], layers: Dict[str, Any], varp_annotations: Dict[str, Any], ) -> Optional[AnnData]: """ Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes. This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressive filtered a significant amount of genes. **Input** Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where ``X`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional variable (gene) called ``ATLASNORM`` to the query data, such that the total number of UMIs for each query metacells is as expected given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the query and the atlas have exactly the same list of genes, or if if the query already contains a high number of genes missing from the atlas so that the total number of UMIs for the query metacells is already at least the expected based on the common genes. **Computation Parameters** 1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs) would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop. 2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene) observation, add the value specified in ``var_annotations``, whose list of keys must cover the set of per-variable annotations in the query data. For each per-observation-per-variable layer, add the value specified in ``layers``, whose list of keys must cover the existing layers. For each per-variable-per-variable annotation, add the value specified in ``varp_annotations``. """ for name in qdata.var.keys(): if "|" not in name and name not in var_annotations.keys(): raise RuntimeError(f"missing default value for variable annotation {name}") for name in qdata.layers.keys(): if name not in layers.keys(): raise RuntimeError(f"missing default value for layer {name}") for name in qdata.varp.keys(): if name not in varp_annotations.keys(): raise RuntimeError(f"missing default value for variable-variable {name}") if list(qdata.var_names) == list(adata.var_names): return None query_genes_list = list(qdata.var_names) atlas_genes_list = list(adata.var_names) common_genes_list = list(sorted(set(qdata.var_names) & set(adata.var_names))) query_gene_indices = np.array([query_genes_list.index(gene) for gene in common_genes_list]) atlas_gene_indices = np.array([atlas_genes_list.index(gene) for gene in common_genes_list]) common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index") common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index") assert list(common_qdata.var_names) == list(common_adata.var_names) atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True) atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True) atlas_total_umis = np.sum(atlas_total_umis_per_metacell) atlas_common_umis = np.sum(atlas_common_umis_per_metacell) atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0 ut.log_calc("atlas_total_umis", atlas_total_umis) ut.log_calc("atlas_common_umis", atlas_common_umis) ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction) query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True) query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True) query_total_umis = np.sum(query_total_umis_per_metacell) query_common_umis = np.sum(query_common_umis_per_metacell) query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0 ut.log_calc("query_total_umis", query_total_umis) ut.log_calc("query_common_umis", query_common_umis) ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction) if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction: return None query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction) query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction _proper, dense, compressed = ut.to_proper_matrices(qdata.X) if dense is None: assert compressed is not None dense = ut.to_numpy_matrix(compressed) added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1) if compressed is not None: added = sp.csr_matrix(added) assert added.shape[0] == qdata.shape[0] assert added.shape[1] == qdata.shape[1] + 1 ndata = AnnData(added) ndata.obs_names = qdata.obs_names var_names = list(qdata.var_names) var_names.append("ATLASNORM") ndata.var_names = var_names for name, value in qdata.uns.items(): ut.set_m_data(ndata, name, value) for name, value in qdata.obs.items(): ut.set_o_data(ndata, name, value) for name, value in qdata.obsp.items(): ut.set_oo_data(ndata, name, value) for name in qdata.var.keys(): if "|" in name: continue value = ut.get_v_numpy(qdata, name) value = np.append(value, [var_annotations[name]]) ut.set_v_data(ndata, name, value) for name in qdata.layers.keys(): data = ut.get_vo_proper(qdata, name) _proper, dense, compressed = ut.to_proper_matrices(data) if dense is None: assert compressed is not None dense = ut.to_numpy_matrix(compressed) values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype) added = np.concatenate([dense, values[:, np.newaxis]], axis=1) if compressed is not None: added = sp.csr_matrix(added) ut.set_vo_data(ndata, name, added) for name in qdata.varp.keys(): data = ut.get_vv_proper(qdata, name) _proper, dense, compressed = ut.to_proper_matrices(data) if dense is None: assert compressed is not None dense = ut.to_numpy_matrix(compressed) values = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype) added = np.concatenate([dense, values[:, np.newaxis]], axis=1) values = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype) added = np.concatenate([added, values[:, np.newaxis]], axis=0) if compressed is not None: added = sp.csr_matrix(added) ut.set_vv_data(ndata, name, added) return ndata
def relate_genes( adata: AnnData, what: Union[str, ut.Matrix] = "__x__", *, max_sampled_cells: int = pr.related_max_sampled_cells, downsample_min_samples: float = pr.related_downsample_min_samples, downsample_min_cell_quantile: float = pr. related_downsample_min_cell_quantile, downsample_max_cell_quantile: float = pr. related_downsample_max_cell_quantile, min_gene_relative_variance: float = pr.related_min_gene_relative_variance, min_gene_total: int = pr.related_min_gene_total, min_gene_top3: int = pr.related_min_gene_top3, forbidden_gene_names: Optional[Collection[str]] = None, forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None, genes_similarity_method: str = pr.related_genes_similarity_method, genes_cluster_method: str = pr.related_genes_cluster_method, min_genes_of_modules: int = pr.related_min_genes_of_modules, random_seed: int = 0, ) -> None: """ Detect coarse relations between genes based on ``what`` (default: {what}) data. This is a quick-and-dirty way to group genes together and shouldn't only be used as a starting point for more precise forms of gene relationship analysis. **Input** Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** Variable-pair (Gene) Annotations ``related_genes_similarity`` The similarity between each two related genes. Variable (Gene) Annotations ``related_genes_module`` The index of the gene module for each gene. **Computation Parameters** 1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number of random cells from the data using the ``random_seed``. 2. Pick candidate genes using :py:func:`metacells.pipeline.feature.extract_feature_data`. 3. Compute the similarity between the feature genes using :py:func:`metacells.tools.similarity.compute_var_var_similarity` using the ``genes_similarity_method`` (default: {genes_similarity_method}). 4. Create a hierarchical clustering of the candidate genes using the ``genes_cluster_method`` (default: {genes_cluster_method}). 5. Identify gene modules in the hierarchical clustering which contain at least ``min_genes_of_modules`` genes. """ if max_sampled_cells < adata.n_obs: np.random.seed(random_seed) cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False) sdata = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False) else: sdata = ut.copy_adata(adata, top_level=False) fdata = extract_feature_data( sdata, what, top_level=False, downsample_min_samples=downsample_min_samples, downsample_min_cell_quantile=downsample_min_cell_quantile, downsample_max_cell_quantile=downsample_max_cell_quantile, min_gene_relative_variance=min_gene_relative_variance, min_gene_total=min_gene_total, min_gene_top3=min_gene_top3, forbidden_gene_names=forbidden_gene_names, forbidden_gene_patterns=forbidden_gene_patterns, random_seed=random_seed, ) assert fdata is not None frame = tl.compute_var_var_similarity(fdata, what, method=genes_similarity_method, reproducible=(random_seed != 0), inplace=False) assert frame is not None similarity = ut.to_layout(ut.to_numpy_matrix(frame), layout="row_major") linkage = _cluster_genes(similarity, genes_cluster_method) clusters = _linkage_to_clusters(linkage, min_genes_of_modules, fdata.n_vars) cluster_of_genes = pd.Series(np.full(adata.n_vars, -1, dtype="int32"), index=adata.var_names) for cluster_index, gene_indices in enumerate(clusters): cluster_of_genes[fdata.var_names[gene_indices]] = cluster_index ut.set_v_data(adata, "related_genes_module", cluster_of_genes, formatter=ut.groups_description) feature_gene_indices = ut.get_v_numpy(fdata, "full_gene_index") data = similarity.flatten(order="C") rows = np.repeat(feature_gene_indices, len(feature_gene_indices)) cols = np.tile(feature_gene_indices, len(feature_gene_indices)) full_similarity = sp.csr_matrix((data, (rows, cols)), shape=(adata.n_vars, adata.n_vars)) ut.set_vv_data(adata, "related_genes_similarity", full_similarity)
def find_noisy_lonely_genes( # pylint: disable=too-many-statements adata: AnnData, what: Union[str, ut.Matrix] = "__x__", *, excluded_genes_mask: Optional[str] = None, max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells, downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples, downsample_min_cell_quantile: float = pr. noisy_lonely_downsample_max_cell_quantile, downsample_max_cell_quantile: float = pr. noisy_lonely_downsample_min_cell_quantile, min_gene_total: int = pr.noisy_lonely_min_gene_total, min_gene_normalized_variance: float = pr. noisy_lonely_min_gene_normalized_variance, max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity, random_seed: int = pr.random_seed, inplace: bool = True, ) -> Optional[ut.PandasSeries]: """ Detect "noisy lonely" genes based on ``what`` (default: {what}) data. Return the indices of genes which are "noisy" (have high variance compared to their mean) and also "lonely" (have low correlation with all other genes). Such genes should be excluded since they will never meaningfully help us compute groups, and will actively cause profiles to be considered "deviants". Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such algorithms try to group together cells with the same overall biological state. Since the genes are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper this, because they make cells which are otherwise similar appear different (just for this lonely gene). It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes, and exclude them from the rest of the analysis. **Input** Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** Variable (Gene) Annotations ``noisy_lonely_genes`` A boolean mask indicating whether each gene was found to be a "noisy lonely" gene. If ``inplace`` (default: {inplace}), this is written to the data, and the function returns ``None``. Otherwise this is returned as a pandas series (indexed by the variable names). **Computation Parameters** 1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number of random cells from the data using the ``random_seed``. 2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene) annotation containing a mask of excluded genes. Get rid of all these excluded genes. 3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the same total number of UMIs, using the ``downsample_min_samples`` (default: {downsample_min_samples}), ``downsample_min_cell_quantile`` (default: {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default: {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}). 4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default: {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance`` (default: ``min_gene_normalized_variance``). 5. Cross-correlate the noisy genes. 6. Find the noisy "lonely" genes whose maximal correlation is at most ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes. """ if max_sampled_cells < adata.n_obs: np.random.seed(random_seed) cell_indices = np.random.choice(np.arange(adata.n_obs), size=max_sampled_cells, replace=False) s_data = ut.slice(adata, obs=cell_indices, name=".sampled", top_level=False) else: s_data = ut.copy_adata(adata, top_level=False) track_var: Optional[str] = "sampled_gene_index" if excluded_genes_mask is not None: results = filter_data(s_data, name="included", top_level=False, track_var=track_var, var_masks=[f"~{excluded_genes_mask}"]) track_var = None assert results is not None i_data = results[0] assert i_data is not None else: i_data = s_data downsample_cells( i_data, what, downsample_min_samples=downsample_min_samples, downsample_min_cell_quantile=downsample_min_cell_quantile, downsample_max_cell_quantile=downsample_max_cell_quantile, random_seed=random_seed, ) find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total) results = filter_data(i_data, name="high_total", top_level=False, track_var=track_var, var_masks=["high_total_gene"]) track_var = None assert results is not None ht_data = results[0] noisy_lonely_genes_mask = np.full(adata.n_vars, False) if ht_data is not None: ht_genes_count = ht_data.shape[1] ht_gene_ht_gene_similarity_frame = compute_var_var_similarity( ht_data, "downsampled", inplace=False, reproducible=(random_seed != 0)) assert ht_gene_ht_gene_similarity_frame is not None ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix( ht_gene_ht_gene_similarity_frame, only_extract=True) ht_gene_ht_gene_similarity_matrix = ut.to_layout( ht_gene_ht_gene_similarity_matrix, layout="row_major", symmetric=True) np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1) htv_mask_series = find_high_normalized_variance_genes( ht_data, "downsampled", min_gene_normalized_variance=min_gene_normalized_variance, inplace=False) assert htv_mask_series is not None htv_mask = ut.to_numpy_vector(htv_mask_series) htv_genes_count = np.sum(htv_mask) assert htv_genes_count <= ht_genes_count if htv_genes_count > 0: htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[ htv_mask, :] assert ut.is_layout(htv_gene_ht_gene_similarity_matrix, "row_major") assert htv_gene_ht_gene_similarity_matrix.shape == ( htv_genes_count, ht_genes_count) max_similarity_of_htv_genes = ut.max_per( htv_gene_ht_gene_similarity_matrix, per="row") htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity htvl_genes_count = np.sum(htvl_mask) ut.log_calc("noisy_lonely_genes_count", htvl_genes_count) if htvl_genes_count > 0: base_index_of_ht_genes = ut.get_v_numpy( ht_data, "sampled_gene_index") base_index_of_htv_genes = base_index_of_ht_genes[htv_mask] base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask] noisy_lonely_genes_mask[base_index_of_htvl_genes] = True htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[ htvl_mask, :] htvl_gene_ht_gene_similarity_matrix = ut.to_layout( htvl_gene_ht_gene_similarity_matrix, layout="row_major") assert htvl_gene_ht_gene_similarity_matrix.shape == ( htvl_genes_count, ht_genes_count) if ut.logging_calc(): i_gene_totals = ut.get_v_numpy(i_data, "downsampled", sum=True) ht_mask = ut.get_v_numpy(i_data, "high_total_gene") i_total = np.sum(i_gene_totals) htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][ htvl_mask] top_similarity_of_htvl_genes = ut.top_per( htvl_gene_ht_gene_similarity_matrix, 10, per="row") for htvl_index, gene_index in enumerate( base_index_of_htvl_genes): gene_name = adata.var_names[gene_index] gene_total = htvl_gene_totals[htvl_index] gene_percent = 100 * gene_total / i_total similar_ht_values = ut.to_numpy_vector( top_similarity_of_htvl_genes[htvl_index, :]) # assert len(similar_ht_values) == ht_genes_count top_similar_ht_mask = similar_ht_values > 0 top_similar_ht_values = similar_ht_values[ top_similar_ht_mask] top_similar_ht_indices = base_index_of_ht_genes[ top_similar_ht_mask] top_similar_ht_names = adata.var_names[ top_similar_ht_indices] ut.log_calc( f"- {gene_name}", f"total downsampled UMIs: {gene_total} " + f"({gene_percent:.4g}%), correlated with: " + ", ".join([ f"{similar_gene_name}: {similar_gene_value:.4g}" for similar_gene_value, similar_gene_name in reversed( sorted( zip(top_similar_ht_values, top_similar_ht_names))) ]), ) if ut.logging_calc(): ut.log_calc("noisy_lonely_gene_names", sorted(list(adata.var_names[noisy_lonely_genes_mask]))) if inplace: ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask) return None ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask) return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
def _identify_cells( *, adata_of_all_genes_of_all_cells: AnnData, what: Union[str, ut.Matrix] = "__x__", related_gene_indices_of_modules: List[List[int]], min_cell_module_total: int, min_cells_of_modules: int, max_cells_of_modules: int, rare_module_of_cells: ut.NumpyVector, ) -> None: max_strength_of_cells = np.zeros(adata_of_all_genes_of_all_cells.n_obs) ut.log_calc("cells for modules:") modules_count = len(related_gene_indices_of_modules) for module_index, related_gene_indices_of_module in enumerate( related_gene_indices_of_modules): if len(related_gene_indices_of_module) == 0: continue with ut.log_step( "- module", module_index, formatter=lambda module_index: ut.progress_description( modules_count, module_index, "module"), ): adata_of_related_genes_of_all_cells = ut.slice( adata_of_all_genes_of_all_cells, name=f".module{module_index}.related_genes", vars=related_gene_indices_of_module, top_level=False, ) total_related_genes_of_all_cells = ut.get_o_numpy( adata_of_related_genes_of_all_cells, what, sum=True) mask_of_strong_cells_of_module = total_related_genes_of_all_cells >= min_cell_module_total median_strength_of_module = np.median( total_related_genes_of_all_cells[ mask_of_strong_cells_of_module]) # strong_cells_count = np.sum(mask_of_strong_cells_of_module) if strong_cells_count > max_cells_of_modules: if ut.logging_calc(): ut.log_calc( "strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too many)") # related_gene_indices_of_module.clear() continue if strong_cells_count < min_cells_of_modules: if ut.logging_calc(): ut.log_calc( "strong_cells", ut.mask_description(mask_of_strong_cells_of_module) + " (too few)") # related_gene_indices_of_module.clear() continue ut.log_calc("strong_cells", mask_of_strong_cells_of_module) strength_of_all_cells = total_related_genes_of_all_cells / median_strength_of_module mask_of_strong_cells_of_module &= strength_of_all_cells >= max_strength_of_cells max_strength_of_cells[ mask_of_strong_cells_of_module] = strength_of_all_cells[ mask_of_strong_cells_of_module] rare_module_of_cells[mask_of_strong_cells_of_module] = module_index
def _related_genes( # pylint: disable=too-many-statements,too-many-branches *, adata_of_all_genes_of_all_cells: AnnData, what: Union[str, ut.Matrix] = "__x__", rare_gene_indices_of_modules: List[List[int]], allowed_genes_mask: ut.NumpyVector, min_genes_of_modules: int, min_gene_maximum: int, min_cells_of_modules: int, max_cells_of_modules: int, min_cell_module_total: int, min_related_gene_fold_factor: float, max_related_gene_increase_factor: float, ) -> List[List[int]]: total_all_cells_umis_of_all_genes = ut.get_v_numpy( adata_of_all_genes_of_all_cells, what, sum=True) ut.log_calc("genes for modules:") modules_count = 0 related_gene_indices_of_modules: List[List[int]] = [] rare_gene_indices_of_any: Set[int] = set() for rare_gene_indices_of_module in rare_gene_indices_of_modules: if len(rare_gene_indices_of_module) >= min_genes_of_modules: rare_gene_indices_of_any.update(list(rare_gene_indices_of_module)) for rare_gene_indices_of_module in rare_gene_indices_of_modules: if len(rare_gene_indices_of_module) < min_genes_of_modules: continue module_index = modules_count modules_count += 1 with ut.log_step("- module", module_index): ut.log_calc( "rare_gene_names", sorted(adata_of_all_genes_of_all_cells. var_names[rare_gene_indices_of_module])) adata_of_module_genes_of_all_cells = ut.slice( adata_of_all_genes_of_all_cells, name=f".module{module_index}.rare_gene", vars=rare_gene_indices_of_module, top_level=False, ) total_module_genes_umis_of_all_cells = ut.get_o_numpy( adata_of_module_genes_of_all_cells, what, sum=True) mask_of_expressed_cells = total_module_genes_umis_of_all_cells > 0 expressed_cells_count = np.sum(mask_of_expressed_cells) if expressed_cells_count > max_cells_of_modules: if ut.logging_calc(): ut.log_calc( "expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too many)") continue if expressed_cells_count < min_cells_of_modules: if ut.logging_calc(): ut.log_calc( "expressed_cells", ut.mask_description(mask_of_expressed_cells) + " (too few)") continue ut.log_calc("expressed_cells", mask_of_expressed_cells) adata_of_all_genes_of_expressed_cells_of_module = ut.slice( adata_of_all_genes_of_all_cells, name=f".module{module_index}.rare_cell", obs=mask_of_expressed_cells, top_level=False, ) total_expressed_cells_umis_of_all_genes = ut.get_v_numpy( adata_of_all_genes_of_expressed_cells_of_module, what, sum=True) data = ut.get_vo_proper( adata_of_all_genes_of_expressed_cells_of_module, what, layout="column_major") max_expressed_cells_umis_of_all_genes = ut.max_per(data, per="column") total_background_cells_umis_of_all_genes = ( total_all_cells_umis_of_all_genes - total_expressed_cells_umis_of_all_genes) expressed_cells_fraction_of_all_genes = total_expressed_cells_umis_of_all_genes / sum( total_expressed_cells_umis_of_all_genes) background_cells_fraction_of_all_genes = total_background_cells_umis_of_all_genes / sum( total_background_cells_umis_of_all_genes) mask_of_related_genes = ( allowed_genes_mask & (max_expressed_cells_umis_of_all_genes >= min_gene_maximum) & (expressed_cells_fraction_of_all_genes >= background_cells_fraction_of_all_genes * (2**min_related_gene_fold_factor))) related_gene_indices = np.where(mask_of_related_genes)[0] assert np.all(mask_of_related_genes[rare_gene_indices_of_module]) base_genes_of_all_cells_adata = ut.slice( adata_of_all_genes_of_all_cells, name=f".module{module_index}.base", vars=rare_gene_indices_of_module) total_base_genes_of_all_cells = ut.get_o_numpy( base_genes_of_all_cells_adata, what, sum=True) mask_of_strong_base_cells = total_base_genes_of_all_cells >= min_cell_module_total count_of_strong_base_cells = np.sum(mask_of_strong_base_cells) if ut.logging_calc(): ut.log_calc( "candidate_gene_names", sorted(adata_of_all_genes_of_all_cells. var_names[related_gene_indices])) ut.log_calc("base_strong_genes", count_of_strong_base_cells) related_gene_indices_of_module = list(rare_gene_indices_of_module) for gene_index in related_gene_indices: if gene_index in rare_gene_indices_of_module: continue if gene_index in rare_gene_indices_of_any: ut.log_calc( f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} " f"belongs to another module") continue if gene_index not in rare_gene_indices_of_module: related_gene_of_all_cells_adata = ut.slice( adata_of_all_genes_of_all_cells, name= f".{adata_of_all_genes_of_all_cells.var_names[gene_index]}", vars=np.array([gene_index]), ) assert related_gene_of_all_cells_adata.n_vars == 1 total_related_genes_of_all_cells = ut.get_o_numpy( related_gene_of_all_cells_adata, what, sum=True) total_related_genes_of_all_cells += total_base_genes_of_all_cells mask_of_strong_related_cells = total_related_genes_of_all_cells >= min_cell_module_total count_of_strong_related_cells = np.sum( mask_of_strong_related_cells) ut.log_calc( f"- candidate gene {adata_of_all_genes_of_all_cells.var_names[gene_index]} " f"strong cells: {count_of_strong_related_cells} " f"factor: {count_of_strong_related_cells / count_of_strong_base_cells}" ) if count_of_strong_related_cells > max_related_gene_increase_factor * count_of_strong_base_cells: continue related_gene_indices_of_module.append(gene_index) related_gene_indices_of_modules.append( related_gene_indices_of_module) # if ut.logging_calc(): ut.log_calc("related genes for modules:") for module_index, related_gene_indices_of_module in enumerate( related_gene_indices_of_modules): ut.log_calc( f"- module {module_index} related_gene_names", sorted(adata_of_all_genes_of_all_cells. var_names[related_gene_indices_of_module]), ) return related_gene_indices_of_modules
def compute_knn_by_features( adata: AnnData, what: Union[str, ut.Matrix] = "__x__", *, max_top_feature_genes: int = pr.max_top_feature_genes, similarity_value_normalization: float = pr. umap_similarity_value_normalization, similarity_log_data: bool = pr.umap_similarity_log_data, similarity_method: str = pr.umap_similarity_method, logistics_location: float = pr.logistics_location, logistics_slope: float = pr.logistics_slope, k: int, balanced_ranks_factor: float = pr.knn_balanced_ranks_factor, incoming_degree_factor: float = pr.knn_incoming_degree_factor, outgoing_degree_factor: float = pr.knn_outgoing_degree_factor, reproducible: bool = pr.reproducible, ) -> ut.PandasFrame: """ Compute KNN graph between metacells based on feature genes. If ``reproducible`` (default: {reproducible}) is ``True``, a slower (still parallel) but reproducible algorithm will be used to compute pearson correlations. **Input** Annotated ``adata`` where each observation is a metacells and the variables are genes, are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. **Returns** Sets the following in ``adata``: Observations-Pair (Metacells) Annotations ``obs_outgoing_weights`` A sparse square matrix where each non-zero entry is the weight of an edge between a pair of cells or genes, where the sum of the weights of the outgoing edges for each element is 1 (there is always at least one such edge). Also return a pandas data frame of the similarities between the observations (metacells). **Computation Parameters** 1. Invoke :py:func:`metacells.tools.high.find_top_feature_genes` using ``max_top_feature_genes`` (default: {max_top_feature_genes}) to pick the feature genes to use to compute similarities between the metacells. 2. Compute the fractions of each gene in each cell, and add the ``similarity_value_normalization`` (default: {similarity_value_normalization}) to it. 3. If ``similarity_log_data`` (default: {similarity_log_data}), invoke the :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of the data. 4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` using ``similarity_method`` (default: {similarity_method}), ``logistics_location`` (default: {logistics_slope}) and ``logistics_slope`` (default: {logistics_slope}) and convert this to distances. 5. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` using the distances, ``k`` (no default!), ``balanced_ranks_factor`` (default: {balanced_ranks_factor}), ``incoming_degree_factor`` (default: {incoming_degree_factor}), ``outgoing_degree_factor`` (default: {outgoing_degree_factor}) to compute a "skeleton" graph to overlay on top of the UMAP graph. """ tl.find_top_feature_genes(adata, max_genes=max_top_feature_genes) all_data = ut.get_vo_proper(adata, what, layout="row_major") all_fractions = ut.fraction_by(all_data, by="row") top_feature_genes_mask = ut.get_v_numpy(adata, "top_feature_gene") top_feature_genes_fractions = all_fractions[:, top_feature_genes_mask] top_feature_genes_fractions = ut.to_layout(top_feature_genes_fractions, layout="row_major") top_feature_genes_fractions = ut.to_numpy_matrix( top_feature_genes_fractions) top_feature_genes_fractions += similarity_value_normalization if similarity_log_data: top_feature_genes_fractions = ut.log_data(top_feature_genes_fractions, base=2) tdata = ut.slice(adata, vars=top_feature_genes_mask) similarities = tl.compute_obs_obs_similarity( tdata, top_feature_genes_fractions, method=similarity_method, reproducible=reproducible, logistics_location=logistics_location, logistics_slope=logistics_slope, inplace=False, ) assert similarities is not None tl.compute_obs_obs_knn_graph( adata, similarities, k=k, balanced_ranks_factor=balanced_ranks_factor, incoming_degree_factor=incoming_degree_factor, outgoing_degree_factor=outgoing_degree_factor, ) return similarities
def compute_outliers_matches( what: Union[str, ut.Matrix] = "__x__", *, adata: AnnData, gdata: AnnData, group: Union[str, ut.Vector] = "metacell", similar: str = "similar", value_normalization: float = pr.outliers_value_normalization, reproducible: bool, ) -> None: """ Given an assignment of observations (cells) to groups (metacells), compute for each outlier the "most similar" group. **Input** Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a matrix. In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``. **Returns** Sets the following in ``adata``: Per-Observation (Cell) Annotations ``similar`` (default: {similar}) For each observation (cell), the index of the "most similar" group. **Computation Parameters** 1. Compute the log2 of the fraction of each gene in each of the outlier cells and the group metacells using the ``value_normalization`` (default: {value_normalization}). 2. Cross-correlate each of the outlier cells with each of the group metacells, in a ``reproducible`` manner. """ group_of_cells = ut.get_o_numpy(adata, group) outliers_mask = group_of_cells < 0 odata = ut.slice(adata, obs=outliers_mask) outliers_data = ut.get_vo_proper(odata, what, layout="row_major") groups_data = ut.get_vo_proper(gdata, what, layout="row_major") outliers_fractions = ut.fraction_by(outliers_data, by="row") groups_fractions = ut.fraction_by(groups_data, by="row") outliers_fractions = ut.to_numpy_matrix(outliers_fractions) groups_fractions = ut.to_numpy_matrix(groups_fractions) outliers_fractions += value_normalization groups_fractions += value_normalization outliers_log_fractions = np.log2(outliers_fractions, out=outliers_fractions) groups_log_fractions = np.log2(groups_fractions, out=groups_fractions) outliers_groups_correlation = ut.cross_corrcoef_rows( outliers_log_fractions, groups_log_fractions, reproducible=reproducible) outliers_similar_group_indices = np.argmax(outliers_groups_correlation, axis=1) assert len(outliers_similar_group_indices) == odata.n_obs cells_similar_group_indices = np.full(adata.n_obs, -1, dtype="int32") cells_similar_group_indices[outliers_mask] = outliers_similar_group_indices ut.set_o_data(adata, similar, cells_similar_group_indices)
def filter_data( # pylint: disable=dangerous-default-value adata: AnnData, obs_masks: List[str] = [], var_masks: List[str] = [], *, mask_obs: Optional[str] = None, mask_var: Optional[str] = None, invert_obs: bool = False, invert_var: bool = False, track_obs: Optional[str] = None, track_var: Optional[str] = None, name: Optional[str] = None, top_level: bool = True, ) -> Optional[Tuple[AnnData, ut.PandasSeries, ut.PandasSeries]]: """ Filter (slice) the data based on previously-computed masks. For example, it is useful to discard cell-cycle genes, cells which have too few UMIs for meaningful analysis, etc. In general, the "best" filter depends on the data set. This function makes it easy to combine different pre-computed per-observation (cell) and per-variable (gene) boolean mask annotations into a final overall inclusion mask, and slice the data accordingly, while tracking the base index of the cells and genes in the filtered data. **Input** Annotated ``adata``, where the observations are cells and the variables are genes. **Returns** An annotated data containing a subset of the observations (cells) and variables (genes). If no observations and/or no variables were selected by the filter, returns ``None``. If ``name`` is not specified, the returned data will be unnamed. Otherwise, if the name starts with a ``.``, it will be appended to the current name (if any). Otherwise, ``name`` is the new name. If ``mask_obs`` and/or ``mask_var`` are specified, store the mask of the selected data as a per-observation and/or per-variable annotation of the full ``adata``. If ``track_obs`` and/or ``track_var`` are specified, store the original indices of the selected data as a per-observation and/or per-variable annotation of the result data. **Computation Parameters** 1. Combine the masks in ``obs_masks`` and/or ``var_masks`` using :py:func:`metacells.tools.mask.combine_masks` passing it ``invert_obs`` and ``invert_var``, and ``mask_obs`` and ``mask_var`` as the ``to`` parameter. If either list of masks is empty, use the full mask. 2. If the obtained masks for either the observations or variables is empty, return ``None``. Otherwise, return a slice of the full data containing just the observations and variables specified by the final masks. """ if len(obs_masks) == 0: obs_mask = np.full(adata.n_obs, True, dtype="bool") if mask_obs is not None: ut.set_o_data(adata, mask_obs, obs_mask) else: mask = combine_masks(adata, obs_masks, invert=invert_obs, to=mask_obs) if mask is None: assert mask_obs is not None obs_mask = ut.get_o_numpy( adata, mask_obs, formatter=ut.mask_description) > 0 else: obs_mask = ut.to_numpy_vector(mask, only_extract=True) > 0 if len(var_masks) == 0: var_mask = np.full(adata.n_vars, True, dtype="bool") if mask_var is not None: ut.set_o_data(adata, mask_var, var_mask) else: mask = combine_masks(adata, var_masks, invert=invert_var, to=mask_var) if mask is None: assert mask_var is not None var_mask = ut.get_v_numpy( adata, mask_var, formatter=ut.mask_description) > 0 else: var_mask = ut.to_numpy_vector(mask, only_extract=True) > 0 if not np.any(obs_mask) or not np.any(var_mask): return None fdata = ut.slice(adata, name=name, top_level=top_level, obs=obs_mask, vars=var_mask, track_obs=track_obs, track_var=track_var) return ( fdata, ut.to_pandas_series(obs_mask, index=adata.obs_names), ut.to_pandas_series(var_mask, index=adata.var_names), )