def _fold_ranks(
    *,
    cells_count: int,
    fold_factors: ut.CompressedMatrix,
    deviant_gene_indices: ut.NumpyVector,
) -> ut.NumpyMatrix:
    """
    Rank, for each deviant gene, the cells that have a stored fold factor for it.

    ``fold_factors`` must be in CSC format, so that the stored entries of each gene (column) are
    contiguous. The result is a column-major dense matrix with ``cells_count`` rows and one column per
    entry of ``deviant_gene_indices``.

    Within a column, the stored fold factors are ranked with ``method="min"`` and the ranks are then
    reversed, so larger fold factors receive smaller rank values. Cells without a stored entry for the
    gene keep the fill value ``cells_count``.
    """
    assert fold_factors.getformat() == "csc"

    columns_count = deviant_gene_indices.size
    ut.timed_parameters(cells=cells_count, deviant_genes=columns_count)

    # Every entry starts at ``cells_count``; only cells with a stored fold factor are overwritten.
    ranks_matrix = np.full((cells_count, columns_count), cells_count, order="F")
    assert ut.is_layout(ranks_matrix, "column_major")

    for column, gene_index in enumerate(deviant_gene_indices):
        # The stored entries of the gene's column in the CSC arrays.
        stored = slice(fold_factors.indptr[gene_index], fold_factors.indptr[gene_index + 1])
        factors = fold_factors.data[stored]
        cell_rows = fold_factors.indices[stored]

        # Ascending ranks start at 1; flip them so the ordering is descending by fold factor.
        ascending = stats.rankdata(factors, method="min")
        ranks_matrix[cell_rows, column] = ascending.size + 1 - ascending

    return ranks_matrix
def _filter_genes(
    *,
    cells_count: int,
    genes_count: int,
    fold_factors: ut.CompressedMatrix,
    min_gene_fold_factor: float,
    max_gene_fraction: Optional[float] = None,
) -> ut.NumpyVector:
    """
    Return the indices of the genes whose maximal fold factor reaches the threshold.

    A gene is a candidate if the maximum of its column in ``fold_factors`` is at least
    ``min_gene_fold_factor``. If ``max_gene_fraction`` is given and the fraction of candidate genes
    exceeds it, the ``1 - max_gene_fraction`` quantile of the per-gene maxima is computed; when that
    quantile is above ``min_gene_fold_factor`` it replaces the threshold.

    In that last case ``fold_factors`` is modified in place: stored values below the raised threshold
    are zeroed and then eliminated.
    """
    ut.timed_parameters(cells=cells_count, genes=genes_count, fold_factors=fold_factors.nnz)

    gene_maxima = ut.max_per(fold_factors, per="column")
    assert gene_maxima.size == genes_count

    deviant_mask = gene_maxima >= min_gene_fold_factor
    deviant_fraction = np.sum(deviant_mask) / genes_count

    exceeds_budget = max_gene_fraction is not None and deviant_fraction > max_gene_fraction
    if exceeds_budget:
        if ut.logging_calc():
            ut.log_calc("candidate_deviant_genes", deviant_mask)

        raised_threshold = np.quantile(gene_maxima, 1 - max_gene_fraction)
        assert raised_threshold is not None
        ut.log_calc("quantile_gene_fold_factor", raised_threshold)

        # Only ever tighten the threshold; a quantile at or below the requested minimum changes nothing.
        if raised_threshold > min_gene_fold_factor:
            deviant_mask = gene_maxima >= raised_threshold

            # Drop the stored fold factors that no longer pass (mutates the caller's matrix).
            too_weak = fold_factors.data < raised_threshold
            fold_factors.data[too_weak] = 0
            ut.eliminate_zeros(fold_factors)

    if ut.logging_calc():
        ut.log_calc("deviant_genes", deviant_mask)

    return np.nonzero(deviant_mask)[0]
def _prune_ranks(
    balanced_ranks: ut.CompressedMatrix, k: int, incoming_degree_factor: float, outgoing_degree_factor: float
) -> ut.CompressedMatrix:
    """
    Prune the balanced ranks graph, always keeping the highest-ranked edge of every row.

    The incoming and outgoing degrees are ``k`` times the matching factor, rounded, and capped at
    ``size - 1``. The matrix is pruned per column (in CSC layout) using the incoming degree, and then per
    row (in CSR layout) using the outgoing degree.

    Independently, the maximal entry of each row of the original matrix is collected. These entries, and
    their transpose, are merged back into the pruned matrix with an elementwise maximum. The result is
    asserted to be a proper CSR matrix.
    """
    size = balanced_ranks.shape[0]
    degree_cap = size - 1

    incoming_degree = min(int(round(k * incoming_degree_factor)), degree_cap)
    ut.log_calc("incoming_degree", incoming_degree)

    outgoing_degree = min(int(round(k * outgoing_degree_factor)), degree_cap)
    ut.log_calc("outgoing_degree", outgoing_degree)

    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(results=size, elements=balanced_ranks.nnz / size)
        best_columns = ut.to_numpy_vector(balanced_ranks.argmax(axis=1))

    # One (row, best column) entry per row; these survive regardless of the pruning below.
    rows = np.arange(size)
    best_ranks = ut.to_numpy_vector(balanced_ranks[rows, best_columns])
    assert np.min(best_ranks) > 0
    preserved = sp.coo_matrix((best_ranks, (rows, best_columns)), shape=balanced_ranks.shape)
    preserved.has_canonical_format = True

    # Prune per column first. A single name is rebound so intermediate matrices are released early.
    edges = ut.mustbe_compressed_matrix(ut.to_layout(balanced_ranks, "column_major", symmetric=True))
    _assert_proper_compressed(edges, "csc")
    edges = ut.prune_per(edges, incoming_degree)
    _assert_proper_compressed(edges, "csc")

    # Then prune per row.
    edges = ut.mustbe_compressed_matrix(ut.to_layout(edges, "row_major"))
    _assert_proper_compressed(edges, "csr")
    edges = ut.prune_per(edges, outgoing_degree)
    _assert_proper_compressed(edges, "csr")

    # Restore the preserved entries, in both directions.
    with ut.timed_step("sparse.maximum"):
        ut.timed_parameters(collected=edges.nnz, preserved=preserved.nnz)
        edges = edges.maximum(preserved).maximum(preserved.transpose())

    ut.sort_compressed_indices(edges)

    edges = ut.mustbe_compressed_matrix(edges)
    _assert_proper_compressed(edges, "csr")
    return edges
def _weigh_edges(pruned_ranks: ut.CompressedMatrix) -> ut.CompressedMatrix:
    """
    Scale each row of ``pruned_ranks`` by the reciprocal of the row's total.

    The result is converted to row-major layout and asserted to be a proper CSR matrix.
    """
    rows_count = pruned_ranks.shape[0]
    row_scales = ut.sum_per(pruned_ranks, per="row")

    ut.timed_parameters(size=rows_count)

    # Invert the totals in place, reusing the buffer returned by ``sum_per``.
    np.reciprocal(row_scales, out=row_scales)

    # Broadcast the per-row scale as a column vector across each row's entries.
    scaled = pruned_ranks.multiply(row_scales[:, None])
    weights = ut.to_layout(scaled, "row_major")

    _assert_proper_compressed(weights, "csr")
    return weights
def _cluster_genes(
    similarities_between_candidate_genes: ut.NumpyMatrix,
    genes_cluster_method: str,
) -> List[Tuple[int, int]]:
    """
    Hierarchically cluster the candidate genes.

    Pairwise distances between the rows of ``similarities_between_candidate_genes`` are computed with
    ``scd.pdist`` (using its default metric), and passed to ``sch.linkage`` with the requested
    ``genes_cluster_method``. The value returned is whatever ``sch.linkage`` produces.

    NOTE(review): the declared return type looks looser than a linkage matrix - confirm against callers.
    """
    rows_count = similarities_between_candidate_genes.shape[0]

    with ut.timed_step("scipy.pdist"):
        ut.timed_parameters(size=rows_count)
        condensed_distances = scd.pdist(similarities_between_candidate_genes)

    with ut.timed_step("scipy.linkage"):
        ut.timed_parameters(size=condensed_distances.shape[0], method=genes_cluster_method)
        merge_tree = sch.linkage(condensed_distances, method=genes_cluster_method)

    return merge_tree
def _balance_ranks(outgoing_ranks: ut.NumpyMatrix, k: int, balanced_ranks_factor: float) -> ut.CompressedMatrix:
    """
    Turn the dense outgoing ranks into a sparse matrix of balanced ranks.

    Each entry is combined with its transposed counterpart by taking the square root of their product.
    The result is then re-expressed as ``max_rank + 1`` minus that value, where
    ``max_rank = k * balanced_ranks_factor``. Negative results and the diagonal are zeroed. The entry
    holding the largest value in each row is always kept, raised to at least 1 if needed.

    ``outgoing_ranks`` is modified in place and must not be used by the caller afterwards. Every diagonal
    entry of the input must be equal to the number of rows.

    Returns the balanced ranks as a CSR matrix with an all-zero diagonal.
    """
    size = outgoing_ranks.shape[0]

    with ut.timed_step(".multiply"):
        ut.timed_parameters(size=size)
        # This is an alias, not a copy: all the in-place operations below overwrite the caller's matrix.
        dense_balanced_ranks = outgoing_ranks
        # Every diagonal entry is expected to hold ``size``.
        assert np.sum(np.diagonal(dense_balanced_ranks) == size) == size
        # Elementwise product of each (i, j) entry with the matching (j, i) entry.
        dense_balanced_ranks *= outgoing_ranks.transpose()

    with ut.timed_step(".sqrt"):
        # Together with the product above, this is the geometric mean of the two directions.
        np.sqrt(dense_balanced_ranks, out=dense_balanced_ranks)

    max_rank = k * balanced_ranks_factor
    ut.log_calc("max_rank", max_rank)

    # Negate, so that smaller geometric means become larger values, and shift by a constant offset
    # (which is removed again after the argmax below).
    dense_balanced_ranks *= -1
    dense_balanced_ranks += 2**21

    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(size=size)
        # The column holding the smallest geometric mean in each row.
        max_index_of_each = ut.to_numpy_vector(dense_balanced_ranks.argmax(axis=1))

    # Replace the constant offset, so each entry becomes ``max_rank + 1 - geometric_mean``.
    dense_balanced_ranks += max_rank + 1 - 2**21

    # Extract the best entry of each row (fancy indexing yields a copy) and force it to be at least 1,
    # so that it survives the clearing below.
    preserved_row_indices = np.arange(size)
    preserved_column_indices = max_index_of_each
    preserved_balanced_ranks = ut.to_numpy_vector(
        dense_balanced_ranks[preserved_row_indices, preserved_column_indices]
    )
    preserved_balanced_ranks[preserved_balanced_ranks < 1] = 1

    # Entries whose geometric mean exceeds ``max_rank + 1`` are dropped, as are the self-edges.
    dense_balanced_ranks[dense_balanced_ranks < 0] = 0
    np.fill_diagonal(dense_balanced_ranks, 0)

    # Write back the preserved entries, which may have been cleared or were below 1.
    dense_balanced_ranks[preserved_row_indices, preserved_column_indices] = preserved_balanced_ranks

    # This also verifies that no row has chosen its own diagonal entry as the preserved one.
    assert np.sum(np.diagonal(dense_balanced_ranks) == 0) == size

    sparse_balanced_ranks = sp.csr_matrix(dense_balanced_ranks)
    _assert_proper_compressed(sparse_balanced_ranks, "csr")
    return sparse_balanced_ranks
def _empty_fold_factors(rows_count: int, genes_count: int) -> ut.CompressedMatrix:
    """Build an all-zero CSR matrix with ``rows_count`` rows and ``genes_count`` columns."""
    empty = sparse.csr_matrix(([], [], [0] * (rows_count + 1)), shape=(rows_count, genes_count))
    assert empty.has_sorted_indices
    assert empty.has_canonical_format
    return empty


def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    """
    Compute a sparse matrix of fold factors for the cells of each candidate.

    ``candidate_of_cells`` holds, per cell (row of ``data``), the index of its candidate, or a negative
    value for a cell that belongs to no candidate. Every index between zero and the maximal one must have
    at least one cell.

    Returns two parallel lists. The first holds a CSR matrix per candidate, with a row per cell of the
    candidate and a column per gene. The second holds the indices (in ``data``) of the cells of the rows
    of the matching matrix. If some cells have no candidate, a final all-zero matrix is added for them.

    A candidate with fewer than two cells gets an all-zero matrix. Otherwise the candidate's rows are
    copied, and the matching ``xt.fold_factor_*`` extension is invoked on the copy, with
    ``min_gene_fold_factor``, ``abs_folds``, the totals of the candidate's cells, and the fraction of
    each gene out of the candidate's total. The extension's return value is ignored; the fold factors are
    taken from the buffers passed to it.

    If ``candidate_of_cells`` is empty, both returned lists are empty.
    """
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape

    # ``np.max`` raises ``ValueError`` for a zero-size array, so treat "no cells" as "no candidates".
    if candidate_of_cells.size > 0:
        candidates_count = np.max(candidate_of_cells) + 1
    else:
        candidates_count = 0

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)

    # Whatever is left after all the candidates were processed are the cells without a candidate.
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]
        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        if candidate_cells_count < 2:
            list_of_fold_factors.append(_empty_fold_factors(candidate_cells_count, genes_count))
            continue

        # Work on a copy, since the extensions overwrite the buffers they are given.
        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        fractions_of_candidate_genes = ut.to_numpy_vector(totals_of_candidate_genes / np.sum(totals_of_candidate_genes))

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            # The extension is specialized by the types of the three arrays of the compressed matrix.
            extension_name = (
                f"fold_factor_compressed_{compressed.data.dtype}_t"
                f"_{compressed.indices.dtype}_t"
                f"_{compressed.indptr.dtype}_t"
            )
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        list_of_fold_factors.append(_empty_fold_factors(remaining_cells_count, genes_count))

    return list_of_fold_factors, list_of_cell_index_of_rows