Exemple #1
0
def _fold_ranks(
    *,
    cells_count: int,
    fold_factors: ut.CompressedMatrix,
    deviant_gene_indices: ut.NumpyVector,
) -> ut.NumpyMatrix:
    """
    Rank, for each deviant gene, the cells by their stored fold factor (highest first).

    ``fold_factors`` must be in CSC format (this is asserted), so the stored values of each gene (column) are
    a contiguous slice of ``data``/``indices`` delimited by ``indptr``.

    Returns a dense column-major matrix of shape ``(cells_count, deviant genes count)``. Column ``i`` describes
    the gene ``deviant_gene_indices[i]``. Cells which have a stored fold factor for the gene get a rank which
    starts at 1 for the highest fold factor; all other cells keep the fill value ``cells_count``.

    Ties are resolved by inverting the ascending ``"min"`` ranks, so tied fold factors share the largest
    (worst) descending rank of their group.
    """
    assert fold_factors.getformat() == "csc"

    deviant_genes_count = deviant_gene_indices.size
    ut.timed_parameters(cells=cells_count, deviant_genes=deviant_genes_count)

    # Start every entry at ``cells_count``; only cells with a stored fold factor are overwritten below.
    ranks = np.full((cells_count, deviant_genes_count), cells_count, order="F")
    assert ut.is_layout(ranks, "column_major")

    indptr = fold_factors.indptr
    for column, gene_index in enumerate(deviant_gene_indices):
        # The stored entries of this gene's column.
        stored = slice(indptr[gene_index], indptr[gene_index + 1])

        # Ascending ranks (1 is the lowest value), flipped so that 1 is the highest value.
        ascending = stats.rankdata(fold_factors.data[stored], method="min")
        ranks[fold_factors.indices[stored], column] = ascending.size + 1 - ascending

    return ranks
Exemple #2
0
def _filter_genes(
    *,
    cells_count: int,
    genes_count: int,
    fold_factors: ut.CompressedMatrix,
    min_gene_fold_factor: float,
    max_gene_fraction: Optional[float] = None,
) -> ut.NumpyVector:
    """
    Pick the indices of the genes (columns) whose maximal fold factor reaches a threshold.

    A gene is deviant if its per-column maximum (as reported by ``ut.max_per``) is at least
    ``min_gene_fold_factor``. If ``max_gene_fraction`` is given and the fraction of deviant genes exceeds it,
    the threshold is raised to the ``1 - max_gene_fraction`` quantile of the per-gene maxima (only if this is
    higher than the original threshold).

    When the threshold is raised, ``fold_factors`` is modified in place: stored values below the new threshold
    are set to zero and then dropped by ``ut.eliminate_zeros``.

    ``cells_count`` is only used for reporting the timing parameters.

    Returns the vector of indices of the deviant genes.
    """
    ut.timed_parameters(cells=cells_count, genes=genes_count, fold_factors=fold_factors.nnz)

    gene_peaks = ut.max_per(fold_factors, per="column")
    assert gene_peaks.size == genes_count

    deviant_mask = gene_peaks >= min_gene_fold_factor
    deviant_fraction = np.sum(deviant_mask) / genes_count

    if max_gene_fraction is not None and deviant_fraction > max_gene_fraction:
        if ut.logging_calc():
            ut.log_calc("candidate_deviant_genes", deviant_mask)

        raised_threshold = np.quantile(gene_peaks, 1 - max_gene_fraction)
        assert raised_threshold is not None
        ut.log_calc("quantile_gene_fold_factor", raised_threshold)

        # Only ever make the selection stricter, never more lenient.
        if raised_threshold > min_gene_fold_factor:
            deviant_mask = gene_peaks >= raised_threshold

            # Drop the stored fold factors which no longer reach the (raised) threshold.
            too_weak = fold_factors.data < raised_threshold
            fold_factors.data[too_weak] = 0
            ut.eliminate_zeros(fold_factors)

    if ut.logging_calc():
        ut.log_calc("deviant_genes", deviant_mask)

    return np.where(deviant_mask)[0]
Exemple #3
0
def _prune_ranks(balanced_ranks: ut.CompressedMatrix, k: int,
                 incoming_degree_factor: float,
                 outgoing_degree_factor: float) -> ut.CompressedMatrix:
    """
    Limit the number of entries per column and per row of the balanced ranks.

    The largest entry of each row (its ``argmax``) is recorded up front and
    merged back at the end, together with its transpose, using an
    element-wise maximum, so it survives the pruning in both directions.

    In between, the matrix is passed through ``ut.prune_per`` twice: in
    column-major layout with the incoming degree, then in row-major layout
    with the outgoing degree (presumably keeping that many of the highest
    entries per column/row - confirm against ``ut.prune_per``).

    Each degree is ``k`` times its factor, rounded to an integer and capped
    at ``size - 1``.

    Returns the pruned ranks as a CSR matrix with sorted indices.
    """
    size = balanced_ranks.shape[0]
    degree_cap = size - 1

    incoming_degree = min(int(round(k * incoming_degree_factor)), degree_cap)
    ut.log_calc("incoming_degree", incoming_degree)

    outgoing_degree = min(int(round(k * outgoing_degree_factor)), degree_cap)
    ut.log_calc("outgoing_degree", outgoing_degree)

    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(results=size, elements=balanced_ranks.nnz / size)
        best_columns = ut.to_numpy_vector(balanced_ranks.argmax(axis=1))

    # One (row, column, value) triplet per row: the row's largest entry.
    rows = np.arange(size)
    best_ranks = ut.to_numpy_vector(balanced_ranks[rows, best_columns])
    assert np.min(best_ranks) > 0
    best_edges = sp.coo_matrix((best_ranks, (rows, best_columns)),
                               shape=balanced_ranks.shape)
    best_edges.has_canonical_format = True

    # Prune per column (incoming edges) ...
    ranks = ut.to_layout(balanced_ranks, "column_major", symmetric=True)
    ranks = ut.mustbe_compressed_matrix(ranks)
    _assert_proper_compressed(ranks, "csc")

    ranks = ut.prune_per(ranks, incoming_degree)
    _assert_proper_compressed(ranks, "csc")

    # ... and then per row (outgoing edges).
    ranks = ut.mustbe_compressed_matrix(ut.to_layout(ranks, "row_major"))
    _assert_proper_compressed(ranks, "csr")

    ranks = ut.prune_per(ranks, outgoing_degree)
    _assert_proper_compressed(ranks, "csr")

    # Restore the recorded best entries, in both directions.
    with ut.timed_step("sparse.maximum"):
        ut.timed_parameters(collected=ranks.nnz, preserved=best_edges.nnz)
        ranks = ranks.maximum(best_edges)
        ranks = ranks.maximum(best_edges.transpose())

    ut.sort_compressed_indices(ranks)

    ranks = ut.mustbe_compressed_matrix(ranks)
    _assert_proper_compressed(ranks, "csr")
    return ranks
Exemple #4
0
def _weigh_edges(pruned_ranks: ut.CompressedMatrix) -> ut.CompressedMatrix:
    """
    Scale each row of the pruned ranks by the reciprocal of the row's total.

    Returns the scaled matrix in row-major (CSR) layout.
    """
    size = pruned_ranks.shape[0]
    row_totals = ut.sum_per(pruned_ranks, per="row")
    ut.timed_parameters(size=size)

    # Invert the totals in place, reusing the buffer as the per-row scale.
    np.reciprocal(row_totals, out=row_totals)

    # The column vector broadcasts one scale factor across each row.
    weights = ut.to_layout(pruned_ranks.multiply(row_totals[:, None]),
                           "row_major")

    _assert_proper_compressed(weights, "csr")
    return weights
Exemple #5
0
def _cluster_genes(
    similarities_between_candidate_genes: ut.NumpyMatrix,
    genes_cluster_method: str,
) -> List[Tuple[int, int]]:
    """
    Compute a hierarchical clustering of the rows of the similarities matrix.

    The pairwise distances between the rows are computed by ``scd.pdist``
    (using its default metric) and given to ``sch.linkage`` using the
    ``genes_cluster_method``.

    Returns the result of ``sch.linkage`` as-is.

    NOTE(review): scipy documents the linkage result as a 2D array, not the
    annotated list of integer pairs - confirm how callers consume it.
    """
    rows_count = similarities_between_candidate_genes.shape[0]
    with ut.timed_step("scipy.pdist"):
        ut.timed_parameters(size=rows_count)
        condensed_distances = scd.pdist(similarities_between_candidate_genes)

    with ut.timed_step("scipy.linkage"):
        ut.timed_parameters(size=condensed_distances.shape[0],
                            method=genes_cluster_method)
        hierarchy = sch.linkage(condensed_distances,
                                method=genes_cluster_method)

    return hierarchy
Exemple #6
0
def _balance_ranks(outgoing_ranks: ut.NumpyMatrix, k: int,
                   balanced_ranks_factor: float) -> ut.CompressedMatrix:
    """
    Combine each entry of the outgoing ranks with its mirror entry into a
    single balanced value, and keep only the sufficiently strong ones.

    The computation is done in place, so the ``outgoing_ranks`` matrix is
    overwritten. Every entry of its diagonal must be equal to the matrix size
    (this is asserted).

    The balanced rank of an entry is the square root of the product of the
    entry and its mirror across the diagonal. It is converted to the value
    ``max_rank + 1 - balanced rank`` where ``max_rank`` is
    ``k * balanced_ranks_factor``; negative results, and the diagonal, are
    set to zero. In each row, the entry with the lowest balanced rank is
    always kept, with a value of at least 1.

    Returns the result as a CSR matrix.
    """
    size = outgoing_ranks.shape[0]

    with ut.timed_step(".multiply"):
        ut.timed_parameters(size=size)
        # This is an alias rather than a copy, so all the in-place operations
        # below modify the caller's ``outgoing_ranks``.
        dense_balanced_ranks = outgoing_ranks
        # Every diagonal entry must hold the value ``size``.
        assert np.sum(np.diagonal(dense_balanced_ranks) == size) == size
        # Multiply each entry by its mirror entry across the diagonal.
        dense_balanced_ranks *= outgoing_ranks.transpose()

    with ut.timed_step(".sqrt"):
        # Together with the product above, this is the geometric mean of the
        # two opposite entries.
        np.sqrt(dense_balanced_ranks, out=dense_balanced_ranks)

    max_rank = k * balanced_ranks_factor
    ut.log_calc("max_rank", max_rank)

    # Negate (and shift) the values so that the lowest balanced rank of each
    # row becomes the row's maximum.
    # NOTE(review): the purpose of the temporary 2**21 offset is not evident
    # from this function (presumably it keeps the values positive while
    # looking for the maximum) - confirm.
    dense_balanced_ranks *= -1
    dense_balanced_ranks += 2**21

    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(size=size)
        max_index_of_each = ut.to_numpy_vector(
            dense_balanced_ranks.argmax(axis=1))  #

    # Remove the temporary offset; each entry is now
    # ``max_rank + 1 - balanced rank``.
    dense_balanced_ranks += max_rank + 1 - 2**21

    # Fetch the best entry of each row, and force it to be at least 1 so it
    # is kept as a stored (non-zero) value.
    preserved_row_indices = np.arange(size)
    preserved_column_indices = max_index_of_each
    preserved_balanced_ranks = ut.to_numpy_vector(
        dense_balanced_ranks[preserved_row_indices, preserved_column_indices])

    preserved_balanced_ranks[preserved_balanced_ranks < 1] = 1

    # Discard the entries whose balanced rank is above ``max_rank + 1``, and
    # the diagonal.
    dense_balanced_ranks[dense_balanced_ranks < 0] = 0
    np.fill_diagonal(dense_balanced_ranks, 0)

    # Write back the best entry of each row. This is done after clearing the
    # diagonal, so the assertion below would fail if the best entry of some
    # row were on the diagonal.
    dense_balanced_ranks[preserved_row_indices,
                         preserved_column_indices] = preserved_balanced_ranks

    assert np.sum(np.diagonal(dense_balanced_ranks) == 0) == size
    sparse_balanced_ranks = sp.csr_matrix(dense_balanced_ranks)

    _assert_proper_compressed(sparse_balanced_ranks, "csr")
    return sparse_balanced_ranks
Exemple #7
0
def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    """
    Compute a sparse matrix of fold factors for the cells of each candidate.

    The ``data`` holds a row per cell and a column per gene. The ``candidate_of_cells`` holds the candidate
    index of each cell; cells with a negative index do not belong to any candidate. Every candidate index
    between zero and the maximal one must have at least one cell (this is asserted).

    Returns two lists with matching entries. The first holds one CSR matrix per candidate, with a row per
    cell of the candidate and a column per gene. The second holds, for each such matrix, the indices (in
    ``data``) of the cells its rows refer to. If some cells do not belong to any candidate, a final entry is
    added to both lists, holding an empty matrix for these cells.

    The fold factors themselves are computed by the ``xt`` extension functions, selected by the data types of
    the arrays. The ``min_gene_fold_factor`` and ``abs_folds`` are passed to them as-is.
    """
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape
    candidates_count = np.max(candidate_of_cells) + 1

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)
    # Tracks how many cells were not assigned to any of the candidates visited so far.
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]

        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        # The indices are recorded before any of the ``continue`` statements below, so the two result lists
        # always have matching entries.
        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        # A candidate with a single cell gets a matrix with no stored values.
        if candidate_cells_count < 2:
            compressed = sparse.csr_matrix(
                ([], [], [0] * (candidate_cells_count + 1)), shape=(candidate_cells_count, genes_count)
            )
            list_of_fold_factors.append(compressed)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format
            continue

        # Work on a copy of the candidate's rows; presumably this is so the extension calls below do not
        # modify the caller's ``data`` - confirm.
        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        # The fraction of each gene out of the total of all the genes in the candidate's cells.
        fractions_of_candidate_genes = ut.to_numpy_vector(totals_of_candidate_genes / np.sum(totals_of_candidate_genes))

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            # No stored values, so there is nothing to compute.
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            # Pick the extension function matching the data types of the compressed arrays.
            extension_name = "fold_factor_compressed_%s_t_%s_t_%s_t" % (  # pylint: disable=consider-using-f-string
                compressed.data.dtype,
                compressed.indices.dtype,
                compressed.indptr.dtype,
            )
            extension = getattr(xt, extension_name)

            # The result of the call is ignored and the matrix is used afterwards, so the extension is
            # presumably expected to overwrite the stored values in place - confirm against ``xt``.
            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            # Pick the extension function matching the data type of the dense matrix.
            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            # As above, the dense matrix is presumably overwritten in place - confirm against ``xt``.
            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            # Converting to CSR keeps only the non-zero entries of the dense matrix.
            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    # Cells which were not covered by any candidate must be exactly the ones with a negative candidate index;
    # they get a final matrix with no stored values.
    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        compressed = sparse.csr_matrix(
            ([], [], [0] * (remaining_cells_count + 1)), shape=(remaining_cells_count, genes_count)
        )
        assert compressed.has_sorted_indices
        assert compressed.has_canonical_format
        list_of_fold_factors.append(compressed)

    return list_of_fold_factors, list_of_cell_index_of_rows