Example #1
0
def project_atlas_to_query(
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.ProperMatrix,
    property_name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    to_property_name: Optional[str] = None,
    method: Callable[[ut.Vector, ut.Vector], Any] = ut.highest_weight,
) -> None:
    """
    Project the value of a property from per-observation atlas data to per-observation query data.

    The input annotated ``adata`` is expected to contain a per-observation (cell) annotation named ``property_name``.
    Given the ``weights`` matrix, where each row specifies the weights of the atlas metacells used to project a single
    query metacell, this will generate a new per-observation (group) annotation in ``qdata``, named ``to_property_name``
    (by default, the same as ``property_name``), containing the aggregated value of the property of all the observations
    (cells) that belong to the group.

    The aggregation method (by default, :py:func:`metacells.utilities.computation.highest_weight`) is any function
    taking two array, weights and values, and returning a single value.
    """
    if to_property_name is None:
        to_property_name = property_name

    property_of_atlas_metacells = ut.get_o_numpy(adata, property_name, formatter=formatter)
    property_of_query_metacells = []
    for query_metacell_index in range(qdata.n_obs):
        metacell_weights = ut.to_numpy_vector(weights[query_metacell_index, :])
        metacell_mask = metacell_weights > 0
        metacell_weights = ut.to_numpy_vector(metacell_weights[metacell_mask])
        metacell_values = property_of_atlas_metacells[metacell_mask]
        property_of_query_metacells.append(method(metacell_weights, metacell_values))

    ut.set_o_data(qdata, to_property_name, np.array(property_of_query_metacells))
Example #2
0
def compute_query_projection(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.Matrix,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
) -> None:
    """
    Compute the projected image of the query on the atlas.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    The ``weights`` of the projection where each row is a query metacell, each column is an atlas metacell, and the
    value is the weight of the atlas cell for projecting the metacell, such that the sum of weights in each row
    is one.

    **Returns**

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``projection``
            The number of UMIs of each gene in the projected image of the query to the metacell, if the total number of
            UMIs in the projection is equal to the total number of UMIs in the query metacell.

    **Computation Parameters**

    1. Compute the fraction of each gene in the atlas and the query based on the total UMIs, unless ``atlas_total_umis``
       and/or ``query_total_umis`` are specified.

    2. Compute the projected image of each query metacell on the atlas using the weights.

    3. Convert this image to UMIs count based on the total UMIs of each metacell. Note that if overriding the total
       atlas or query UMIs, this means that the result need not sum to this total.
    """
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    projected_fractions = weights @ atlas_fractions  # type: ignore
    projected_umis = ut.scale_by(projected_fractions, scale=query_total_umis, by="row")
    ut.set_vo_data(qdata, "projected", projected_umis)
Example #3
0
def _project_single_metacell(
    *,
    query_metacell_index: int,
    atlas_umis: ut.Matrix,
    query_atlas_corr: ut.NumpyMatrix,
    atlas_project_data: ut.NumpyMatrix,
    query_project_data: ut.NumpyMatrix,
    atlas_log_fractions: ut.NumpyMatrix,
    candidates_count: int,
    min_significant_gene_value: float,
    min_usage_weight: float,
    max_consistency_fold_factor: float,
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    query_metacell_project_data = query_project_data[query_metacell_index, :]
    query_metacell_atlas_correlations = query_atlas_corr[query_metacell_index, :]
    query_metacell_atlas_order = np.argsort(-query_metacell_atlas_correlations)

    atlas_anchor_index = query_metacell_atlas_order[0]
    ut.log_calc("atlas_anchor_index", atlas_anchor_index)
    atlas_anchor_log_fractions = atlas_log_fractions[atlas_anchor_index, :]
    atlas_anchor_umis = ut.to_numpy_vector(atlas_umis[atlas_anchor_index, :])

    atlas_candidate_indices_list = [atlas_anchor_index]
    position = 1
    while len(atlas_candidate_indices_list) < candidates_count and position < len(query_metacell_atlas_order):
        atlas_metacell_index = query_metacell_atlas_order[position]
        position += 1
        atlas_metacell_log_fractions = atlas_log_fractions[atlas_metacell_index, :]
        atlas_metacell_consistency_fold_factors = np.abs(atlas_metacell_log_fractions - atlas_anchor_log_fractions)
        atlas_metacell_umis = ut.to_numpy_vector(atlas_umis[atlas_metacell_index, :])
        atlas_metacell_significant_genes_mask = atlas_metacell_umis + atlas_anchor_umis >= min_significant_gene_value
        atlas_metacell_consistency = np.max(
            atlas_metacell_consistency_fold_factors[atlas_metacell_significant_genes_mask]
        )
        if atlas_metacell_consistency <= max_consistency_fold_factor / 2.0:
            atlas_candidate_indices_list.append(atlas_metacell_index)

    atlas_candidate_indices = np.array(sorted(atlas_candidate_indices_list))
    atlas_candidates_project_data = atlas_project_data[atlas_candidate_indices, :]

    represent_result = ut.represent(query_metacell_project_data, atlas_candidates_project_data)
    assert represent_result is not None
    atlas_candidate_weights = represent_result[1]
    atlas_candidate_weights[atlas_candidate_weights < min_usage_weight] = 0
    atlas_candidate_weights[atlas_candidate_weights < min_usage_weight] /= np.sum(atlas_candidate_weights)

    atlas_used_mask = atlas_candidate_weights > 0

    atlas_used_indices = atlas_candidate_indices[atlas_used_mask].astype("int32")
    ut.log_return("atlas_used_indices", atlas_used_indices)

    atlas_used_weights = atlas_candidate_weights[atlas_used_mask]
    atlas_used_weights = atlas_used_weights.astype("float32")
    ut.log_return("atlas_used_weights", atlas_used_weights)

    return (atlas_used_indices, atlas_used_weights)
Example #4
0
def _prune_ranks(balanced_ranks: ut.CompressedMatrix, k: int,
                 incoming_degree_factor: float,
                 outgoing_degree_factor: float) -> ut.CompressedMatrix:
    size = balanced_ranks.shape[0]

    incoming_degree = int(round(k * incoming_degree_factor))
    incoming_degree = min(incoming_degree, size - 1)
    ut.log_calc("incoming_degree", incoming_degree)

    outgoing_degree = int(round(k * outgoing_degree_factor))
    outgoing_degree = min(outgoing_degree, size - 1)
    ut.log_calc("outgoing_degree", outgoing_degree)

    all_indices = np.arange(size)
    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(results=size, elements=balanced_ranks.nnz / size)
        max_index_of_each = ut.to_numpy_vector(balanced_ranks.argmax(axis=1))

    preserved_row_indices = all_indices
    preserved_column_indices = max_index_of_each
    preserved_balanced_ranks = ut.to_numpy_vector(
        balanced_ranks[preserved_row_indices, preserved_column_indices])
    assert np.min(preserved_balanced_ranks) > 0
    preserved_matrix = sp.coo_matrix(
        (preserved_balanced_ranks,
         (preserved_row_indices, preserved_column_indices)),
        shape=balanced_ranks.shape)
    preserved_matrix.has_canonical_format = True

    pruned_ranks = ut.mustbe_compressed_matrix(
        ut.to_layout(balanced_ranks, "column_major", symmetric=True))
    _assert_proper_compressed(pruned_ranks, "csc")

    pruned_ranks = ut.prune_per(pruned_ranks, incoming_degree)
    _assert_proper_compressed(pruned_ranks, "csc")

    pruned_ranks = ut.mustbe_compressed_matrix(
        ut.to_layout(pruned_ranks, "row_major"))
    _assert_proper_compressed(pruned_ranks, "csr")

    pruned_ranks = ut.prune_per(pruned_ranks, outgoing_degree)
    _assert_proper_compressed(pruned_ranks, "csr")

    with ut.timed_step("sparse.maximum"):
        ut.timed_parameters(collected=pruned_ranks.nnz,
                            preserved=preserved_matrix.nnz)
        pruned_ranks = pruned_ranks.maximum(preserved_matrix)
        pruned_ranks = pruned_ranks.maximum(preserved_matrix.transpose())

    ut.sort_compressed_indices(pruned_ranks)

    pruned_ranks = ut.mustbe_compressed_matrix(pruned_ranks)
    _assert_proper_compressed(pruned_ranks, "csr")
    return pruned_ranks
Example #5
0
def _balance_ranks(outgoing_ranks: ut.NumpyMatrix, k: int,
                   balanced_ranks_factor: float) -> ut.CompressedMatrix:
    size = outgoing_ranks.shape[0]

    with ut.timed_step(".multiply"):
        ut.timed_parameters(size=size)
        dense_balanced_ranks = outgoing_ranks
        assert np.sum(np.diagonal(dense_balanced_ranks) == size) == size
        dense_balanced_ranks *= outgoing_ranks.transpose()

    with ut.timed_step(".sqrt"):
        np.sqrt(dense_balanced_ranks, out=dense_balanced_ranks)

    max_rank = k * balanced_ranks_factor
    ut.log_calc("max_rank", max_rank)

    dense_balanced_ranks *= -1
    dense_balanced_ranks += 2**21

    with ut.timed_step("numpy.argmax"):
        ut.timed_parameters(size=size)
        max_index_of_each = ut.to_numpy_vector(
            dense_balanced_ranks.argmax(axis=1))  #

    dense_balanced_ranks += max_rank + 1 - 2**21

    preserved_row_indices = np.arange(size)
    preserved_column_indices = max_index_of_each
    preserved_balanced_ranks = ut.to_numpy_vector(
        dense_balanced_ranks[preserved_row_indices, preserved_column_indices])

    preserved_balanced_ranks[preserved_balanced_ranks < 1] = 1

    dense_balanced_ranks[dense_balanced_ranks < 0] = 0
    np.fill_diagonal(dense_balanced_ranks, 0)

    dense_balanced_ranks[preserved_row_indices,
                         preserved_column_indices] = preserved_balanced_ranks

    assert np.sum(np.diagonal(dense_balanced_ranks) == 0) == size
    sparse_balanced_ranks = sp.csr_matrix(dense_balanced_ranks)

    _assert_proper_compressed(sparse_balanced_ranks, "csr")
    return sparse_balanced_ranks
Example #6
0
def _compute_cell_certificates(
    *,
    cell_index: int,
    cells_data: ut.Matrix,
    metacells_data: ut.Matrix,
    group_of_cells: ut.NumpyVector,
    similar_of_cells: ut.NumpyVector,
    total_umis_per_cell: ut.NumpyVector,
    total_umis_per_metacell: ut.NumpyVector,
    significant_gene_fold_factor: float,
) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
    group_index = group_of_cells[cell_index]
    similar_index = similar_of_cells[cell_index]
    assert (group_index < 0) != (similar_index < 0)
    metacell_index = max(group_index, similar_index)

    expected_scale = total_umis_per_cell[cell_index] / total_umis_per_metacell[
        metacell_index]
    expected_data = ut.to_numpy_vector(metacells_data[metacell_index, :],
                                       copy=True)
    expected_data *= expected_scale

    actual_data = ut.to_numpy_vector(cells_data[cell_index, :], copy=True)

    expected_data += 1.0
    actual_data += 1.0

    fold_factors = actual_data
    fold_factors /= expected_data
    fold_factors = np.log2(fold_factors, out=fold_factors)
    fold_factors = np.abs(fold_factors, out=fold_factors)

    significant_folds_mask = fold_factors >= significant_gene_fold_factor
    ut.log_calc("significant_folds_mask", significant_folds_mask)

    significant_gene_indices = np.where(significant_folds_mask)[0].astype(
        "int32")
    significant_gene_folds = fold_factors[significant_folds_mask].astype(
        "float32")
    return (significant_gene_indices, significant_gene_folds)
Example #7
0
def _build_igraph(edge_weights: ut.Matrix) -> Tuple[ig.Graph, ut.NumpyVector]:
    edge_weights = ut.to_proper_matrix(edge_weights)
    assert edge_weights.shape[0] == edge_weights.shape[1]
    size = edge_weights.shape[0]

    sources, targets = edge_weights.nonzero()
    weights_array = ut.to_numpy_vector(edge_weights[sources,
                                                    targets]).astype("float64")

    graph = ig.Graph(directed=True)
    graph.add_vertices(size)
    graph.add_edges(list(zip(sources, targets)))
    graph.es["weight"] = weights_array

    return graph, weights_array
Example #8
0
def project_query_onto_atlas(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    atlas_total_umis: Optional[ut.Vector] = None,
    query_total_umis: Optional[ut.Vector] = None,
    project_log_data: bool = pr.project_log_data,
    fold_normalization: float = pr.project_fold_normalization,
    min_significant_gene_value: float = pr.project_min_significant_gene_value,
    max_consistency_fold_factor: float = pr.project_max_consistency_fold_factor,
    candidates_count: int = pr.project_candidates_count,
    min_usage_weight: float = pr.project_min_usage_weight,
    reproducible: bool,
) -> ut.CompressedMatrix:
    """
    Project query metacells onto atlas metacells.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation
    containing such a matrix.

    Typically this data excludes any genes having a systematic difference between the query and the atlas, e.g. genes
    detected by by :py:func:`metacells.tools.project.find_systematic_genes`.

    **Returns**

    A matrix whose rows are query metacells and columns are atlas metacells, where each entry is the weight of the atlas
    metacell in the projection of the query metacells. The sum of weights in each row (that is, for a single query
    metacell) is 1. The weighted sum of the atlas metacells using these weights is the "projected" image of the query
    metacell onto the atlas.

    In addition, sets the following annotations in ``qdata``:

    Observation (Cell) Annotations
        ``similar``
            A boolean mask indicating whether the query metacell is similar to its projection onto the atlas. If
            ``False`` the metacells is said to be "dissimilar", which may indicate the query contains cell states that
            do not appear in the atlas.

    **Computation Parameters**

    0. All fold computations (log2 of the ratio between gene expressions as a fraction of the total UMIs) use the
       ``fold_normalization`` (default: {fold_normalization}). Fractions are computed based on the total UMIs, unless
       ``atlas_total_umis`` and/or ``query_total_umis`` are specified.

    For each query metacell:

    1. Correlate the metacell with all the atlas metacells, and pick the highest-correlated one as the "anchor".
       If ``reproducible``, a slower (still parallel) but reproducible algorithm will be used.

    2. Consider as candidates only atlas metacells whose maximal gene fold factor compared to the anchor is at most
       ``max_consistency_fold_factor`` (default: {max_consistency_fold_factor}). Ignore the fold factors of genes whose
       sum of UMIs in the anchor and the candidate metacells is less than ``min_significant_gene_value`` (default:
       {min_significant_gene_value}).

    3. Select the ``candidates_count`` (default: {candidates_count}) candidate metacells with the highest correlation
       with the query metacell.

    4. Compute the non-negative weights (with a sum of 1) of the selected candidates that give the best projection of
       the query metacells onto the atlas. Since the algorithm for computing these weights rarely produces an exact 0
       weight, reduce all weights less than the ``min_usage_weight`` (default: {min_usage_weight}) to zero. If
       ``project_log_data`` (default: {project_log_data}), compute the match on the log of the data instead of the
       actual data.
    """
    assert fold_normalization > 0
    assert candidates_count > 0
    assert min_usage_weight >= 0
    assert max_consistency_fold_factor >= 0
    assert np.all(adata.var_names == qdata.var_names)

    atlas_umis = ut.get_vo_proper(adata, what, layout="row_major")
    query_umis = ut.get_vo_proper(qdata, what, layout="row_major")

    if atlas_total_umis is None:
        atlas_total_umis = ut.sum_per(atlas_umis, per="row")
    atlas_total_umis = ut.to_numpy_vector(atlas_total_umis)

    if query_total_umis is None:
        query_total_umis = ut.sum_per(query_umis, per="row")
    query_total_umis = ut.to_numpy_vector(query_total_umis)

    atlas_fractions = ut.to_numpy_matrix(ut.fraction_by(atlas_umis, by="row", sums=atlas_total_umis))
    query_fractions = ut.to_numpy_matrix(ut.fraction_by(query_umis, by="row", sums=query_total_umis))

    atlas_fractions += fold_normalization
    query_fractions += fold_normalization

    atlas_log_fractions = np.log2(atlas_fractions)
    query_log_fractions = np.log2(query_fractions)

    atlas_fractions -= fold_normalization
    query_fractions -= fold_normalization

    if project_log_data:
        atlas_project_data = atlas_log_fractions
        query_project_data = query_log_fractions
    else:
        atlas_project_data = atlas_fractions
        query_project_data = query_fractions

    query_atlas_corr = ut.cross_corrcoef_rows(query_project_data, atlas_project_data, reproducible=reproducible)

    @ut.timed_call("project_single_metacell")
    def _project_single(query_metacell_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        return _project_single_metacell(
            atlas_umis=atlas_umis,
            query_atlas_corr=query_atlas_corr,
            atlas_project_data=atlas_project_data,
            query_project_data=query_project_data,
            atlas_log_fractions=atlas_log_fractions,
            candidates_count=candidates_count,
            min_significant_gene_value=min_significant_gene_value,
            min_usage_weight=min_usage_weight,
            max_consistency_fold_factor=max_consistency_fold_factor,
            query_metacell_index=query_metacell_index,
        )

    results = list(ut.parallel_map(_project_single, qdata.n_obs))

    indices = np.concatenate([result[0] for result in results], dtype="int32")
    data = np.concatenate([result[1] for result in results], dtype="float32")

    atlas_used_sizes = [len(result[0]) for result in results]
    atlas_used_sizes.insert(0, 0)
    indptr = np.cumsum(np.array(atlas_used_sizes))

    return sp.csr_matrix((data, indices, indptr), shape=(qdata.n_obs, adata.n_obs))
Example #9
0
def _collect_fold_factors(  # pylint: disable=too-many-statements
    *,
    data: ut.ProperMatrix,
    candidate_of_cells: ut.NumpyVector,
    totals_of_cells: ut.NumpyVector,
    min_gene_fold_factor: float,
    abs_folds: bool,
) -> Tuple[List[ut.CompressedMatrix], List[ut.NumpyVector]]:
    list_of_fold_factors: List[ut.CompressedMatrix] = []
    list_of_cell_index_of_rows: List[ut.NumpyVector] = []

    cells_count, genes_count = data.shape
    candidates_count = np.max(candidate_of_cells) + 1

    ut.timed_parameters(candidates=candidates_count, cells=cells_count, genes=genes_count)
    remaining_cells_count = cells_count

    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(candidate_of_cells == candidate_index)[0]

        candidate_cells_count = candidate_cell_indices.size
        assert candidate_cells_count > 0

        list_of_cell_index_of_rows.append(candidate_cell_indices)
        remaining_cells_count -= candidate_cells_count

        if candidate_cells_count < 2:
            compressed = sparse.csr_matrix(
                ([], [], [0] * (candidate_cells_count + 1)), shape=(candidate_cells_count, genes_count)
            )
            list_of_fold_factors.append(compressed)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format
            continue

        data_of_candidate: ut.ProperMatrix = data[candidate_cell_indices, :].copy()
        assert ut.is_layout(data_of_candidate, "row_major")
        assert data_of_candidate.shape == (candidate_cells_count, genes_count)

        totals_of_candidate_cells = totals_of_cells[candidate_cell_indices]

        totals_of_candidate_genes = ut.sum_per(ut.to_layout(data_of_candidate, "column_major"), per="column")
        assert totals_of_candidate_genes.size == genes_count

        fractions_of_candidate_genes = ut.to_numpy_vector(totals_of_candidate_genes / np.sum(totals_of_candidate_genes))

        _, dense, compressed = ut.to_proper_matrices(data_of_candidate)

        if compressed is not None:
            if compressed.nnz == 0:
                list_of_fold_factors.append(compressed)
                continue

            extension_name = "fold_factor_compressed_%s_t_%s_t_%s_t" % (  # pylint: disable=consider-using-f-string
                compressed.data.dtype,
                compressed.indices.dtype,
                compressed.indptr.dtype,
            )
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_compressed"):
                extension(
                    compressed.data,
                    compressed.indices,
                    compressed.indptr,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            ut.eliminate_zeros(compressed)

        else:
            assert dense is not None

            extension_name = f"fold_factor_dense_{dense.dtype}_t"
            extension = getattr(xt, extension_name)

            with ut.timed_step("extensions.fold_factor_dense"):
                extension(
                    dense,
                    min_gene_fold_factor,
                    abs_folds,
                    totals_of_candidate_cells,
                    fractions_of_candidate_genes,
                )

            compressed = sparse.csr_matrix(dense)
            assert compressed.has_sorted_indices
            assert compressed.has_canonical_format

        list_of_fold_factors.append(compressed)

    if remaining_cells_count > 0:
        assert remaining_cells_count == np.sum(candidate_of_cells < 0)
        list_of_cell_index_of_rows.append(np.where(candidate_of_cells < 0)[0])
        compressed = sparse.csr_matrix(
            ([], [], [0] * (remaining_cells_count + 1)), shape=(remaining_cells_count, genes_count)
        )
        assert compressed.has_sorted_indices
        assert compressed.has_canonical_format
        list_of_fold_factors.append(compressed)

    return list_of_fold_factors, list_of_cell_index_of_rows
Example #10
0
def find_noisy_lonely_genes(  # pylint: disable=too-many-statements
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    excluded_genes_mask: Optional[str] = None,
    max_sampled_cells: int = pr.noisy_lonely_max_sampled_cells,
    downsample_min_samples: int = pr.noisy_lonely_downsample_min_samples,
    downsample_min_cell_quantile: float = pr.
    noisy_lonely_downsample_max_cell_quantile,
    downsample_max_cell_quantile: float = pr.
    noisy_lonely_downsample_min_cell_quantile,
    min_gene_total: int = pr.noisy_lonely_min_gene_total,
    min_gene_normalized_variance: float = pr.
    noisy_lonely_min_gene_normalized_variance,
    max_gene_similarity: float = pr.noisy_lonely_max_gene_similarity,
    random_seed: int = pr.random_seed,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect "noisy lonely" genes based on ``what`` (default: {what}) data.

    Return the indices of genes which are "noisy" (have high variance compared to their mean) and
    also "lonely" (have low correlation with all other genes). Such genes should be excluded since
    they will never meaningfully help us compute groups, and will actively cause profiles to be
    considered "deviants".

    Noisy genes have high expression and variance. Lonely genes have no (or low) correlations with
    any other gene. Noisy lonely genes tend to throw off clustering algorithms. In general, such
    algorithms try to group together cells with the same overall biological state. Since the genes
    are lonely, they don't contribute towards this goal. Since they are noisy, they actively hamper
    this, because they make cells which are otherwise similar appear different (just for this lonely
    gene).

    It is therefore useful to explicitly identify, in a pre-processing step, the (few) such genes,
    and exclude them from the rest of the analysis.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Variable (Gene) Annotations
        ``noisy_lonely_genes``
            A boolean mask indicating whether each gene was found to be a "noisy lonely" gene.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the variable names).

    **Computation Parameters**

    1. If we have more than ``max_sampled_cells`` (default: {max_sampled_cells}), pick this number
       of random cells from the data using the ``random_seed``.

    2. If we were specified an ``excluded_genes_mask``, this is the name of a per-variable (gene)
       annotation containing a mask of excluded genes. Get rid of all these excluded genes.

    3. Invoke :py:func:`metacells.tools.downsample.downsample_cells` to downsample the cells to the
       same total number of UMIs, using the ``downsample_min_samples`` (default:
       {downsample_min_samples}), ``downsample_min_cell_quantile`` (default:
       {downsample_min_cell_quantile}), ``downsample_max_cell_quantile`` (default:
       {downsample_max_cell_quantile}) and the ``random_seed`` (default: {random_seed}).

    4. Find "noisy" genes which have a total number of UMIs of at least ``min_gene_total`` (default:
       {min_gene_total}) and a normalized variance of at least ``min_gene_normalized_variance``
       (default: ``min_gene_normalized_variance``).

    5. Cross-correlate the noisy genes.

    6. Find the noisy "lonely" genes whose maximal correlation is at most
       ``max_gene_similarity`` (default: {max_gene_similarity}) with all other genes.
    """
    if max_sampled_cells < adata.n_obs:
        np.random.seed(random_seed)
        cell_indices = np.random.choice(np.arange(adata.n_obs),
                                        size=max_sampled_cells,
                                        replace=False)
        s_data = ut.slice(adata,
                          obs=cell_indices,
                          name=".sampled",
                          top_level=False)
    else:
        s_data = ut.copy_adata(adata, top_level=False)

    track_var: Optional[str] = "sampled_gene_index"

    if excluded_genes_mask is not None:
        results = filter_data(s_data,
                              name="included",
                              top_level=False,
                              track_var=track_var,
                              var_masks=[f"~{excluded_genes_mask}"])
        track_var = None
        assert results is not None
        i_data = results[0]
        assert i_data is not None
    else:
        i_data = s_data

    downsample_cells(
        i_data,
        what,
        downsample_min_samples=downsample_min_samples,
        downsample_min_cell_quantile=downsample_min_cell_quantile,
        downsample_max_cell_quantile=downsample_max_cell_quantile,
        random_seed=random_seed,
    )

    find_high_total_genes(i_data, "downsampled", min_gene_total=min_gene_total)

    results = filter_data(i_data,
                          name="high_total",
                          top_level=False,
                          track_var=track_var,
                          var_masks=["high_total_gene"])
    track_var = None
    assert results is not None
    ht_data = results[0]

    noisy_lonely_genes_mask = np.full(adata.n_vars, False)

    if ht_data is not None:
        ht_genes_count = ht_data.shape[1]

        ht_gene_ht_gene_similarity_frame = compute_var_var_similarity(
            ht_data,
            "downsampled",
            inplace=False,
            reproducible=(random_seed != 0))
        assert ht_gene_ht_gene_similarity_frame is not None

        ht_gene_ht_gene_similarity_matrix = ut.to_numpy_matrix(
            ht_gene_ht_gene_similarity_frame, only_extract=True)
        ht_gene_ht_gene_similarity_matrix = ut.to_layout(
            ht_gene_ht_gene_similarity_matrix,
            layout="row_major",
            symmetric=True)
        np.fill_diagonal(ht_gene_ht_gene_similarity_matrix, -1)

        htv_mask_series = find_high_normalized_variance_genes(
            ht_data,
            "downsampled",
            min_gene_normalized_variance=min_gene_normalized_variance,
            inplace=False)
        assert htv_mask_series is not None
        htv_mask = ut.to_numpy_vector(htv_mask_series)

        htv_genes_count = np.sum(htv_mask)
        assert htv_genes_count <= ht_genes_count

        if htv_genes_count > 0:
            htv_gene_ht_gene_similarity_matrix = ht_gene_ht_gene_similarity_matrix[
                htv_mask, :]
            assert ut.is_layout(htv_gene_ht_gene_similarity_matrix,
                                "row_major")
            assert htv_gene_ht_gene_similarity_matrix.shape == (
                htv_genes_count, ht_genes_count)

            max_similarity_of_htv_genes = ut.max_per(
                htv_gene_ht_gene_similarity_matrix, per="row")
            htvl_mask = max_similarity_of_htv_genes <= max_gene_similarity
            htvl_genes_count = np.sum(htvl_mask)
            ut.log_calc("noisy_lonely_genes_count", htvl_genes_count)

            if htvl_genes_count > 0:
                base_index_of_ht_genes = ut.get_v_numpy(
                    ht_data, "sampled_gene_index")
                base_index_of_htv_genes = base_index_of_ht_genes[htv_mask]
                base_index_of_htvl_genes = base_index_of_htv_genes[htvl_mask]

                noisy_lonely_genes_mask[base_index_of_htvl_genes] = True

                htvl_gene_ht_gene_similarity_matrix = htv_gene_ht_gene_similarity_matrix[
                    htvl_mask, :]
                htvl_gene_ht_gene_similarity_matrix = ut.to_layout(
                    htvl_gene_ht_gene_similarity_matrix, layout="row_major")
                assert htvl_gene_ht_gene_similarity_matrix.shape == (
                    htvl_genes_count, ht_genes_count)

                if ut.logging_calc():
                    i_gene_totals = ut.get_v_numpy(i_data,
                                                   "downsampled",
                                                   sum=True)
                    ht_mask = ut.get_v_numpy(i_data, "high_total_gene")
                    i_total = np.sum(i_gene_totals)
                    htvl_gene_totals = i_gene_totals[ht_mask][htv_mask][
                        htvl_mask]
                    top_similarity_of_htvl_genes = ut.top_per(
                        htvl_gene_ht_gene_similarity_matrix, 10, per="row")
                    for htvl_index, gene_index in enumerate(
                            base_index_of_htvl_genes):
                        gene_name = adata.var_names[gene_index]
                        gene_total = htvl_gene_totals[htvl_index]
                        gene_percent = 100 * gene_total / i_total
                        similar_ht_values = ut.to_numpy_vector(
                            top_similarity_of_htvl_genes[htvl_index, :])  #
                        assert len(similar_ht_values) == ht_genes_count
                        top_similar_ht_mask = similar_ht_values > 0
                        top_similar_ht_values = similar_ht_values[
                            top_similar_ht_mask]
                        top_similar_ht_indices = base_index_of_ht_genes[
                            top_similar_ht_mask]
                        top_similar_ht_names = adata.var_names[
                            top_similar_ht_indices]
                        ut.log_calc(
                            f"- {gene_name}",
                            f"total downsampled UMIs: {gene_total} " +
                            f"({gene_percent:.4g}%), correlated with: " +
                            ", ".join([
                                f"{similar_gene_name}: {similar_gene_value:.4g}"
                                for similar_gene_value, similar_gene_name in
                                reversed(
                                    sorted(
                                        zip(top_similar_ht_values,
                                            top_similar_ht_names)))
                            ]),
                        )

    if ut.logging_calc():
        ut.log_calc("noisy_lonely_gene_names",
                    sorted(list(adata.var_names[noisy_lonely_genes_mask])))

    if inplace:
        ut.set_v_data(adata, "noisy_lonely_gene", noisy_lonely_genes_mask)
        return None

    ut.log_return("noisy_lonely_genes", noisy_lonely_genes_mask)
    return ut.to_pandas_series(noisy_lonely_genes_mask, index=adata.var_names)
Example #11
0
def _keep_candidate(  # pylint: disable=too-many-branches
    adata: AnnData,
    candidate_index: int,
    *,
    data: ut.ProperMatrix,
    cell_sizes: Optional[ut.NumpyVector],
    fraction_of_genes: ut.NumpyVector,
    min_metacell_cells: int,
    min_robust_size: Optional[float],
    min_convincing_size: Optional[float],
    min_convincing_gene_fold_factor: float,
    abs_folds: bool,
    candidates_count: int,
    candidate_cell_indices: ut.NumpyVector,
) -> bool:
    genes_count = data.shape[1]

    if cell_sizes is None:
        candidate_total_size = candidate_cell_indices.size
    else:
        candidate_total_size = np.sum(cell_sizes[candidate_cell_indices])

    if candidate_cell_indices.size < min_metacell_cells:
        if ut.logging_calc():
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: little")
        return False

    if min_robust_size is not None and candidate_total_size >= min_robust_size:
        if ut.logging_calc():
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: robust")
        return True

    if min_convincing_size is None:
        if ut.logging_calc():
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: accepted")
        return True

    if candidate_total_size < min_convincing_size:
        if ut.logging_calc():
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: unconvincing")
        return False

    candidate_data = data[candidate_cell_indices, :]
    candidate_data_of_genes = ut.to_numpy_vector(candidate_data.sum(axis=0))
    assert candidate_data_of_genes.size == genes_count
    candidate_total = np.sum(candidate_data_of_genes)
    candidate_expected_of_genes = fraction_of_genes * candidate_total
    candidate_expected_of_genes += 1
    candidate_data_of_genes += 1
    candidate_data_of_genes /= candidate_expected_of_genes
    np.log2(candidate_data_of_genes, out=candidate_data_of_genes)
    if abs_folds:
        convincing_genes_mask = np.abs(
            candidate_data_of_genes) >= min_convincing_gene_fold_factor
    else:
        convincing_genes_mask = candidate_data_of_genes >= min_convincing_gene_fold_factor
    keep_candidate = bool(np.any(convincing_genes_mask))

    if ut.logging_calc():
        convincing_gene_indices = np.where(convincing_genes_mask)[0]
        if keep_candidate:
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: convincing because:")
            for fold_factor, name in reversed(
                    sorted(
                        zip(candidate_data_of_genes[convincing_gene_indices],
                            adata.var_names[convincing_gene_indices]))):
                ut.log_calc(f"    {name}: {ut.fold_description(fold_factor)}")
        else:
            ut.log_calc(
                f'- candidate: {ut.progress_description(candidates_count, candidate_index, "candidate")} '
                f"cells: {candidate_cell_indices.size} "
                f"size: {candidate_total_size:g} "
                f"is: not convincing")

    return keep_candidate
Example #12
0
def umap_by_distances(
    adata: AnnData,
    distances: Union[str, ut.ProperMatrix] = "umap_distances",
    *,
    prefix: str = "umap",
    k: int = pr.umap_k,
    dimensions: int = 2,
    min_dist: float = pr.umap_min_dist,
    spread: float = pr.umap_spread,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Compute layout for the observations using UMAP, based on a distances matrix.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation-per-observation property
    ``distances`` (default: {distances}), which describes the distance between each two observations
    (cells). The distances must be non-negative, symmetrical, and zero for self-distances (on the
    diagonal).

    **Returns**

    Sets the following annotations in ``adata``:

    Observation (Cell) Annotations
        ``<prefix>_x``, ``<prefix>_y``
            Coordinates for UMAP 2D projection of the observations (if ``dimensions`` is 2).
        ``<prefix>_u``, ``<prefix>_v``, ``<prefix>_3``
            Coordinates for UMAP 3D projection of the observations (if ``dimensions`` is 3).

    **Computation Parameters**

    1. Invoke UMAP to compute a layout of some ``dimensions`` (default: {dimensions}D) using
       ``min_dist`` (default: {min_dist}), ``spread`` (default: {spread}) and ``k`` (default: {k}).
       If the spread is lower than the minimal distance, it is raised. If ``random_seed`` (default:
       {random_seed}) is not zero, then it is passed to UMAP to force the computation to be
       reproducible. However, this means UMAP will use a single-threaded implementation that will be
       slower.
    """
    assert dimensions in (2, 3)
    if isinstance(distances, str):
        distances_matrix = ut.get_oo_proper(adata, distances)
    else:
        distances_matrix = distances
    # UMAP dies when given a dense matrix.
    distances_csr = sp.csr_matrix(distances_matrix)

    spread = max(min_dist, spread)  # UMAP insists.

    # UMAP implementation doesn't know to reduce K by itself.
    n_neighbors = min(k, adata.n_obs - 2)

    random_state: Optional[int] = None
    if random_seed != 0:
        random_state = random_seed

    try:
        coordinates = umap.UMAP(
            metric="precomputed",
            n_neighbors=n_neighbors,
            spread=spread,
            min_dist=min_dist,
            n_components=dimensions,
            random_state=random_state,
        ).fit_transform(distances_csr)
    except ValueError:
        # UMAP implementation doesn't know how to handle too few edges.
        # However, it considers structural zeros as real edges.
        distances_matrix = distances_matrix + 1.0  # type: ignore
        np.fill_diagonal(distances_matrix, 0.0)
        distances_csr = sp.csr_matrix(distances_matrix)
        distances_csr.data -= 1.0
        coordinates = umap.UMAP(
            metric="precomputed",
            n_neighbors=n_neighbors,
            spread=spread,
            min_dist=min_dist,
            random_state=random_state).fit_transform(distances_csr)

    all_sizes = []
    all_coordinates = []
    for axis in range(dimensions):
        axis_coordinates = ut.to_numpy_vector(coordinates[:, axis], copy=True)
        min_coordinate = np.min(coordinates)
        max_coordinate = np.max(coordinates)
        size = max_coordinate - min_coordinate
        assert size > 0
        all_sizes.append(size)
        all_coordinates.append(axis_coordinates)

    if dimensions == 2:
        all_names = ["x", "y"]
    elif dimensions == 3:
        all_names = ["u", "v", "w"]
    else:
        assert False

    order = np.argsort(all_sizes)
    for axis, name in zip(order, all_names):
        ut.set_o_data(adata, f"{prefix}_{name}", all_coordinates[axis])
Example #13
0
def filter_data(  # pylint: disable=dangerous-default-value
    adata: AnnData,
    obs_masks: List[str] = [],
    var_masks: List[str] = [],
    *,
    mask_obs: Optional[str] = None,
    mask_var: Optional[str] = None,
    invert_obs: bool = False,
    invert_var: bool = False,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    name: Optional[str] = None,
    top_level: bool = True,
) -> Optional[Tuple[AnnData, ut.PandasSeries, ut.PandasSeries]]:
    """
    Filter (slice) the data based on previously-computed masks.

    For example, it is useful to discard cell-cycle genes, cells which have too few UMIs for
    meaningful analysis, etc. In general, the "best" filter depends on the data set.

    This function makes it easy to combine different pre-computed per-observation (cell) and
    per-variable (gene) boolean mask annotations into a final overall inclusion mask, and slice the
    data accordingly, while tracking the base index of the cells and genes in the filtered data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    An annotated data containing a subset of the observations (cells) and variables (genes).

    If no observations and/or no variables were selected by the filter, returns ``None``.

    If ``name`` is not specified, the returned data will be unnamed. Otherwise, if the name starts
    with a ``.``, it will be appended to the current name (if any). Otherwise, ``name`` is the new
    name.

    If ``mask_obs`` and/or ``mask_var`` are specified, store the mask of the selected data as a
    per-observation and/or per-variable annotation of the full ``adata``.

    If ``track_obs`` and/or ``track_var`` are specified, store the original indices of the selected
    data as a per-observation and/or per-variable annotation of the result data.

    **Computation Parameters**

    1. Combine the masks in ``obs_masks`` and/or ``var_masks`` using
       :py:func:`metacells.tools.mask.combine_masks` passing it ``invert_obs`` and ``invert_var``,
       and ``mask_obs`` and ``mask_var`` as the ``to`` parameter. If either list of masks is empty,
       use the full mask.

    2. If the obtained masks for either the observations or variables is empty, return ``None``.
       Otherwise, return a slice of the full data containing just the observations and variables
       specified by the final masks.
    """
    if len(obs_masks) == 0:
        obs_mask = np.full(adata.n_obs, True, dtype="bool")
        if mask_obs is not None:
            ut.set_o_data(adata, mask_obs, obs_mask)
    else:
        mask = combine_masks(adata, obs_masks, invert=invert_obs, to=mask_obs)
        if mask is None:
            assert mask_obs is not None
            obs_mask = ut.get_o_numpy(
                adata, mask_obs, formatter=ut.mask_description) > 0
        else:
            obs_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    if len(var_masks) == 0:
        var_mask = np.full(adata.n_vars, True, dtype="bool")
        if mask_var is not None:
            ut.set_o_data(adata, mask_var, var_mask)
    else:
        mask = combine_masks(adata, var_masks, invert=invert_var, to=mask_var)
        if mask is None:
            assert mask_var is not None
            var_mask = ut.get_v_numpy(
                adata, mask_var, formatter=ut.mask_description) > 0
        else:
            var_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    if not np.any(obs_mask) or not np.any(var_mask):
        return None

    fdata = ut.slice(adata,
                     name=name,
                     top_level=top_level,
                     obs=obs_mask,
                     vars=var_mask,
                     track_obs=track_obs,
                     track_var=track_var)

    return (
        fdata,
        ut.to_pandas_series(obs_mask, index=adata.obs_names),
        ut.to_pandas_series(var_mask, index=adata.var_names),
    )