Example #1
def convey_obs_to_obs(
    *,
    adata: AnnData,
    bdata: AnnData,
    property_name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    to_property_name: Optional[str] = None,
    default: Any = None,
) -> None:
    """
    Project the value of a property from one annotated data to another.

    The observation names are expected to be compatible between ``adata`` and ``bdata``. The
    annotated ``adata`` is expected to contain a per-observation (cell) annotation named
    ``property_name``.

    This will generate a new per-observation (cell) annotation in ``bdata``, named
    ``to_property_name`` (by default, the same as ``property_name``), containing the value of the
    observation with the same name in ``adata``. If no such observation exists, the ``default``
    value is used.
    """
    if to_property_name is None:
        to_property_name = property_name

    property_of_from = ut.get_o_numpy(adata,
                                      property_name,
                                      formatter=formatter)
    property_of_name = {
        name: property_of_from[index]
        for index, name in enumerate(adata.obs_names)
    }
    property_of_to = np.array(
        [property_of_name.get(name, default) for name in bdata.obs_names])
    ut.set_o_data(bdata, to_property_name, property_of_to)
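
A hypothetical usage sketch (not part of the original module), assuming plain ``adata.obs`` columns are visible to the ``ut`` accessors; the names and values are illustrative only:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((3, 1), dtype="float32"))
    adata.obs_names = ["cell_a", "cell_b", "cell_c"]
    adata.obs["age"] = [1, 2, 3]

    bdata = AnnData(np.zeros((2, 1), dtype="float32"))
    bdata.obs_names = ["cell_b", "cell_d"]

    # "cell_b" takes its value from adata; "cell_d" falls back to the default.
    convey_obs_to_obs(adata=adata, bdata=bdata, property_name="age", default=-1)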
Example #2
def project_atlas_to_query(
    *,
    adata: AnnData,
    qdata: AnnData,
    weights: ut.ProperMatrix,
    property_name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    to_property_name: Optional[str] = None,
    method: Callable[[ut.Vector, ut.Vector], Any] = ut.highest_weight,
) -> None:
    """
    Project the value of a property from per-observation atlas data to per-observation query data.

    The input annotated ``adata`` is expected to contain a per-observation (cell) annotation named ``property_name``.
    Given the ``weights`` matrix, where each row specifies the weights of the atlas metacells used to project a single
    query metacell, this will generate a new per-observation (metacell) annotation in ``qdata``, named
    ``to_property_name`` (by default, the same as ``property_name``), containing the value of the property aggregated
    over the atlas metacells with non-zero weight for each query metacell.

    The aggregation method (by default, :py:func:`metacells.utilities.computation.highest_weight`) is any function
    taking two arrays, weights and values, and returning a single value.
    """
    if to_property_name is None:
        to_property_name = property_name

    property_of_atlas_metacells = ut.get_o_numpy(adata, property_name, formatter=formatter)
    property_of_query_metacells = []
    for query_metacell_index in range(qdata.n_obs):
        metacell_weights = ut.to_numpy_vector(weights[query_metacell_index, :])
        metacell_mask = metacell_weights > 0
        metacell_weights = ut.to_numpy_vector(metacell_weights[metacell_mask])
        metacell_values = property_of_atlas_metacells[metacell_mask]
        property_of_query_metacells.append(method(metacell_weights, metacell_values))

    ut.set_o_data(qdata, to_property_name, np.array(property_of_query_metacells))
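
An illustrative sketch (not from the original source); the ``weights`` matrix and the "type" annotation are made up, and ``ut.highest_weight`` is assumed to pick the value with the largest total weight:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((3, 1), dtype="float32"))  # atlas metacells
    adata.obs["type"] = ["T", "B", "NK"]

    qdata = AnnData(np.zeros((2, 1), dtype="float32"))  # query metacells
    weights = np.array([[0.7, 0.3, 0.0],   # query metacell 0
                        [0.0, 0.1, 0.9]])  # query metacell 1

    project_atlas_to_query(adata=adata, qdata=qdata, weights=weights, property_name="type")
    # Under these assumptions, qdata.obs["type"] would be ["T", "NK"].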
Example #3
def spread_coordinates(
    adata: AnnData,
    *,
    prefix: str = "umap",
    suffix: str = "spread",
    cover_fraction: float = pr.cover_fraction,
    noise_fraction: float = pr.noise_fraction,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Move UMAP points so they cover some fraction of the plot area without overlapping.

    **Input**

    The input annotated ``adata`` is expected to contain the per-observation properties
    ``<prefix>_x`` and ``<prefix>_y`` (default prefix: {prefix}) which contain the UMAP coordinates.

    **Returns**

    Sets the following annotations in ``adata``:

    Observation (Cell) Annotations
        ``<prefix>_x_<suffix>``, ``<prefix>_y_<suffix>`` (default suffix: {suffix})
            The new coordinates which will be spread out so the points do not overlap and
            cover some fraction of the total plot area.

    **Computation Parameters**

    1. Move the points so they cover ``cover_fraction`` (default: {cover_fraction}) of the total
       plot area. Also add noise of ``noise_fraction`` (default: {noise_fraction}) of the minimal
       distance between the points, using the ``random_seed`` (default: {random_seed}).
    """
    assert 0 < cover_fraction < 1
    assert noise_fraction >= 0

    x_coordinates = ut.get_o_numpy(adata, f"{prefix}_x")
    y_coordinates = ut.get_o_numpy(adata, f"{prefix}_y")

    x_coordinates, y_coordinates = ut.cover_coordinates(
        x_coordinates,
        y_coordinates,
        cover_fraction=cover_fraction,
        noise_fraction=noise_fraction,
        random_seed=random_seed,
    )

    ut.set_o_data(adata, f"{prefix}_x_{suffix}", x_coordinates)
    ut.set_o_data(adata, f"{prefix}_y_{suffix}", y_coordinates)
Example #4
def _results(
    *,
    adata: AnnData,
    rare_module_of_cells: ut.NumpyVector,
    list_of_rare_gene_indices_of_modules: List[List[int]],
    inplace: bool,
) -> Optional[Tuple[ut.PandasFrame, ut.PandasFrame]]:
    """
    Write the rare gene module results into ``adata`` (if ``inplace``), otherwise return them as
    per-observation and per-variable data frames.
    """
    assert np.max(rare_module_of_cells) == len(list_of_rare_gene_indices_of_modules) - 1

    if not inplace:
        var_metrics = ut.to_pandas_frame(index=adata.var_names)

    rare_gene_mask = np.zeros(adata.n_vars, dtype="bool")
    for module_index, rare_gene_indices_of_module in enumerate(
            list_of_rare_gene_indices_of_modules):
        rare_module_gene_mask = np.zeros(adata.n_vars, dtype="bool")
        rare_module_gene_mask[rare_gene_indices_of_module] = True
        property_name = f"rare_gene_module_{module_index}"
        if inplace:
            ut.set_v_data(adata, property_name, rare_module_gene_mask)
        else:
            var_metrics[property_name] = rare_module_gene_mask
            ut.log_return(property_name, rare_module_gene_mask)
        rare_gene_mask |= rare_module_gene_mask

    if inplace:
        ut.set_v_data(adata, "rare_gene", rare_gene_mask)
    else:
        var_metrics["rare_gene"] = rare_gene_mask
        ut.log_return("rare_gene", rare_gene_mask)

    if inplace:
        ut.set_o_data(adata,
                      "cells_rare_gene_module",
                      rare_module_of_cells,
                      formatter=ut.groups_description)
        ut.set_o_data(adata, "rare_cell", rare_module_of_cells >= 0)
        return None

    obs_metrics = ut.to_pandas_frame(index=adata.obs_names)
    ut.log_return("cells_rare_gene_module",
                  rare_module_of_cells,
                  formatter=ut.groups_description)
    ut.log_return("rare_cell", rare_module_of_cells >= 0)

    return obs_metrics, var_metrics
Example #5
def compute_similar_query_metacells(
    adata: AnnData,
    max_projection_fold_factor: float = pr.project_max_projection_fold_factor,
    abs_folds: bool = pr.project_abs_folds,
) -> None:
    """
    Mark query metacells that are similar to their projection on the atlas.

    This does not guarantee the query metacell is "the same as" its projection on the atlas; rather, it means the two
    are sufficiently similar that one can be "reasonably confident" in applying atlas metadata to the query metacell
    based on the projection, which is a much lower bar.

    **Input**

    Annotated query ``adata``, where the observations are query metacells and the variables are genes.

    The data should contain a per-observation-per-variable annotation ``projected_fold`` with the significant
    projection fold factors, as computed by :py:func:`compute_significant_projected_fold_factors`.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations

        ``similar``
            A boolean mask indicating the query metacell is similar to its projection in the atlas.

    **Computation Parameters**

    1. Mark as dissimilar any query metacells which have even one gene whose projection fold is above
       ``max_projection_fold_factor``.
    """
    assert max_projection_fold_factor >= 0

    projected_folds = ut.get_vo_proper(adata,
                                       "projected_fold",
                                       layout="row_major")
    if abs_folds:
        projected_folds = np.abs(projected_folds)  # type: ignore
    high_folds = projected_folds > max_projection_fold_factor  # type: ignore
    high_folds_per_metacell = ut.sum_per(high_folds, per="row")  # type: ignore
    similar_mask = high_folds_per_metacell == 0
    ut.set_o_data(adata, "similar", similar_mask)
Example #6
def convey_obs_to_group(
    *,
    adata: AnnData,
    gdata: AnnData,
    group: str,
    property_name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    to_property_name: Optional[str] = None,
    method: Callable[[ut.Vector], Any] = ut.most_frequent,
) -> None:
    """
    Project the value of a property from per-observation data to per-group data.

    The input annotated ``adata`` is expected to contain a per-observation (cell) annotation named
    ``property_name`` and also a per-observation annotation named ``group``, which identifies (as
    an integer index) the group each observation (cell) belongs to.

    This will generate a new per-observation (group) annotation in ``gdata``, named
    ``to_property_name`` (by default, the same as ``property_name``), containing the aggregated
    value of the property of all the observations (cells) that belong to the group.

    The aggregation method (by default, :py:func:`metacells.utilities.computation.most_frequent`) is
    any function taking an array of values and returning a single value.
    """
    if to_property_name is None:
        to_property_name = property_name

    group_of_obs = ut.get_o_numpy(adata,
                                  group,
                                  formatter=ut.groups_description)
    property_of_obs = ut.get_o_numpy(adata, property_name, formatter=formatter)
    assert gdata.n_obs == (np.max(group_of_obs) + 1)
    property_of_group = np.array([
        method(property_of_obs[group_of_obs == group])
        for group in range(gdata.n_obs)
    ])
    ut.set_o_data(gdata, to_property_name, property_of_group)
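
An illustrative usage sketch (not part of the original module); the "metacell" grouping and the "type" values are made up, and ``ut.most_frequent`` is assumed to return the most common value:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((4, 1), dtype="float32"))
    adata.obs["metacell"] = [0, 0, 1, 1]
    adata.obs["type"] = ["T", "T", "B", "B"]

    gdata = AnnData(np.zeros((2, 1), dtype="float32"))

    convey_obs_to_group(adata=adata, gdata=gdata, group="metacell", property_name="type")
    # Under these assumptions, gdata.obs["type"] would be ["T", "B"].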
Example #7
def convey_group_to_obs(
    *,
    adata: AnnData,
    gdata: AnnData,
    group: str,
    property_name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    to_property_name: Optional[str] = None,
    default: Any = None,
) -> None:
    """
    Project the value of a property from per-group data to per-observation data.

    The input annotated ``gdata`` is expected to contain a per-observation (group) annotation named
    ``property_name``. The input annotated ``adata`` is expected to contain a per-observation
    annotation named ``group`` which identifies the group each observation (cell) belongs to.

    This will generate a new per-observation (cell) annotation in ``adata``, named
    ``to_property_name`` (by default, the same as ``property_name``), containing the value of the
    property for the group it belongs to. If the ``group`` annotation contains a negative number
    instead of a valid group index, the ``default`` value is used.
    """
    if to_property_name is None:
        to_property_name = property_name

    group_of_obs = ut.get_o_numpy(adata,
                                  group,
                                  formatter=ut.groups_description)
    property_of_group = ut.get_o_numpy(gdata,
                                       property_name,
                                       formatter=formatter)
    property_of_obs = np.array([
        default if group < 0 else property_of_group[group]
        for group in group_of_obs
    ])
    ut.set_o_data(adata, to_property_name, property_of_obs)
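
A minimal sketch (not from the original source); names and values are illustrative only:

    import numpy as np
    from anndata import AnnData

    gdata = AnnData(np.zeros((2, 1), dtype="float32"))
    gdata.obs["type"] = ["T", "B"]

    adata = AnnData(np.zeros((3, 1), dtype="float32"))
    adata.obs["metacell"] = [1, 0, -1]  # -1 marks an ungrouped (outlier) cell

    convey_group_to_obs(adata=adata, gdata=gdata, group="metacell",
                        property_name="type", default="Outlier")
    # adata.obs["type"] would be ["B", "T", "Outlier"].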
Example #8
def find_properly_sampled_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    min_cell_total: Optional[int],
    max_cell_total: Optional[int],
    excluded_adata: Optional[AnnData] = None,
    max_excluded_genes_fraction: Optional[float],
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Detect cells with a "proper" amount of ``what`` (default: {what}) data.

    Due to both technical effects and natural variance between cells, the total number of UMIs
    varies from cell to cell. We often would like to work on cells that contain a sufficient number
    of UMIs for meaningful analysis; we sometimes also wish to exclude cells which have "too many"
    UMIs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``properly_sampled_cell``
            A boolean mask indicating whether each cell has a "proper" amount of UMIs.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. Exclude all cells whose total data is less than the ``min_cell_total`` (no default), unless
       it is ``None``.

    2. Exclude all cells whose total data is more than the ``max_cell_total`` (no default), unless
       it is ``None``.

    3. If ``max_excluded_genes_fraction`` (no default) is not ``None``, then ``excluded_adata`` must
       not be ``None`` and should contain just the excluded genes data for each cell. Exclude all
       cells whose sum of the excluded data divided by the total data is more than the specified
       threshold.
    """
    assert (max_excluded_genes_fraction is None) == (excluded_adata is None)

    total_of_cells = ut.get_o_numpy(adata, what, sum=True)

    cells_mask = np.full(adata.n_obs, True, dtype="bool")

    if min_cell_total is not None:
        cells_mask = cells_mask & (total_of_cells >= min_cell_total)

    if max_cell_total is not None:
        cells_mask = cells_mask & (total_of_cells <= max_cell_total)

    if excluded_adata is not None:
        assert max_excluded_genes_fraction is not None
        excluded_data = ut.get_vo_proper(excluded_adata, layout="row_major")
        excluded_of_cells = ut.sum_per(excluded_data, per="row")
        if np.min(total_of_cells) == 0:
            total_of_cells = np.copy(total_of_cells)
            total_of_cells[total_of_cells == 0] = 1
        excluded_fraction = excluded_of_cells / total_of_cells
        cells_mask = cells_mask & (excluded_fraction <=
                                   max_excluded_genes_fraction)

    if inplace:
        ut.set_o_data(adata, "properly_sampled_cell", cells_mask)
        return None

    ut.log_return("properly_sampled_cell", cells_mask)
    return ut.to_pandas_series(cells_mask, index=adata.obs_names)
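
A hypothetical usage sketch (not part of the original module); the UMI counts are illustrative only:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.array([[100, 200],
                              [  1,   2],
                              [500, 400]], dtype="float32"))

    mask = find_properly_sampled_cells(adata,
                                       min_cell_total=10,
                                       max_cell_total=1000,
                                       max_excluded_genes_fraction=None,
                                       inplace=False)
    # Cell totals are 300, 3 and 900, so mask would be [True, False, True].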
Example #9
def combine_masks(  # pylint: disable=too-many-branches,too-many-statements
    adata: AnnData,
    masks: List[str],
    *,
    invert: bool = False,
    to: Optional[str] = None,
) -> Optional[ut.PandasSeries]:
    """
    Combine different pre-computed masks into a final overall mask.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    If ``to`` (default: {to}) is ``None``, returns the computed mask. Otherwise, sets the
    mask as an annotation (per-variable or per-observation depending on the type of the combined masks).

    **Computation Parameters**

    1. For each of the masks in ``masks``, fetch it. Silently ignore missing masks if the name has a
       ``?`` suffix. Invert the mask if the name has a ``~`` prefix. If the name has a ``|`` prefix
       (before the ``~`` prefix, if any), then bitwise-OR the mask into the OR mask, otherwise (or if
       it has a ``&`` prefix), bitwise-AND the mask into the AND mask.

    2. Combine (bitwise-AND) the AND mask and the OR mask into a single mask.

    3. If ``invert`` (default: {invert}), invert the result combined mask.
    """
    assert len(masks) > 0

    per: Optional[str] = None

    and_mask: Optional[ut.NumpyVector] = None
    or_mask: Optional[ut.NumpyVector] = None

    for mask_name in masks:
        log_mask_name = mask_name

        if mask_name[0] == "|":
            is_or = True
            mask_name = mask_name[1:]
        else:
            is_or = False
            if mask_name[0] == "&":
                mask_name = mask_name[1:]

        if mask_name[0] == "~":
            invert_mask = True
            mask_name = mask_name[1:]
        else:
            invert_mask = False

        if mask_name[-1] == "?":
            must_exist = False
            mask_name = mask_name[:-1]
        else:
            must_exist = True

        if mask_name in adata.obs:
            mask_per = "o"
            mask = ut.get_o_numpy(
                adata, mask_name, formatter=ut.mask_description) > 0
        elif mask_name in adata.var:
            mask_per = "v"
            mask = ut.get_v_numpy(
                adata, mask_name, formatter=ut.mask_description) > 0
        else:
            if must_exist:
                raise KeyError(f"unknown mask data: {mask_name}")
            continue

        if mask.dtype != "bool":
            raise ValueError(f"the data: {mask_name} is not a boolean mask")

        if invert_mask:
            mask = ~mask

        if ut.logging_calc():
            ut.log_calc(log_mask_name, mask)

        if per is None:
            per = mask_per
        else:
            if mask_per != per:
                raise ValueError(
                    "mixing per-observation and per-variable masks")

        if is_or:
            if or_mask is None:
                or_mask = mask
            else:
                or_mask = or_mask | mask
        else:
            if and_mask is None:
                and_mask = mask
            else:
                and_mask = and_mask & mask

    if and_mask is not None:
        if or_mask is not None:
            combined_mask = and_mask & or_mask
        else:
            combined_mask = and_mask
    else:
        if or_mask is not None:
            combined_mask = or_mask
        else:
            raise ValueError("no masks to combine")

    if invert:
        combined_mask = ~combined_mask

    if to is None:
        ut.log_return("combined", combined_mask)
        if per == "o":
            return ut.to_pandas_series(combined_mask, index=adata.obs_names)
        assert per == "v"
        return ut.to_pandas_series(combined_mask, index=adata.var_names)

    if per == "o":
        ut.set_o_data(adata, to, combined_mask)
    else:
        ut.set_v_data(adata, to, combined_mask)

    return None
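
A hypothetical usage sketch (not part of the original module); the mask names and values are illustrative only:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((4, 1), dtype="float32"))
    adata.obs["properly_sampled_cell"] = np.array([True, True, False, True])
    adata.obs["rare_cell"] = np.array([False, True, False, False])

    mask = combine_masks(adata, ["properly_sampled_cell", "~rare_cell", "missing_mask?"])
    # AND of the first mask with the inverted second; the third is silently skipped.
    # Under these assumptions the result would be [True, False, False, True].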
Example #10
def umap_by_distances(
    adata: AnnData,
    distances: Union[str, ut.ProperMatrix] = "umap_distances",
    *,
    prefix: str = "umap",
    k: int = pr.umap_k,
    dimensions: int = 2,
    min_dist: float = pr.umap_min_dist,
    spread: float = pr.umap_spread,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Compute layout for the observations using UMAP, based on a distances matrix.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation-per-observation property
    ``distances`` (default: {distances}), which describes the distance between each two observations
    (cells). The distances must be non-negative, symmetrical, and zero for self-distances (on the
    diagonal).

    **Returns**

    Sets the following annotations in ``adata``:

    Observation (Cell) Annotations
        ``<prefix>_x``, ``<prefix>_y``
            Coordinates for UMAP 2D projection of the observations (if ``dimensions`` is 2).
        ``<prefix>_u``, ``<prefix>_v``, ``<prefix>_w``
            Coordinates for UMAP 3D projection of the observations (if ``dimensions`` is 3).

    **Computation Parameters**

    1. Invoke UMAP to compute a layout of some ``dimensions`` (default: {dimensions}D) using
       ``min_dist`` (default: {min_dist}), ``spread`` (default: {spread}) and ``k`` (default: {k}).
       If the spread is lower than the minimal distance, it is raised. If ``random_seed`` (default:
       {random_seed}) is not zero, then it is passed to UMAP to force the computation to be
       reproducible. However, this means UMAP will use a single-threaded implementation that will be
       slower.
    """
    assert dimensions in (2, 3)
    if isinstance(distances, str):
        distances_matrix = ut.get_oo_proper(adata, distances)
    else:
        distances_matrix = distances
    # UMAP dies when given a dense matrix.
    distances_csr = sp.csr_matrix(distances_matrix)

    spread = max(min_dist, spread)  # UMAP insists.

    # UMAP implementation doesn't know to reduce K by itself.
    n_neighbors = min(k, adata.n_obs - 2)

    random_state: Optional[int] = None
    if random_seed != 0:
        random_state = random_seed

    try:
        coordinates = umap.UMAP(
            metric="precomputed",
            n_neighbors=n_neighbors,
            spread=spread,
            min_dist=min_dist,
            n_components=dimensions,
            random_state=random_state,
        ).fit_transform(distances_csr)
    except ValueError:
        # UMAP implementation doesn't know how to handle too few edges.
        # However, it considers structural zeros as real edges.
        distances_matrix = distances_matrix + 1.0  # type: ignore
        np.fill_diagonal(distances_matrix, 0.0)
        distances_csr = sp.csr_matrix(distances_matrix)
        distances_csr.data -= 1.0
        coordinates = umap.UMAP(
            metric="precomputed",
            n_neighbors=n_neighbors,
            spread=spread,
            min_dist=min_dist,
            n_components=dimensions,
            random_state=random_state).fit_transform(distances_csr)

    all_sizes = []
    all_coordinates = []
    for axis in range(dimensions):
        axis_coordinates = ut.to_numpy_vector(coordinates[:, axis], copy=True)
        min_coordinate = np.min(axis_coordinates)
        max_coordinate = np.max(axis_coordinates)
        size = max_coordinate - min_coordinate
        assert size > 0
        all_sizes.append(size)
        all_coordinates.append(axis_coordinates)

    if dimensions == 2:
        all_names = ["x", "y"]
    elif dimensions == 3:
        all_names = ["u", "v", "w"]
    else:
        assert False

    order = np.argsort(all_sizes)
    for axis, name in zip(order, all_names):
        ut.set_o_data(adata, f"{prefix}_{name}", all_coordinates[axis])
Example #11
def filter_data(  # pylint: disable=dangerous-default-value
    adata: AnnData,
    obs_masks: List[str] = [],
    var_masks: List[str] = [],
    *,
    mask_obs: Optional[str] = None,
    mask_var: Optional[str] = None,
    invert_obs: bool = False,
    invert_var: bool = False,
    track_obs: Optional[str] = None,
    track_var: Optional[str] = None,
    name: Optional[str] = None,
    top_level: bool = True,
) -> Optional[Tuple[AnnData, ut.PandasSeries, ut.PandasSeries]]:
    """
    Filter (slice) the data based on previously-computed masks.

    For example, it is useful to discard cell-cycle genes, cells which have too few UMIs for
    meaningful analysis, etc. In general, the "best" filter depends on the data set.

    This function makes it easy to combine different pre-computed per-observation (cell) and
    per-variable (gene) boolean mask annotations into a final overall inclusion mask, and slice the
    data accordingly, while tracking the base index of the cells and genes in the filtered data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes.

    **Returns**

    A tuple of: an annotated data containing the subset of the observations (cells) and variables
    (genes), a pandas series with the per-observation (cell) mask, and a pandas series with the
    per-variable (gene) mask.

    If no observations and/or no variables were selected by the filter, returns ``None``.

    If ``name`` is not specified, the returned data will be unnamed. Otherwise, if the name starts
    with a ``.``, it will be appended to the current name (if any). Otherwise, ``name`` is the new
    name.

    If ``mask_obs`` and/or ``mask_var`` are specified, store the mask of the selected data as a
    per-observation and/or per-variable annotation of the full ``adata``.

    If ``track_obs`` and/or ``track_var`` are specified, store the original indices of the selected
    data as a per-observation and/or per-variable annotation of the result data.

    **Computation Parameters**

    1. Combine the masks in ``obs_masks`` and/or ``var_masks`` using
       :py:func:`metacells.tools.mask.combine_masks` passing it ``invert_obs`` and ``invert_var``,
       and ``mask_obs`` and ``mask_var`` as the ``to`` parameter. If either list of masks is empty,
       use the full mask.

    2. If the obtained mask for either the observations or the variables is empty, return ``None``.
       Otherwise, return a slice of the full data containing just the observations and variables
       specified by the final masks.
    """
    if len(obs_masks) == 0:
        obs_mask = np.full(adata.n_obs, True, dtype="bool")
        if mask_obs is not None:
            ut.set_o_data(adata, mask_obs, obs_mask)
    else:
        mask = combine_masks(adata, obs_masks, invert=invert_obs, to=mask_obs)
        if mask is None:
            assert mask_obs is not None
            obs_mask = ut.get_o_numpy(
                adata, mask_obs, formatter=ut.mask_description) > 0
        else:
            obs_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    if len(var_masks) == 0:
        var_mask = np.full(adata.n_vars, True, dtype="bool")
        if mask_var is not None:
            ut.set_v_data(adata, mask_var, var_mask)
    else:
        mask = combine_masks(adata, var_masks, invert=invert_var, to=mask_var)
        if mask is None:
            assert mask_var is not None
            var_mask = ut.get_v_numpy(
                adata, mask_var, formatter=ut.mask_description) > 0
        else:
            var_mask = ut.to_numpy_vector(mask, only_extract=True) > 0

    if not np.any(obs_mask) or not np.any(var_mask):
        return None

    fdata = ut.slice(adata,
                     name=name,
                     top_level=top_level,
                     obs=obs_mask,
                     vars=var_mask,
                     track_obs=track_obs,
                     track_var=track_var)

    return (
        fdata,
        ut.to_pandas_series(obs_mask, index=adata.obs_names),
        ut.to_pandas_series(var_mask, index=adata.var_names),
    )
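
A minimal usage sketch (not part of the original module); the pre-computed masks are illustrative only:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((4, 3), dtype="float32"))
    adata.obs["properly_sampled_cell"] = np.array([True, True, False, True])
    adata.var["selected_gene"] = np.array([True, False, True])

    result = filter_data(adata,
                         obs_masks=["properly_sampled_cell"],
                         var_masks=["selected_gene"],
                         name=".filtered")
    if result is not None:
        fdata, obs_mask, var_mask = result  # here, 3 cells by 2 genes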
Example #12
def find_deviant_cells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    abs_folds: bool = pr.deviants_abs_folds,
    max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    inplace: bool = True,
) -> Optional[Tuple[ut.PandasSeries, ut.PandasSeries]]:
    """
    Find cells which have significantly different gene expression from the metacells they belong
    to, based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

    Variable (Gene) Annotations
        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant).

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as two pandas series (indexed by the observation and
    variable names).

    **Computation Parameters**

    Intuitively, we first select some fraction of the genes which were least predictable compared to
    the mean expression in the candidate metacells. We then mark as deviants some fraction of the
    cells whose expression of these genes was least predictable compared to the mean expression in
    the candidate metacells. Operationally:

    1. Compute for each candidate metacell the mean fraction of the UMIs expressed by each gene.
       Scale this by each cell's total UMIs to compute the expected number of UMIs for each cell.
       Compute the fold factor log2((actual UMIs + 1) / (expected UMIs + 1)) for each gene for each
       cell.

    2. Ignore all fold factors less than the ``min_gene_fold_factor`` (default: {min_gene_fold_factor}). If
       ``abs_folds`` (default: {abs_folds}), consider the absolute fold factors. Count the number of genes which have a
       fold factor above this minimum in at least one cell. If the fraction of such genes is above ``max_gene_fraction``
       (default: {max_gene_fraction}), then raise the minimal gene fold factor such that at most this fraction of genes
       remain.

    3. For each remaining gene, rank all the cells where it is expressed above the min fold
       factor. Give an artificial maximum rank to all cells with fold factor 0, that is, below the
       minimum.

    4. For each cell, compute the minimal rank it has in any of these genes. That is, if a cell has
       a rank of 1, it means that it has at least one gene whose expression fold factor is the worst
       (highest) across all cells (and is also above the minimum).

    5. Select as deviants all cells whose minimal rank is below the artificial maximum rank, that
       is, which contain at least one gene whose expression fold factor is high relative to the rest
       of the cells. If the fraction of such cells is higher than ``max_cell_fraction`` (default:
       {max_cell_fraction}), reduce the maximal rank such that at most this fraction of cells are
       selected as deviants.
    """
    if max_gene_fraction is None:
        max_gene_fraction = 1

    if max_cell_fraction is None:
        max_cell_fraction = 1

    assert min_gene_fold_factor > 0
    assert 0 < max_gene_fraction <= 1
    assert 0 < max_cell_fraction <= 1

    cells_count, genes_count = adata.shape
    assert cells_count > 0

    candidate_of_cells = ut.get_o_numpy(adata, candidates, formatter=ut.groups_description)

    totals_of_cells = ut.get_o_numpy(adata, what, sum=True)
    assert totals_of_cells.size == cells_count

    data = ut.get_vo_proper(adata, what, layout="row_major")
    list_of_fold_factors, list_of_cell_index_of_rows = _collect_fold_factors(
        data=data,
        candidate_of_cells=candidate_of_cells,
        totals_of_cells=totals_of_cells,
        min_gene_fold_factor=min_gene_fold_factor,
        abs_folds=abs_folds,
    )

    fold_factors = _construct_fold_factors(cells_count, list_of_fold_factors, list_of_cell_index_of_rows)

    if fold_factors is None:
        votes_of_deviant_cells = np.zeros(adata.n_obs, dtype="int32")
        votes_of_deviant_genes = np.zeros(adata.n_vars, dtype="int32")

    else:
        deviant_gene_indices = _filter_genes(
            cells_count=cells_count,
            genes_count=genes_count,
            fold_factors=fold_factors,
            min_gene_fold_factor=min_gene_fold_factor,
            max_gene_fraction=max_gene_fraction,
        )

        deviant_genes_fold_ranks = _fold_ranks(
            cells_count=cells_count, fold_factors=fold_factors, deviant_gene_indices=deviant_gene_indices
        )

        votes_of_deviant_cells, votes_of_deviant_genes = _filter_cells(
            cells_count=cells_count,
            genes_count=genes_count,
            deviant_genes_fold_ranks=deviant_genes_fold_ranks,
            deviant_gene_indices=deviant_gene_indices,
            max_cell_fraction=max_cell_fraction,
        )

    if inplace:
        ut.set_v_data(adata, "gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
        ut.set_o_data(adata, "cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)
        return None

    ut.log_return("gene_deviant_votes", votes_of_deviant_genes, formatter=ut.mask_description)
    ut.log_return("cell_deviant_votes", votes_of_deviant_cells, formatter=ut.mask_description)

    return (
        ut.to_pandas_series(votes_of_deviant_cells, index=adata.obs_names),
        ut.to_pandas_series(votes_of_deviant_genes, index=adata.var_names),
    )
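
A hypothetical usage sketch (not from the original source); the random UMI counts and the candidate assignment are illustrative only:

    import numpy as np
    from anndata import AnnData

    rng = np.random.default_rng(1)
    adata = AnnData(rng.poisson(2.0, size=(200, 50)).astype("float32"))
    adata.obs["candidate"] = np.repeat(np.arange(10), 20)

    find_deviant_cells(adata, min_gene_fold_factor=3.0,
                       max_gene_fraction=0.03, max_cell_fraction=0.25)
    # Votes land in adata.obs["cell_deviant_votes"] and adata.var["gene_deviant_votes"].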
Example #13
def group_obs_data(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    groups: Union[str, ut.Vector],
    name: Optional[str] = None,
) -> Optional[AnnData]:
    """
    Compute new data which has the ``what`` (default: {what}) sum of the observations (cells) for
    each group.

    For example, having computed a metacell index for each cell, compute the per-metacell data
    for further analysis.

    If ``groups`` is a string, it is expected to be the name of a per-observation vector annotation.
    Otherwise it should be a vector. The group indices should be integers, where negative values
    indicate "no group" and non-negative values indicate the index of the group to which each
    observation (cell) belongs.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    An annotated data where each observation is the sum of the group of original observations
    (cells). Observations with a negative group index are discarded. If all observations are
    discarded, return ``None``.

    The new data will contain only:

    * An ``X`` member holding the summed-per-group data.

    * A new ``grouped`` per-observation data which counts, for each group, the number
      of grouped observations summed into it.

    If ``name`` is not specified, the data will be unnamed. Otherwise, if it starts with a ``.``, it
    will be appended to the current name (if any). Otherwise, ``name`` is the new name.
    """
    group_of_cells = ut.get_o_numpy(adata,
                                    groups,
                                    formatter=ut.groups_description)

    data = ut.get_vo_proper(adata, what, layout="row_major")
    results = ut.sum_groups(data, group_of_cells, per="row")
    if results is None:
        return None
    summed_data, cell_counts = results

    gdata = AnnData(summed_data)
    gdata.var_names = adata.var_names

    ut.set_name(gdata, ut.get_name(adata))
    ut.set_name(gdata, name)

    ut.set_o_data(gdata,
                  "grouped",
                  cell_counts,
                  formatter=ut.sizes_description)

    return gdata
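
A minimal usage sketch (not part of the original module); the grouping is illustrative only:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.arange(8, dtype="float32").reshape(4, 2))
    adata.obs["metacell"] = [0, 0, 1, -1]  # the -1 cell is discarded

    gdata = group_obs_data(adata, groups="metacell", name=".metacells")
    if gdata is not None:
        # gdata.X rows hold per-group sums; gdata.obs["grouped"] would be [2, 1].
        print(gdata.obs["grouped"])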
Example #14
def compute_outliers_matches(
    what: Union[str, ut.Matrix] = "__x__",
    *,
    adata: AnnData,
    gdata: AnnData,
    group: Union[str, ut.Vector] = "metacell",
    similar: str = "similar",
    value_normalization: float = pr.outliers_value_normalization,
    reproducible: bool,
) -> None:
    """
    Given an assignment of observations (cells) to groups (metacells), compute for each outlier the "most similar"
    group.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where ``what`` is a
    per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing such a
    matrix.

    In addition, ``gdata`` is assumed to have one observation for each group, and use the same genes as ``adata``.

    **Returns**

    Sets the following in ``adata``:

    Per-Observation (Cell) Annotations

        ``similar`` (default: {similar})
            For each outlier observation (cell), the index of the "most similar" group; this is
            ``-1`` for cells that already belong to a group.

    **Computation Parameters**

    1. Compute the log2 of the fraction of each gene in each of the outlier cells and the group metacells using
       the ``value_normalization`` (default: {value_normalization}).

    2. Cross-correlate each of the outlier cells with each of the group metacells, in a ``reproducible`` manner.
    """
    group_of_cells = ut.get_o_numpy(adata, group)
    outliers_mask = group_of_cells < 0
    odata = ut.slice(adata, obs=outliers_mask)

    outliers_data = ut.get_vo_proper(odata, what, layout="row_major")
    groups_data = ut.get_vo_proper(gdata, what, layout="row_major")

    outliers_fractions = ut.fraction_by(outliers_data, by="row")
    groups_fractions = ut.fraction_by(groups_data, by="row")

    outliers_fractions = ut.to_numpy_matrix(outliers_fractions)
    groups_fractions = ut.to_numpy_matrix(groups_fractions)

    outliers_fractions += value_normalization
    groups_fractions += value_normalization

    outliers_log_fractions = np.log2(outliers_fractions,
                                     out=outliers_fractions)
    groups_log_fractions = np.log2(groups_fractions, out=groups_fractions)

    outliers_groups_correlation = ut.cross_corrcoef_rows(
        outliers_log_fractions,
        groups_log_fractions,
        reproducible=reproducible)
    outliers_similar_group_indices = np.argmax(outliers_groups_correlation,
                                               axis=1)
    assert len(outliers_similar_group_indices) == odata.n_obs

    cells_similar_group_indices = np.full(adata.n_obs, -1, dtype="int32")
    cells_similar_group_indices[outliers_mask] = outliers_similar_group_indices
    ut.set_o_data(adata, similar, cells_similar_group_indices)
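
A hypothetical usage sketch (not from the original source); the random counts and the group assignment are illustrative only:

    import numpy as np
    from anndata import AnnData

    rng = np.random.default_rng(1)
    adata = AnnData(rng.poisson(2.0, size=(6, 4)).astype("float32"))
    adata.obs["metacell"] = [0, 0, 1, 1, -1, -1]  # two outlier cells

    gdata = AnnData(rng.poisson(20.0, size=(2, 4)).astype("float32"))

    compute_outliers_matches(adata=adata, gdata=gdata, group="metacell", reproducible=True)
    # adata.obs["similar"] holds the best-matching group index for each outlier, -1 elsewhere.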
Example #15
def compute_type_compatible_sizes(
    adatas: List[AnnData],
    *,
    size: str = "grouped",
    kind: str = "type",
) -> None:
    """
    Given multiple annotated data of groups, compute a "compatible" size for each one to allow for
    consistent inner normalized variance comparison.

    Since the inner normalized variance quality measure is sensitive to the group (metacell) sizes,
    it is useful to artificially shrink the groups so the sizes will be similar between the compared
    data sets. Assuming each group (metacell) has a type annotation, for each such type, we give
    each one a "compatible" size (less than or equal to its actual size) so that using this reduced
    size will give us comparable measures between all the data sets.

    The "compatible" sizes are chosen such that the density distributions of the sizes in all data
    sets would be as similar to each other as possible.

    .. note::

        This is only effective if the groups are "similar" in size. Using this to compare very coarse
        grouping (a few thousands of cells each) with fine-grained ones (a few dozens of cells each)
        will still yield very different results.

    **Input**

    Several annotated ``adatas`` where each observation is a group. Should contain per-observation
    ``size`` annotation (default: {size}) and ``kind`` annotation (default: {kind}).

    **Returns**

    Sets the following in each ``adata``:

    Per-Observation (group) Annotations:

        ``compatible_size``
            The number of grouped cells in the group to use for computing excess R^2 and inner
            normalized variance.

    **Computation**

    1. For each type, sort the groups (metacells) in increasing number of grouped observations (cells).

    2. Consider the maximal quantile (rank) of the next smallest group (metacell) in each data set.

    3. Compute the minimal number of grouped observations in all the metacells whose quantile is up
       to this maximal quantile.

    4. Use this as the "compatible" size for all these groups, and remove them from consideration.

    5. Loop until all groups are assigned a "compatible" size.
    """
    assert len(adatas) > 0
    if len(adatas) == 1:
        ut.set_o_data(
            adatas[0], "compatible_size",
            ut.get_o_numpy(adatas[0], size, formatter=ut.sizes_description))
        return

    group_sizes_of_data = [
        ut.get_o_numpy(adata, size, formatter=ut.sizes_description)
        for adata in adatas
    ]
    group_types_of_data = [ut.get_o_numpy(adata, kind) for adata in adatas]

    unique_types: Set[Any] = set()
    for group_types in group_types_of_data:
        unique_types.update(group_types)

    compatible_size_of_data = [np.full(adata.n_obs, -1) for adata in adatas]

    groups_count_of_data: List[int] = []
    for type_index, group_type in enumerate(sorted(unique_types)):
        with ut.log_step(
                f"- {group_type}",
                ut.progress_description(len(unique_types), type_index,
                                        "type")):
            sorted_group_indices_of_data = [
                np.argsort(group_sizes)[group_types == group_type]
                for group_sizes, group_types in zip(group_sizes_of_data,
                                                    group_types_of_data)
            ]

            groups_count_of_data = [
                len(sorted_group_indices)
                for sorted_group_indices in sorted_group_indices_of_data
            ]

            ut.log_calc("group_counts", groups_count_of_data)

            def _for_each(value_of_data: List[T]) -> List[T]:
                return [
                    value for groups_count, value in zip(
                        groups_count_of_data, value_of_data)
                    if groups_count > 0
                ]

            groups_count_of_each = _for_each(groups_count_of_data)

            if len(groups_count_of_each) == 0:
                continue

            sorted_group_indices_of_each = _for_each(
                sorted_group_indices_of_data)
            group_sizes_of_each = _for_each(group_sizes_of_data)
            compatible_size_of_each = _for_each(compatible_size_of_data)

            if len(groups_count_of_each) == 1:
                only_sorted_indices = sorted_group_indices_of_each[0]
                compatible_size_of_each[0][only_sorted_indices] = (
                    group_sizes_of_each[0][only_sorted_indices])

            group_quantile_of_each = [
                (np.arange(len(sorted_group_indices)) + 1) /
                len(sorted_group_indices)
                for sorted_group_indices in sorted_group_indices_of_each
            ]

            next_position_of_each = np.full(len(group_quantile_of_each), 0)

            while True:
                next_quantile_of_each = [
                    group_quantile[next_position]
                    for group_quantile, next_position in zip(
                        group_quantile_of_each, next_position_of_each)
                ]
                next_quantile = max(next_quantile_of_each)

                last_position_of_each = next_position_of_each.copy()
                next_position_of_each[:] = [
                    np.sum(group_quantile <= next_quantile)
                    for group_quantile in group_quantile_of_each
                ]

                positions_of_each = [
                    range(last_position, next_position)
                    for last_position, next_position in zip(
                        last_position_of_each, next_position_of_each)
                ]

                sizes_of_each = [
                    group_sizes[sorted_group_indices[positions]]
                    for group_sizes, sorted_group_indices, positions in zip(
                        group_sizes_of_each, sorted_group_indices_of_each,
                        positions_of_each)
                ]

                min_size_of_each = [np.min(sizes) for sizes in sizes_of_each]
                min_size = min(min_size_of_each)

                for sorted_group_indices, positions, compatible_size in zip(
                        sorted_group_indices_of_each, positions_of_each,
                        compatible_size_of_each):
                    compatible_size[sorted_group_indices[positions]] = min_size

                is_done_of_each = [
                    next_position == groups_count
                    for next_position, groups_count in zip(
                        next_position_of_each, groups_count_of_each)
                ]
                if all(is_done_of_each):
                    break

                assert not any(is_done_of_each)

    for adata, compatible_size in zip(adatas, compatible_size_of_data):
        assert np.min(compatible_size) > 0
        ut.set_o_data(adata, "compatible_size", compatible_size)
Example #16
def group_obs_annotation(
    adata: AnnData,
    gdata: AnnData,
    *,
    groups: Union[str, ut.Vector],
    name: str,
    formatter: Optional[Callable[[Any], Any]] = None,
    method: str = "majority",
    min_value_fraction: float = 0.5,
    conflict: Optional[Any] = None,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Transfer per-observation data from the per-observation (cell) ``adata`` to the
    per-group-of-observations (metacells) ``gdata``.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, and the
    ``gdata`` containing the per-metacells summed data.

    **Returns**

    Observations (Cell) Annotations
        ``<name>``
            The per-group-observation annotation computed based on the per-observation annotation.

    If ``inplace`` (default: {inplace}), this is written to the ``gdata``, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the group observation
    names).

    **Computation Parameters**

    1. Iterate on all the observations (groups, metacells) in ``gdata``.

    2. Consider all the cells whose ``groups`` annotation maps them into this group.

    3. Consider all the ``name`` annotation values of these cells.

    4. Compute an annotation value for the whole group of cells using the ``method``. Supported
       methods are:

       ``unique``
            All the values of all the cells in the group are expected to be the same; use this
            unique value for the whole group.

       ``majority``
            Use the most common value across all cells in the group as the value for the whole
            group. If this value doesn't have at least ``min_value_fraction`` (default:
            {min_value_fraction}) of the cells, use the ``conflict`` (default: {conflict}) value
            instead.
    """
    group_of_cells = ut.get_o_numpy(adata,
                                    groups,
                                    formatter=ut.groups_description)
    values_of_cells = ut.get_o_numpy(adata, name, formatter=formatter)

    value_of_groups = np.empty(gdata.n_obs, dtype=values_of_cells.dtype)

    assert method in ("unique", "majority")

    if method == "unique":
        with ut.timed_step(".unique"):
            value_of_groups[group_of_cells] = values_of_cells

    else:
        assert method == "majority"
        with ut.timed_step(".majority"):
            for group_index in range(gdata.n_obs):
                cells_mask = group_of_cells == group_index
                cells_count = np.sum(cells_mask)
                assert cells_count > 0
                values_of_cells_of_group = values_of_cells[cells_mask]
                unique_values_of_group, unique_counts_of_group = np.unique(
                    values_of_cells_of_group, return_counts=True)
                majority_index = np.argmax(unique_counts_of_group)
                majority_count = unique_counts_of_group[majority_index]
                if majority_count / cells_count < min_value_fraction:
                    value_of_groups[group_index] = conflict
                else:
                    majority_value = unique_values_of_group[majority_index]
                    value_of_groups[group_index] = majority_value

    if inplace:
        ut.set_o_data(gdata, name, value_of_groups)
        return None

    return ut.to_pandas_series(value_of_groups, index=gdata.obs_names)
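
An illustrative sketch (not from the original source); the grouping and "type" values are made up:

    import numpy as np
    from anndata import AnnData

    adata = AnnData(np.zeros((4, 1), dtype="float32"))
    adata.obs["metacell"] = [0, 0, 1, 1]
    adata.obs["type"] = ["T", "T", "T", "B"]

    gdata = AnnData(np.zeros((2, 1), dtype="float32"))

    group_obs_annotation(adata, gdata, groups="metacell", name="type",
                         method="majority", min_value_fraction=0.75, conflict="?")
    # Group 0 is 100% "T"; group 1 has no 75% majority, so it gets the conflict value "?".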
Example #17
def dissolve_metacells(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    candidates: Union[str, ut.Vector] = "candidate",
    deviants: Optional[Union[str, ut.Vector]] = "cell_deviant_votes",
    target_metacell_size: float = pr.target_metacell_size,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.dissolve_cell_sizes,
    min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    abs_folds: bool = pr.dissolve_abs_folds,
    inplace: bool = True,
) -> Optional[ut.PandasFrame]:
    """
    Dissolve too-small metacells based on ``what`` (default: {what}) data.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation
    annotation containing such a matrix.

    **Returns**

    Observation (Cell) Annotations
        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment are given a metacell index of
            ``-1``.

        ``dissolved``
            A boolean mask of the cells which were in a dissolved metacell.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas data frame (indexed by the observation names).

    **Computation Parameters**

    1. Mark all cells with non-zero ``deviants`` (default: {deviants}) as "outliers". This can be
       the name of a per-observation (cell) annotation, or an explicit boolean mask of cells, or
       ``None`` if there are no deviant cells to mark.

    2. Any metacell which has fewer cells than the ``min_metacell_cells`` is dissolved.

    3. We are trying to create metacells of size ``target_metacell_size``. Compute the sizes of the
       resulting metacells by summing the ``cell_sizes`` (default: {cell_sizes}). If it is ``None``,
       each has a size of one. These parameters are typically identical to these passed to
       :py:func:`metacells.tools.candidates.compute_candidate_metacells`.

    4. If ``min_robust_size_factor`` (default: {min_robust_size_factor}) is specified, then any
       metacell whose total size is at least ``target_metacell_size * min_robust_size_factor`` is
       preserved.

    5. If ``min_convincing_size_factor`` (default: {min_convincing_size_factor}) is specified, then any remaining
       metacells whose size is at least ``target_metacell_size * min_convincing_size_factor`` are preserved, given they
       contain at least one gene whose fold factor (log2((actual + 1) / (expected + 1))) is at least
       ``min_convincing_gene_fold_factor`` (default: {min_convincing_gene_fold_factor}). If ``abs_folds``, consider the
       absolute fold factors. That is, we only preserve these smaller metacells if there is at least one gene whose
       expression is significantly different from the mean of the population.

    6. Any remaining metacell is dissolved into "outlier" cells.
    """
    dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

    candidate_of_cells = ut.get_o_numpy(adata,
                                        candidates,
                                        formatter=ut.groups_description)
    candidate_of_cells = np.copy(candidate_of_cells)

    deviant_of_cells = ut.maybe_o_numpy(adata,
                                        deviants,
                                        formatter=ut.mask_description)
    if deviant_of_cells is not None:
        deviant_of_cells = deviant_of_cells > 0
    cell_sizes = ut.maybe_o_numpy(adata,
                                  cell_sizes,
                                  formatter=ut.sizes_description)

    if deviant_of_cells is not None:
        candidate_of_cells[deviant_of_cells > 0] = -1
    candidate_of_cells = ut.compress_indices(candidate_of_cells)
    candidates_count = np.max(candidate_of_cells) + 1

    data = ut.get_vo_proper(adata, what, layout="column_major")
    fraction_of_genes = ut.fraction_per(data, per="column")

    if min_robust_size_factor is None:
        min_robust_size = None
    else:
        min_robust_size = target_metacell_size * min_robust_size_factor
    ut.log_calc("min_robust_size", min_robust_size)

    if min_convincing_size_factor is None:
        min_convincing_size = None
    else:
        min_convincing_size = target_metacell_size * min_convincing_size_factor
    ut.log_calc("min_convincing_size", min_convincing_size)

    did_dissolve = False
    for candidate_index in range(candidates_count):
        candidate_cell_indices = np.where(
            candidate_of_cells == candidate_index)[0]
        if not _keep_candidate(
                adata,
                candidate_index,
                data=data,
                cell_sizes=cell_sizes,
                fraction_of_genes=fraction_of_genes,
                min_metacell_cells=min_metacell_cells,
                min_robust_size=min_robust_size,
                min_convincing_size=min_convincing_size,
                min_convincing_gene_fold_factor=min_convincing_gene_fold_factor,
                abs_folds=abs_folds,
                candidates_count=candidates_count,
                candidate_cell_indices=candidate_cell_indices,
        ):
            dissolved_of_cells[candidate_cell_indices] = True
            candidate_of_cells[candidate_cell_indices] = -1
            did_dissolve = True

    if did_dissolve:
        metacell_of_cells = ut.compress_indices(candidate_of_cells)
    else:
        metacell_of_cells = candidate_of_cells

    if inplace:
        ut.set_o_data(adata,
                      "dissolved",
                      dissolved_of_cells,
                      formatter=ut.mask_description)

        ut.set_o_data(adata,
                      "metacell",
                      metacell_of_cells,
                      formatter=ut.groups_description)
        return None

    ut.log_return("dissolved", dissolved_of_cells)
    ut.log_return("metacell",
                  metacell_of_cells,
                  formatter=ut.groups_description)

    obs_frame = ut.to_pandas_frame(index=adata.obs_names)
    obs_frame["dissolved"] = dissolved_of_cells
    obs_frame["metacell"] = metacell_of_cells
    return obs_frame
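
A hypothetical usage sketch (not part of the original module); the counts, candidate assignment and thresholds are illustrative only:

    import numpy as np
    from anndata import AnnData

    rng = np.random.default_rng(1)
    adata = AnnData(rng.poisson(2.0, size=(30, 10)).astype("float32"))
    adata.obs["candidate"] = np.repeat(np.arange(3), 10)

    dissolve_metacells(adata, deviants=None, cell_sizes=None,
                       target_metacell_size=160_000, min_metacell_cells=8)
    # adata.obs["metacell"] keeps the surviving groups (-1 for dissolved cells);
    # adata.obs["dissolved"] marks cells whose candidate metacell was dissolved.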
Example #18
def split_groups(
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    group: str = "metacell",
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = None,
    feature_min_gene_top3: Optional[int] = None,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    random_seed: int = pr.random_seed,
) -> None:
    """
    Split each metacell into two parts using ``what`` (default: {what}) data.

    This creates a new partition of cells into half-metacells, which can be used by
    :py:func:`compute_groups_self_consistency`.

    **Input**

    The input annotated ``adata`` is expected to contain a per-observation annotation named
    ``group`` (default: {group}) which identifies the group (metacells) each observation (cell)
    belongs to.

    **Returns**

    Sets the following annotations in ``adata``:

    Observation (Cell) Annotations
        ``half_<group>``
            The index of the half-group each cell belongs to. This is ``-1`` for ungrouped cells.
            Indices 0 up to the number of groups are the first (low) halves; indices from the
            number of groups up to twice that are the second (high) halves.

    **Computation Parameters**

    1. For each group (metacell), invoke
       :py:func:`metacells.pipeline.direct.compute_direct_metacells` on the observations (cells)
       included in the group, forcing the creation of two half-groups that cover all the group's
       cells. The parameters are passed to this call as-is, setting ``must_complete_cover`` to
       ``True`` (that is, disabling outlier detection), and disabling restrictions on the
       half-group sizes.
    """
    group_of_cells = ut.get_o_numpy(adata, group)
    groups_count = np.max(group_of_cells) + 1
    half_groups_of_cells = np.full(adata.n_obs, -1, dtype="int32")

    @ut.timed_call("split_group")
    def split_group(group_index: int) -> Tuple[ut.NumpyVector, ut.NumpyVector]:
        group_cells_mask = group_of_cells == group_index
        assert np.any(group_cells_mask)
        name = f".{group}-{group_index}/{groups_count}"
        gdata = ut.slice(adata,
                         name=name,
                         top_level=False,
                         obs=group_cells_mask,
                         track_obs="complete_cell_index")
        target_metacell_size = (gdata.n_obs + 1) // 2
        compute_direct_metacells(
            gdata,
            what,
            feature_downsample_min_samples=feature_downsample_min_samples,
            feature_downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
            feature_downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
            feature_min_gene_total=feature_min_gene_total,
            feature_min_gene_top3=feature_min_gene_top3,
            feature_min_gene_relative_variance=feature_min_gene_relative_variance,
            forbidden_gene_names=forbidden_gene_names,
            forbidden_gene_patterns=forbidden_gene_patterns,
            cells_similarity_value_normalization=cells_similarity_value_normalization,
            cells_similarity_log_data=cells_similarity_log_data,
            cells_similarity_method=cells_similarity_method,
            target_metacell_size=target_metacell_size,
            max_cell_size=max_cell_size,
            max_cell_size_factor=max_cell_size_factor,
            cell_sizes=None,
            knn_k=target_metacell_size,
            min_knn_k=target_metacell_size,
            knn_balanced_ranks_factor=knn_balanced_ranks_factor,
            knn_incoming_degree_factor=knn_incoming_degree_factor,
            knn_outgoing_degree_factor=knn_outgoing_degree_factor,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            candidates_cooldown_pass=candidates_cooldown_pass,
            candidates_cooldown_node=candidates_cooldown_node,
            candidates_min_split_size_factor=None,
            candidates_max_merge_size_factor=None,
            candidates_min_metacell_cells=1,
            must_complete_cover=True,
            random_seed=random_seed,
        )
        direct_groups = ut.get_o_numpy(gdata, "metacell")
        zero_count = np.sum(direct_groups == 0)
        one_count = np.sum(direct_groups == 1)
        ut.log_calc(f"group: {group_index} size: {len(direct_groups)} "
                    f"split into: {zero_count} + {one_count}")
        assert zero_count + one_count == len(direct_groups)
        assert zero_count > 0
        assert one_count > 0
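        # Encode each half as a single index: values 0 .. groups_count - 1 are
        # the first halves, groups_count .. 2 * groups_count - 1 the second.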
        return (group_cells_mask, group_index + groups_count * direct_groups)

    for (group_cells_mask,
         group_cells_halves) in ut.parallel_map(split_group, groups_count):
        half_groups_of_cells[group_cells_mask] = group_cells_halves

    ut.set_o_data(adata,
                  f"half_{group}",
                  half_groups_of_cells,
                  formatter=ut.groups_description)
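
# A minimal sketch of decoding the ``half_<group>`` annotation written above:
# index % groups_count recovers the original group and index // groups_count
# tells which half (0 = first, 1 = second). Illustrative only.
import numpy as np

groups_count = 3
half_of_cells = np.array([-1, 0, 3, 1, 4, 2, 5])  # as produced for 3 groups
grouped = half_of_cells >= 0
original_group = half_of_cells[grouped] % groups_count   # [0, 0, 1, 1, 2, 2]
which_half = half_of_cells[grouped] // groups_count      # [0, 1, 0, 1, 0, 1]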
Example #19
0
def renormalize_query_by_atlas(  # pylint: disable=too-many-statements,too-many-branches
    what: str = "__x__",
    *,
    adata: AnnData,
    qdata: AnnData,
    var_annotations: Dict[str, Any],
    layers: Dict[str, Any],
    varp_annotations: Dict[str, Any],
) -> Optional[AnnData]:
    """
    Add an ``ATLASNORM`` pseudo-gene to query metacells data to compensate for the query having filtered out many genes.

    This renormalizes the gene fractions in the query to fit the atlas in case the query has aggressively filtered a
    significant number of genes.

    **Input**

    Annotated query ``qdata`` and atlas ``adata``, where the observations are cells and the variables are genes, where
    ``X`` is a per-variable-per-observation matrix or the name of a per-variable-per-observation annotation containing
    such a matrix.

    **Returns**

    None if no normalization is needed (or possible). Otherwise, a copy of the query metacells data, with an additional
    variable (gene) called ``ATLASNORM``, such that the total number of UMIs for each query metacell
    is as expected given the total number of UMIs of the genes common to the query and the atlas. This is skipped if the
    query and the atlas have exactly the same list of genes, or if the query already contains enough genes
    missing from the atlas so that the total number of UMIs for the query metacells is already at least that expected
    based on the common genes.

    **Computation Parameters**

    1. Computes how many UMIs should be added to each query metacell so that its (total UMIs / total common gene UMIs)
       would be the same as the (total atlas UMIs / total atlas common UMIs). If this is zero (or negative), stop.

    2. Add an ``ATLASNORM`` pseudo-gene to the query with the above amount of UMIs. For each per-variable (gene)
       annotation, add the value specified in ``var_annotations``, whose list of keys must cover the set of
       per-variable annotations in the query data. For each per-observation-per-variable layer, add the value specified
       in ``layers``, whose list of keys must cover the existing layers. For each per-variable-per-variable annotation,
       add the value specified in ``varp_annotations``.
    """
    for name in qdata.var.keys():
        if "|" not in name and name not in var_annotations.keys():
            raise RuntimeError(f"missing default value for variable annotation {name}")

    for name in qdata.layers.keys():
        if name not in layers.keys():
            raise RuntimeError(f"missing default value for layer {name}")

    for name in qdata.varp.keys():
        if name not in varp_annotations.keys():
            raise RuntimeError(f"missing default value for variable-variable {name}")

    if list(qdata.var_names) == list(adata.var_names):
        return None

    query_genes_list = list(qdata.var_names)
    atlas_genes_list = list(adata.var_names)
    common_genes_list = list(sorted(set(qdata.var_names) & set(adata.var_names)))
    query_gene_indices = np.array([query_genes_list.index(gene) for gene in common_genes_list])
    atlas_gene_indices = np.array([atlas_genes_list.index(gene) for gene in common_genes_list])
    common_qdata = ut.slice(qdata, name=".common", vars=query_gene_indices, track_var="full_index")
    common_adata = ut.slice(adata, name=".common", vars=atlas_gene_indices, track_var="full_index")

    assert list(common_qdata.var_names) == list(common_adata.var_names)

    atlas_total_umis_per_metacell = ut.get_o_numpy(adata, what, sum=True)
    atlas_common_umis_per_metacell = ut.get_o_numpy(common_adata, what, sum=True)
    atlas_total_umis = np.sum(atlas_total_umis_per_metacell)
    atlas_common_umis = np.sum(atlas_common_umis_per_metacell)
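    # (total UMIs / common UMIs) - 1 is the fraction of UMIs in atlas-only
    # genes, expressed relative to the UMIs of the genes shared with the query.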
    atlas_disjoint_umis_fraction = atlas_total_umis / atlas_common_umis - 1.0

    ut.log_calc("atlas_total_umis", atlas_total_umis)
    ut.log_calc("atlas_common_umis", atlas_common_umis)
    ut.log_calc("atlas_disjoint_umis_fraction", atlas_disjoint_umis_fraction)

    query_total_umis_per_metacell = ut.get_o_numpy(qdata, what, sum=True)
    query_common_umis_per_metacell = ut.get_o_numpy(common_qdata, what, sum=True)
    query_total_umis = np.sum(query_total_umis_per_metacell)
    query_common_umis = np.sum(query_common_umis_per_metacell)
    query_disjoint_umis_fraction = query_total_umis / query_common_umis - 1.0

    ut.log_calc("query_total_umis", query_total_umis)
    ut.log_calc("query_common_umis", query_common_umis)
    ut.log_calc("query_disjoint_umis_fraction", query_disjoint_umis_fraction)

    if query_disjoint_umis_fraction >= atlas_disjoint_umis_fraction:
        return None

    query_normalization_umis_fraction = atlas_disjoint_umis_fraction - query_disjoint_umis_fraction
    ut.log_calc("query_normalization_umis_fraction", query_normalization_umis_fraction)
    query_normalization_umis_per_metacell = query_common_umis_per_metacell * query_normalization_umis_fraction

    _proper, dense, compressed = ut.to_proper_matrices(qdata.X)

    if dense is None:
        assert compressed is not None
        dense = ut.to_numpy_matrix(compressed)
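    # Append the per-metacell normalization UMIs as one extra column; this
    # column becomes the ATLASNORM pseudo-gene.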
    added = np.concatenate([dense, query_normalization_umis_per_metacell[:, np.newaxis]], axis=1)

    if compressed is not None:
        added = sp.csr_matrix(added)

    assert added.shape[0] == qdata.shape[0]
    assert added.shape[1] == qdata.shape[1] + 1

    ndata = AnnData(added)
    ndata.obs_names = qdata.obs_names
    var_names = list(qdata.var_names)
    var_names.append("ATLASNORM")
    ndata.var_names = var_names

    for name, value in qdata.uns.items():
        ut.set_m_data(ndata, name, value)

    for name, value in qdata.obs.items():
        ut.set_o_data(ndata, name, value)

    for name, value in qdata.obsp.items():
        ut.set_oo_data(ndata, name, value)

    for name in qdata.var.keys():
        if "|" in name:
            continue
        value = ut.get_v_numpy(qdata, name)
        value = np.append(value, [var_annotations[name]])
        ut.set_v_data(ndata, name, value)

    for name in qdata.layers.keys():
        data = ut.get_vo_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_obs, layers[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vo_data(ndata, name, added)

    for name in qdata.varp.keys():
        data = ut.get_vv_proper(qdata, name)
        _proper, dense, compressed = ut.to_proper_matrices(data)

        if dense is None:
            assert compressed is not None
            dense = ut.to_numpy_matrix(compressed)

        values = np.full(qdata.n_vars, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([dense, values[:, np.newaxis]], axis=1)
        values = np.full(qdata.n_vars + 1, varp_annotations[name], dtype=dense.dtype)
        added = np.concatenate([added, values[:, np.newaxis]], axis=0)

        if compressed is not None:
            added = sp.csr_matrix(added)

        ut.set_vv_data(ndata, name, added)

    return ndata
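
# A minimal worked example of the normalization arithmetic above, with made-up
# numbers (illustrative only): if 10% of the atlas UMIs are in genes outside
# the common set while only 2% of the query UMIs are, each query metacell gets
# an ATLASNORM pseudo-gene worth 8% of its common-gene UMIs.
atlas_disjoint = 1_100_000 / 1_000_000 - 1.0              # 0.10
query_disjoint = 510_000 / 500_000 - 1.0                  # 0.02
normalization_fraction = atlas_disjoint - query_disjoint  # 0.08
metacell_common_umis = 20_000
atlasnorm_umis = metacell_common_umis * normalization_fraction  # 1600.0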
Example #20
0
def compute_direct_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "__x__",
    *,
    feature_downsample_min_samples: int = pr.feature_downsample_min_samples,
    feature_downsample_min_cell_quantile: float = pr.feature_downsample_min_cell_quantile,
    feature_downsample_max_cell_quantile: float = pr.feature_downsample_max_cell_quantile,
    feature_min_gene_total: Optional[int] = pr.feature_min_gene_total,
    feature_min_gene_top3: Optional[int] = pr.feature_min_gene_top3,
    feature_min_gene_relative_variance: Optional[float] = pr.feature_min_gene_relative_variance,
    feature_gene_names: Optional[Collection[str]] = None,
    feature_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    forbidden_gene_names: Optional[Collection[str]] = None,
    forbidden_gene_patterns: Optional[Collection[Union[str, Pattern]]] = None,
    cells_similarity_value_normalization: float = pr.cells_similarity_value_normalization,
    cells_similarity_log_data: bool = pr.cells_similarity_log_data,
    cells_similarity_method: str = pr.cells_similarity_method,
    target_metacell_size: float = pr.target_metacell_size,
    max_cell_size: Optional[float] = pr.max_cell_size,
    max_cell_size_factor: Optional[float] = pr.max_cell_size_factor,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.cell_sizes,
    knn_k: Optional[int] = pr.knn_k,
    min_knn_k: Optional[int] = pr.min_knn_k,
    knn_balanced_ranks_factor: float = pr.knn_balanced_ranks_factor,
    knn_incoming_degree_factor: float = pr.knn_incoming_degree_factor,
    knn_outgoing_degree_factor: float = pr.knn_outgoing_degree_factor,
    candidates_cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    candidates_cooldown_pass: float = pr.cooldown_pass,
    candidates_cooldown_node: float = pr.cooldown_node,
    candidates_cooldown_phase: float = pr.cooldown_phase,
    candidates_min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    candidates_max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    candidates_min_metacell_cells: Optional[int] = pr.min_metacell_cells,
    candidates_max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    candidates_min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    deviants_min_gene_fold_factor: float = pr.deviants_min_gene_fold_factor,
    deviants_abs_folds: bool = pr.deviants_abs_folds,
    deviants_max_gene_fraction: Optional[float] = pr.deviants_max_gene_fraction,
    deviants_max_cell_fraction: Optional[float] = pr.deviants_max_cell_fraction,
    dissolve_min_robust_size_factor: Optional[float] = pr.dissolve_min_robust_size_factor,
    dissolve_min_convincing_size_factor: Optional[float] = pr.dissolve_min_convincing_size_factor,
    dissolve_min_convincing_gene_fold_factor: float = pr.dissolve_min_convincing_gene_fold_factor,
    dissolve_min_metacell_cells: int = pr.dissolve_min_metacell_cells,
    random_seed: int = pr.random_seed,
) -> AnnData:
    """
    Directly compute metacells using ``what`` (default: {what}) data.

    This directly computes the metacells on the whole data. Like any method that directly looks at
    the whole data at once, the amount of CPU and memory needed becomes unreasonable when the data
    size grows. Above O(10,000) cells you are much better off using the divide-and-conquer method.

    .. note::

        The current implementation is naive in that it computes the full dense N^2 correlation
        matrix, and only then extracts the sparse graph out of it. We actually need two copies where
        each requires 4 bytes per entry, so for O(100,000) cells, we have storage of
        O(100,000,000,000). In addition, the implementation is serial for the graph clustering
        phases.

        It is possible to mitigate this by fusing the correlations phase and the graph generation
        phase, parallelizing the result, and also (somehow) parallelizing the graph clustering
        phase. This might increase the "reasonable" size for the direct approach to O(100,000).

        We have decided not to invest in this direction since it won't allow us to push the size to
        O(1,000,000) and above. Instead we provide the divide-and-conquer method, which easily
        scales to O(1,000,000) on a single multi-core server, and to "unlimited" size if we further
        enhance the implementation to use a distributed compute cluster of such servers.

    .. todo::

        Should :py:func:`compute_direct_metacells` avoid computing the graph and partition it for a
        very small number of cells?

    **Input**

    The presumably "clean" annotated ``adata``, where the observations are cells and the variables
    are genes, where ``what`` is a per-variable-per-observation matrix or the name of a
    per-variable-per-observation annotation containing such a matrix.

    **Returns**

    Sets the following annotations in ``adata``:

    Variable (Gene) Annotations
        ``high_total_gene``
            A boolean mask of genes with "high" expression level.

        ``high_relative_variance_gene``
            A boolean mask of genes with "high" normalized variance, relative to other genes with a
            similar expression level.

        ``forbidden_gene``
            A boolean mask of genes which are forbidden from being chosen as "feature" genes based
            on their name.

        ``feature_gene``
            A boolean mask of the "feature" genes.

        ``gene_deviant_votes``
            The number of cells each gene marked as deviant (if zero, the gene did not mark any cell
            as deviant). This will be zero for non-"feature" genes.

    Observation (Cell) Annotations
        ``seed``
            The index of the seed metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``candidate``
            The index of the candidate metacell each cell was assigned to. This is ``-1`` for
            non-"clean" cells.

        ``cell_deviant_votes``
            The number of genes that were the reason the cell was marked as deviant (if zero, the
            cell is not deviant).

        ``dissolved``
            A boolean mask of the cells contained in a dissolved metacell.

        ``metacell``
            The integer index of the metacell each cell belongs to. The metacells are in no
            particular order. Cells with no metacell assignment ("outliers") are given a metacell
            index of ``-1``.

        ``outlier``
            A boolean mask of the cells contained in no metacell.

    **Computation Parameters**

    1. Invoke :py:func:`metacells.pipeline.feature.extract_feature_data` to extract "feature" data
       from the clean data, using the
       ``feature_downsample_min_samples`` (default: {feature_downsample_min_samples}),
       ``feature_downsample_min_cell_quantile`` (default: {feature_downsample_min_cell_quantile}),
       ``feature_downsample_max_cell_quantile`` (default: {feature_downsample_max_cell_quantile}),
       ``feature_min_gene_total`` (default: {feature_min_gene_total}), ``feature_min_gene_top3``
       (default: {feature_min_gene_top3}), ``feature_min_gene_relative_variance`` (default:
       {feature_min_gene_relative_variance}), ``feature_gene_names`` (default:
       {feature_gene_names}), ``feature_gene_patterns`` (default: {feature_gene_patterns}),
       ``forbidden_gene_names`` (default: {forbidden_gene_names}), ``forbidden_gene_patterns``
       (default: {forbidden_gene_patterns}) and ``random_seed`` (default: {random_seed}) to make
       this replicable.

    2. Compute the fractions of each variable in each cell, and add the
       ``cells_similarity_value_normalization`` (default: {cells_similarity_value_normalization}) to
       it.

    3. If ``cells_similarity_log_data`` (default: {cells_similarity_log_data}), invoke the
       :py:func:`metacells.utilities.computation.log_data` function to compute the log (base 2) of
       the data.

    4. Invoke :py:func:`metacells.tools.similarity.compute_obs_obs_similarity` to compute the
       similarity between each pair of cells, using the
       ``cells_similarity_method`` (default: {cells_similarity_method}).

    5. Invoke :py:func:`metacells.pipeline.collect.compute_effective_cell_sizes` using
       ``max_cell_size`` (default: {max_cell_size}), ``max_cell_size_factor`` (default:
       {max_cell_size_factor}) and ``cell_sizes`` (default: {cell_sizes}) to get the effective cell
       sizes to use.

    6. Invoke :py:func:`metacells.tools.knn_graph.compute_obs_obs_knn_graph` to compute a
       K-Nearest-Neighbors graph, using the
       ``knn_balanced_ranks_factor`` (default: {knn_balanced_ranks_factor}),
       ``knn_incoming_degree_factor`` (default: {knn_incoming_degree_factor})
       and
       ``knn_outgoing_degree_factor`` (default: {knn_outgoing_degree_factor}).
       If ``knn_k`` (default: {knn_k}) is not specified, then it is
       chosen to be the median number of cells required to reach the target metacell size,
       but at least ``min_knn_k`` (default: {min_knn_k}).

    7. Invoke :py:func:`metacells.tools.candidates.compute_candidate_metacells` to compute
       the candidate metacells, using the
       ``candidates_cell_seeds`` (default: {candidates_cell_seeds}),
       ``min_seed_size_quantile`` (default: {min_seed_size_quantile}),
       ``max_seed_size_quantile`` (default: {max_seed_size_quantile}),
       ``candidates_cooldown_pass`` (default: {candidates_cooldown_pass}),
       ``candidates_cooldown_node`` (default: {candidates_cooldown_node}),
       ``candidates_cooldown_phase`` (default: {candidates_cooldown_phase}),
       ``candidates_min_split_size_factor`` (default: {candidates_min_split_size_factor}),
       ``candidates_max_merge_size_factor`` (default: {candidates_max_merge_size_factor}),
       ``candidates_min_metacell_cells`` (default: {candidates_min_metacell_cells}),
       and
       ``random_seed`` (default: {random_seed})
       to make this replicable. This tries to build metacells of the
       ``target_metacell_size`` (default: {target_metacell_size})
       using the effective cell sizes.

    8. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.deviants.find_deviant_cells` to remove deviants from the candidate
       metacells, using the
       ``deviants_min_gene_fold_factor`` (default: {deviants_min_gene_fold_factor}),
       ``deviants_abs_folds`` (default: {deviants_abs_folds}),
       ``deviants_max_gene_fraction`` (default: {deviants_max_gene_fraction})
       and
       ``deviants_max_cell_fraction`` (default: {deviants_max_cell_fraction}).

    9. Unless ``must_complete_cover`` (default: {must_complete_cover}), invoke
       :py:func:`metacells.tools.dissolve.dissolve_metacells` to dissolve small unconvincing
       metacells, using the same
       ``target_metacell_size`` (default: {target_metacell_size}),
       and the effective cell sizes
       and the
       ``dissolve_min_robust_size_factor`` (default: {dissolve_min_robust_size_factor}),
       ``dissolve_min_convincing_size_factor`` (default: {dissolve_min_convincing_size_factor}),
       ``dissolve_min_convincing_gene_fold_factor`` (default: {dissolve_min_convincing_gene_fold_factor})
       and
       ``dissolve_min_metacell_cells`` (default: {dissolve_min_metacell_cells}).
    """
    fdata = extract_feature_data(
        adata,
        what,
        top_level=False,
        downsample_min_samples=feature_downsample_min_samples,
        downsample_min_cell_quantile=feature_downsample_min_cell_quantile,
        downsample_max_cell_quantile=feature_downsample_max_cell_quantile,
        min_gene_relative_variance=feature_min_gene_relative_variance,
        min_gene_total=feature_min_gene_total,
        min_gene_top3=feature_min_gene_top3,
        forced_gene_names=feature_gene_names,
        forced_gene_patterns=feature_gene_patterns,
        forbidden_gene_names=forbidden_gene_names,
        forbidden_gene_patterns=forbidden_gene_patterns,
        random_seed=random_seed,
    )

    if fdata is None:
        raise ValueError("Empty feature data, giving up")

    effective_cell_sizes, max_cell_size, _cell_scale_factors = compute_effective_cell_sizes(
        adata, max_cell_size=max_cell_size, max_cell_size_factor=max_cell_size_factor, cell_sizes=cell_sizes
    )
    ut.log_calc("effective_cell_sizes", effective_cell_sizes, formatter=ut.sizes_description)

    if max_cell_size is not None:
        if candidates_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * candidates_min_metacell_cells)

        if dissolve_min_metacell_cells is not None:
            target_metacell_size = max(target_metacell_size, max_cell_size * dissolve_min_metacell_cells)

        if candidates_min_metacell_cells is not None or dissolve_min_metacell_cells is not None:
            ut.log_calc("target_metacell_size", target_metacell_size)

    data = ut.get_vo_proper(fdata, "downsampled", layout="row_major")
    data = ut.to_numpy_matrix(data, copy=True)

    if cells_similarity_value_normalization > 0:
        data += cells_similarity_value_normalization

    if cells_similarity_log_data:
        data = ut.log_data(data, base=2)

    if knn_k is None:
        if effective_cell_sizes is None:
            median_cell_size = 1.0
        else:
            median_cell_size = float(np.median(effective_cell_sizes))
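        # By default, aim for one neighbor per cell expected in a metacell:
        # the target metacell size divided by the median effective cell size.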
        knn_k = int(round(target_metacell_size / median_cell_size))
        if min_knn_k is not None:
            knn_k = max(knn_k, min_knn_k)

    if knn_k == 0:
        ut.log_calc("knn_k: 0 (too small, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")
    elif knn_k >= fdata.n_obs:
        ut.log_calc(f"knn_k: {knn_k} (too large, try single metacell)")
        ut.set_o_data(fdata, "candidate", np.full(fdata.n_obs, 0, dtype="int32"), formatter=lambda _: "* <- 0")

    else:
        ut.log_calc("knn_k", knn_k)

        tl.compute_obs_obs_similarity(fdata, data, method=cells_similarity_method, reproducible=(random_seed != 0))

        tl.compute_obs_obs_knn_graph(
            fdata,
            k=knn_k,
            balanced_ranks_factor=knn_balanced_ranks_factor,
            incoming_degree_factor=knn_incoming_degree_factor,
            outgoing_degree_factor=knn_outgoing_degree_factor,
        )

        tl.compute_candidate_metacells(
            fdata,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            cell_seeds=candidates_cell_seeds,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            cooldown_pass=candidates_cooldown_pass,
            cooldown_node=candidates_cooldown_node,
            cooldown_phase=candidates_cooldown_phase,
            min_split_size_factor=candidates_min_split_size_factor,
            max_merge_size_factor=candidates_max_merge_size_factor,
            min_metacell_cells=candidates_min_metacell_cells,
            max_split_min_cut_strength=candidates_max_split_min_cut_strength,
            min_cut_seed_cells=candidates_min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            random_seed=random_seed,
        )

        ut.set_oo_data(adata, "obs_similarity", ut.get_oo_proper(fdata, "obs_similarity"))

        ut.set_oo_data(adata, "obs_outgoing_weights", ut.get_oo_proper(fdata, "obs_outgoing_weights"))

        seed_of_cells = ut.get_o_numpy(fdata, "seed", formatter=ut.groups_description)

        ut.set_o_data(adata, "seed", seed_of_cells, formatter=ut.groups_description)

    candidate_of_cells = ut.get_o_numpy(fdata, "candidate", formatter=ut.groups_description)

    ut.set_o_data(adata, "candidate", candidate_of_cells, formatter=ut.groups_description)

    if must_complete_cover:
        assert np.min(candidate_of_cells) == 0

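        # With a complete cover there are no deviants and nothing is dissolved;
        # every candidate becomes a final metacell, so write trivial annotations.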
        deviant_votes_of_genes = np.zeros(adata.n_vars, dtype="float32")
        deviant_votes_of_cells = np.zeros(adata.n_obs, dtype="float32")
        dissolved_of_cells = np.zeros(adata.n_obs, dtype="bool")

        ut.set_v_data(adata, "gene_deviant_votes", deviant_votes_of_genes, formatter=ut.mask_description)

        ut.set_o_data(adata, "cell_deviant_votes", deviant_votes_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "dissolved", dissolved_of_cells, formatter=ut.mask_description)

        ut.set_o_data(adata, "metacell", candidate_of_cells, formatter=ut.groups_description)

    else:
        tl.find_deviant_cells(
            adata,
            candidates=candidate_of_cells,
            min_gene_fold_factor=deviants_min_gene_fold_factor,
            abs_folds=deviants_abs_folds,
            max_gene_fraction=deviants_max_gene_fraction,
            max_cell_fraction=deviants_max_cell_fraction,
        )

        tl.dissolve_metacells(
            adata,
            candidates=candidate_of_cells,
            target_metacell_size=target_metacell_size,
            cell_sizes=effective_cell_sizes,
            min_robust_size_factor=dissolve_min_robust_size_factor,
            min_convincing_size_factor=dissolve_min_convincing_size_factor,
            min_convincing_gene_fold_factor=dissolve_min_convincing_gene_fold_factor,
            min_metacell_cells=dissolve_min_metacell_cells,
        )

        metacell_of_cells = ut.get_o_numpy(adata, "metacell", formatter=ut.groups_description)

        outlier_of_cells = metacell_of_cells < 0
        ut.set_o_data(adata, "outlier", outlier_of_cells, formatter=ut.mask_description)

    return fdata
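
# A minimal usage sketch (illustrative only; assumes ``adata`` holds "clean"
# cells data prepared by the usual cleaning steps and that the defaults are
# acceptable for its size):
compute_direct_metacells(adata, random_seed=123456)
metacell_of_cells = ut.get_o_numpy(adata, "metacell")
outlier_of_cells = ut.get_o_numpy(adata, "outlier")
# Outliers are exactly the cells whose metacell index is negative.
assert np.all(outlier_of_cells == (metacell_of_cells < 0))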
Example #21
0
def compute_candidate_metacells(  # pylint: disable=too-many-statements,too-many-branches
    adata: AnnData,
    what: Union[str, ut.Matrix] = "obs_outgoing_weights",
    *,
    target_metacell_size: float,
    cell_sizes: Optional[Union[str, ut.Vector]] = pr.candidates_cell_sizes,
    cell_seeds: Optional[Union[str, ut.Vector]] = None,
    min_seed_size_quantile: float = pr.min_seed_size_quantile,
    max_seed_size_quantile: float = pr.max_seed_size_quantile,
    cooldown_pass: float = pr.cooldown_pass,
    cooldown_node: float = pr.cooldown_node,
    cooldown_phase: float = pr.cooldown_phase,
    min_split_size_factor: Optional[float] = pr.candidates_min_split_size_factor,
    max_merge_size_factor: Optional[float] = pr.candidates_max_merge_size_factor,
    min_metacell_cells: Optional[int] = pr.candidates_min_metacell_cells,
    max_split_min_cut_strength: Optional[float] = pr.max_split_min_cut_strength,
    min_cut_seed_cells: Optional[int] = pr.min_cut_seed_cells,
    must_complete_cover: bool = False,
    random_seed: int = 0,
    inplace: bool = True,
) -> Optional[ut.PandasSeries]:
    """
    Assign observations (cells) to (raw, candidate) metacells based on ``what`` data (a weighted
    directed graph).

    These candidate metacells typically go through additional vetting (e.g. deviant detection and
    dissolving too-small metacells) to obtain the final metacells.

    **Input**

    Annotated ``adata``, where the observations are cells and the variables are genes, where
    ``what`` is a per-observation-per-observation matrix where each row is the outgoing weights from
    each observation to the rest, or just the name of a per-observation-per-observation annotation
    containing such a matrix. Typically this matrix will be sparse for efficient processing.

    **Returns**

    Observation (Cell) Annotations
        ``candidate``
            The integer index of the (raw, candidate) metacell each cell belongs to. The metacells
            are in no particular order.

    If ``inplace`` (default: {inplace}), this is written to the data, and the function returns
    ``None``. Otherwise this is returned as a pandas series (indexed by the observation names).

    **Computation Parameters**

    1. We are trying to build metacells of ``target_metacell_size``, using the ``cell_sizes``
       (default: {cell_sizes}) to assign a size for each node (cell). This can be a string name of a
       per-observation annotation or a vector of values.

    2. We start with some assignment of cells to ``cell_seeds`` (default: {cell_seeds}). If no
       seeds are provided, we use :py:func:`choose_seeds` using ``min_seed_size_quantile`` (default:
       {min_seed_size_quantile}) and ``max_seed_size_quantile`` (default: {max_seed_size_quantile})
       to compute them, picking a number of seeds such that the average metacell size would match
       the target.

    3. We optimize the seeds using :py:func:`optimize_partitions` to obtain initial communities by
       maximizing the "stability" of the solution (probability of starting at a random node and
       moving either forward or backward in the graph and staying within the same metacell, divided
       by the probability of staying in the metacell if the edges connected random nodes). We pass
       it the ``cooldown_pass`` (default: {cooldown_pass}) and ``cooldown_node`` (default: {cooldown_node}).

    4. If ``min_split_size_factor`` (default: {min_split_size_factor}) is specified, randomly split
       into two each community whose size is at least
       ``target_metacell_size * min_split_size_factor`` and re-optimize the solution (resulting in
       one additional metacell). Every time we re-optimize, we multiply 1 - ``cooldown_pass`` by
       1 - ``cooldown_phase`` (default: {cooldown_phase}).

    5. If ``max_split_min_cut_strength`` (default: {max_split_min_cut_strength}) is specified, and
       the minimal cut of a candidate is lower, split it into two. If one of the partitions is
       smaller than ``min_cut_seed_cells``, then mark the cells in it as outliers, or if
       ``must_complete_cover`` is ``True``, skip the cut altogether.

    6. If ``max_merge_size_factor`` (default: {max_merge_size_factor}) or ``min_metacell_cells``
       (default: {min_metacell_cells}) are specified, make outliers of the cells of each community
       whose size is at most ``target_metacell_size * max_merge_size_factor`` or which contains
       fewer cells, and re-optimize, which will assign these cells to other metacells (resulting in
       one less metacell). We again apply the ``cooldown_phase`` every time we re-optimize.

    7. Repeat the above steps until all candidate metacells are in the acceptable size range.
    """
    edge_weights = ut.get_oo_proper(adata, what, layout="row_major")
    assert edge_weights.shape[0] == edge_weights.shape[1]
    assert 0.0 < cooldown_pass < 1.0
    assert 0.0 <= cooldown_node <= 1.0
    assert 0.0 < cooldown_phase <= 1.0

    size = edge_weights.shape[0]

    outgoing_edge_weights = ut.mustbe_compressed_matrix(edge_weights)

    assert ut.is_layout(outgoing_edge_weights, "row_major")
    incoming_edge_weights = ut.mustbe_compressed_matrix(ut.to_layout(outgoing_edge_weights, layout="column_major"))
    assert ut.is_layout(incoming_edge_weights, "column_major")

    assert outgoing_edge_weights.data.dtype == "float32"
    assert outgoing_edge_weights.indices.dtype == "int32"
    assert outgoing_edge_weights.indptr.dtype == "int32"
    assert incoming_edge_weights.data.dtype == "float32"
    assert incoming_edge_weights.indices.dtype == "int32"
    assert incoming_edge_weights.indptr.dtype == "int32"

    node_sizes = ut.maybe_o_numpy(adata, cell_sizes, formatter=ut.sizes_description)
    if node_sizes is None:
        node_sizes = np.full(size, 1.0, dtype="float32")
    else:
        node_sizes = node_sizes.astype("float32")
    ut.log_calc("node_sizes", node_sizes, formatter=ut.sizes_description)

    assert target_metacell_size > 0
    max_metacell_size = None
    min_metacell_size = None

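    # The bounds below are inclusive thresholds on the total node size of each
    # community: size >= target * min_split_size_factor triggers a split, and
    # size <= target * max_merge_size_factor triggers cancelling the community.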
    if min_split_size_factor is not None:
        assert min_split_size_factor > 0
        max_metacell_size = ceil(target_metacell_size * min_split_size_factor) - 1
    ut.log_calc("max_metacell_size", max_metacell_size)

    if max_merge_size_factor is not None:
        assert max_merge_size_factor > 0
        min_metacell_size = floor(target_metacell_size * max_merge_size_factor) + 1
    ut.log_calc("min_metacell_size", min_metacell_size)

    target_metacell_cells = max(
        1.0 if min_metacell_cells is None else float(min_metacell_cells),
        float(target_metacell_size / np.mean(node_sizes)),
    )
    ut.log_calc("target_metacell_cells", target_metacell_cells)

    if min_split_size_factor is not None and max_merge_size_factor is not None:
        assert max_merge_size_factor < min_split_size_factor
        assert min_metacell_size is not None
        assert max_metacell_size is not None
        assert min_metacell_size <= max_metacell_size

    community_of_nodes = ut.maybe_o_numpy(adata, cell_seeds, formatter=ut.groups_description)

    if community_of_nodes is not None:
        assert community_of_nodes.dtype == "int32"
    else:
        target_seeds_count = ceil(size / target_metacell_cells)
        ut.log_calc("target_seeds_count", target_seeds_count)

        community_of_nodes = np.full(size, -1, dtype="int32")
        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=target_seeds_count,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    ut.set_o_data(adata, "seed", community_of_nodes, formatter=ut.groups_description)
    community_of_nodes = community_of_nodes.copy()

    np.random.seed(random_seed)

    cold_temperature = 1 - cooldown_pass

    old_score = 1e9
    old_communities = community_of_nodes
    old_small_nodes_count = len(community_of_nodes)
    atomic_candidates: Set[Tuple[int, ...]] = set()
    kept_communities_count = 0

    while True:
        cold_temperature, score = _optimize_split_communities(  #
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            target_metacell_size=target_metacell_size,
            max_metacell_size=max_metacell_size,
            max_split_min_cut_strength=max_split_min_cut_strength,
            min_cut_seed_cells=min_cut_seed_cells,
            must_complete_cover=must_complete_cover,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
            cooldown_pass=cooldown_pass,
            cooldown_node=cooldown_node,
            cooldown_phase=cooldown_phase,
            kept_communities_count=kept_communities_count,
            cold_temperature=cold_temperature,
            atomic_candidates=atomic_candidates,
        )

        small_communities, small_nodes_count = _find_small_communities(
            community_of_nodes=community_of_nodes,
            node_sizes=node_sizes,
            min_metacell_size=min_metacell_size,
            min_metacell_cells=min_metacell_cells,
        )

        small_communities_count = len(small_communities)
        if small_communities_count < 2:
            break

        if (old_small_nodes_count, old_score) <= (small_nodes_count, score):
            ut.logger().debug("is not better, revert")
            community_of_nodes = old_communities
            score = old_score
            ut.log_calc("communities", community_of_nodes, formatter=ut.groups_description)
            ut.log_calc("score", score)
            break

        old_score = score
        old_communities = community_of_nodes.copy()
        old_small_nodes_count = small_nodes_count

        kept_communities_count = _cancel_communities(
            community_of_nodes=community_of_nodes, cancelled_communities=small_communities
        )

        _choose_seeds(
            outgoing_edge_weights=outgoing_edge_weights,
            incoming_edge_weights=incoming_edge_weights,
            seed_of_cells=community_of_nodes,
            max_seeds_count=kept_communities_count + small_communities_count - 1,
            min_seed_size_quantile=min_seed_size_quantile,
            max_seed_size_quantile=max_seed_size_quantile,
            random_seed=random_seed,
        )

    if inplace:
        ut.set_o_data(adata, "candidate", community_of_nodes, formatter=ut.groups_description)
        return None

    if must_complete_cover:
        assert np.min(community_of_nodes) == 0
    else:
        community_of_nodes[community_of_nodes < 0] = -1

    ut.log_return("candidate", community_of_nodes, formatter=ut.groups_description)
    return ut.to_pandas_series(community_of_nodes, index=adata.obs_names)
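
# A minimal worked example of the split/merge bounds above, using made-up
# values (illustrative only): with target_metacell_size = 160_000,
# min_split_size_factor = 2.0 and max_merge_size_factor = 0.5, candidates are
# left alone while their total size stays within [80_001, 319_999].
from math import ceil, floor

max_metacell_size = ceil(160_000 * 2.0) - 1   # 319_999; larger is split
min_metacell_size = floor(160_000 * 0.5) + 1  # 80_001; smaller is cancelled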
Example #22
0
def _apply_annotations(  # pylint: disable=too-many-branches
    adata: AnnData,
    sdata: AnnData,
    per: str,
    annotations: Dict[str, DefaultValues],
    indices: Union[str, ut.Vector],
) -> None:
    full_name = ut.get_name(adata)
    slice_name = ut.get_name(sdata)

    assert per in ("o", "v")

    if per == "o":
        full_data = adata.obs
        full_size = adata.n_obs
        slice_data = sdata.obs
        slice_size = sdata.n_obs
        full_indices = ut.get_o_numpy(sdata, indices)
    else:
        full_data = adata.var
        full_size = adata.n_vars
        slice_data = sdata.var
        slice_size = sdata.n_vars
        full_indices = ut.get_v_numpy(sdata, indices)

    for name, default_values in annotations.items():
        slice_value = slice_data.get(name)
        if slice_value is not None:
            formatter: Optional[Callable[[Any], str]] = None
        else:
            if default_values.slice == Skip or isinstance(
                    default_values.slice, Skip):
                continue

            if default_values.slice == Raise or isinstance(
                    default_values.slice, Raise):
                if slice_name is None:
                    raise KeyError(f"unknown slice data name: {name}")
                raise KeyError(
                    f"unknown slice data: {slice_name} name: {name}")

            slice_value = default_values.slice

            def formatter(_: Any) -> str:
                # pylint: disable=cell-var-from-loop
                return f"{slice_size} <- {slice_value}"

            # pylint: enable=cell-var-from-loop

        full_value = full_data.get(name)
        if full_value is not None:
            ut.unfreeze(full_value)
        else:
            if default_values.full == Skip or isinstance(
                    default_values.full, Skip):
                continue

            if default_values.full == Raise or isinstance(
                    default_values.full, Raise):
                if full_name is None:
                    raise KeyError(f"unknown full data name: {name}")
                raise KeyError(f"unknown full data: {full_name} name: {name}")

            if default_values.full is None:
                full_value = np.full(full_size, None, dtype="float32")
            else:
                full_value = np.full(full_size, default_values.full)

        full_value[full_indices] = slice_value
        if per == "o":
            ut.set_o_data(adata, name, full_value, formatter=formatter)
        else:
            ut.set_v_data(adata, name, full_value, formatter=formatter)
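
# A minimal sketch of the ``annotations`` mapping consumed above (illustrative;
# it assumes ``DefaultValues`` simply carries ``full`` and ``slice`` defaults,
# which is not shown here). Each entry says what to do when the annotation is
# missing: use a concrete default value, ``Skip`` it, or ``Raise`` an error.
annotations = {
    "metacell": DefaultValues(full=-1, slice=Raise),    # hypothetical entry
    "outlier": DefaultValues(full=False, slice=Skip),   # hypothetical entry
}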