Example #1
def magic_impute(adata, knn=5, t=2, verbose=0, **kwargs):
    """Impute the `spliced`/`unspliced` layers with MAGIC, stored as `Ms`/`Mu`."""
    logg.info(
        "To be used carefully. Magic has not yet been tested for this application."
    )
    import magic

    magic_operator = magic.MAGIC(verbose=verbose, knn=knn, t=t, **kwargs)
    adata.layers["Ms"] = magic_operator.fit_transform(adata.layers["spliced"])
    adata.layers["Mu"] = magic_operator.transform(adata.layers["unspliced"])
Example #2
def score_genes_cell_cycle(adata, s_genes=None, g2m_genes=None, copy=False, **kwargs):
    """\
    Score cell cycle genes.

    Calculates scores and assigns a cell cycle phase (G1, S, G2M) using the list of cell
    cycle genes defined in Tirosh et al., 2016 (https://doi.org/10.1126/science.aad0501).

    Parameters
    ----------
    adata
        The annotated data matrix.
    s_genes
        List of genes associated with S phase.
    g2m_genes
        List of genes associated with G2M phase.
    copy
        Copy `adata` or modify it inplace.
    **kwargs
        Are passed to :func:`~scanpy.tl.score_genes`. Passing `ctrl_size` is not
        possible, as it is set to `min(len(s_genes), len(g2m_genes))`.

    Returns
    -------
    S_score: `adata.obs`, dtype `float`
        The score for S phase for each cell.
    G2M_score: `adata.obs`, dtype `float`
        The score for G2M phase for each cell.
    phase: `adata.obs`, dtype `object`
        The cell cycle phase (`S`, `G2M` or `G1`) for each cell.
    """

    logg.info("calculating cell cycle phase")
    from scanpy.tools._score_genes import score_genes

    adata = adata.copy() if copy else adata

    s_genes_, g2m_genes_ = get_phase_marker_genes(adata)
    if s_genes is None:
        s_genes = s_genes_
    if g2m_genes is None:
        g2m_genes = g2m_genes_

    ctrl_size = min(len(s_genes), len(g2m_genes))

    kwargs.update({"ctrl_size": ctrl_size})
    score_genes(adata, gene_list=s_genes, score_name="S_score", **kwargs)
    score_genes(adata, gene_list=g2m_genes, score_name="G2M_score", **kwargs)
    scores = adata.obs[["S_score", "G2M_score"]]

    phase = pd.Series("S", index=scores.index)  # default phase is S
    phase[scores.G2M_score > scores.S_score] = "G2M"  # G2M, if G2M is higher than S
    phase[np.all(scores < 0, axis=1)] = "G1"  # G1, if all scores are negative

    adata.obs["phase"] = phase
    logg.hint("    'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)")
    return adata if copy else None
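A minimal usage sketch (assumes log-normalized expression in `adata` and marker genes resolvable by `get_phase_marker_genes`):

score_genes_cell_cycle(adata)  # uses the Tirosh et al. S/G2M marker lists
print(adata.obs["phase"].value_counts())  # distribution of G1/S/G2M calls
print(adata.obs[["S_score", "G2M_score"]].head())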
Example #3
def remove_duplicate_cells(adata):
    """Remove cells with identical PCA representation, then refresh neighbors."""
    if "X_pca" not in adata.obsm.keys():
        pca(adata)
    idx_duplicates = get_duplicate_cells(adata)
    if len(idx_duplicates) > 0:
        mask = np.ones(adata.n_obs, bool)
        mask[idx_duplicates] = 0  # drop later copies, keep first occurrences
        logg.info("Removed", len(idx_duplicates), "duplicate cells.")
        adata._inplace_subset_obs(mask)
        if "neighbors" in adata.uns.keys():
            neighbors(adata)  # the KNN graph is invalid after subsetting
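The helper `get_duplicate_cells` is not shown here; a hypothetical sketch of the idea, assuming duplicates are detected as repeated rows of the PCA representation (not necessarily the library's actual implementation):

import numpy as np
import pandas as pd

def get_duplicate_cells_sketch(adata):
    # Flag rows of X_pca identical to an earlier row; the first occurrence
    # is kept, later copies are reported as duplicates to be masked out.
    df = pd.DataFrame(adata.obsm["X_pca"])
    return np.where(df.duplicated(keep="first").values)[0]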
Example #4
def filter_genes_dispersion(
    data,
    flavor="seurat",
    min_disp=None,
    max_disp=None,
    min_mean=None,
    max_mean=None,
    n_bins=20,
    n_top_genes=None,
    retain_genes=None,
    log=True,
    subset=True,
    copy=False,
):
    """Extract highly variable genes.

    Expects non-logarithmized data.
    The normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression, highly
    variable genes are selected.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    flavor : {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data - the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized data
        - in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    min_mean=0.0125, max_mean=3, min_disp=0.5, max_disp=`None` : `float`, optional
        If `n_top_genes` is not `None`, these cutoffs for the means and the
        normalized dispersions are ignored.
    n_bins : `int` (default: 20)
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    n_top_genes : `int` or `None` (default: `None`)
        Number of highly-variable genes to keep.
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds.
    log : `bool`, optional (default: `True`)
        Use the logarithm of the mean to variance ratio.
    subset : `bool`, optional (default: `True`)
        Keep highly-variable genes only (if True) else write a bool
        array for highly-variable genes while keeping all genes.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    If an AnnData `adata` is passed, returns or updates `adata` depending on \
    `copy`. It filters `adata` and adds the annotations `means`, `dispersions`,
    `dispersions_norm` and `highly_variable` to `adata.var`.
    """

    adata = data.copy() if copy else data
    _set_initial_size(adata)

    mean, var = materialize_as_ndarray(get_mean_var(adata.X))

    if n_top_genes is not None and adata.n_vars < n_top_genes:
        logg.info("Skip filtering by dispersion since number "
                  "of variables is less than `n_top_genes`.")
    else:
        if flavor == "svr":
            from sklearn.svm import SVR

            log_mu = np.log2(mean)
            log_cv = np.log2(np.sqrt(var) / mean)
            clf = SVR(gamma=150.0 / len(mean))
            clf.fit(log_mu[:, None], log_cv)
            score = log_cv - clf.predict(log_mu[:, None])
            nth_score = np.sort(score)[::-1][n_top_genes - 1]
            adata.var["highly_variable"] = score >= nth_score

        else:
            cut_disp = [min_disp, max_disp, min_mean, max_mean]
            if n_top_genes is not None and not all(x is None
                                                   for x in cut_disp):
                logg.info(
                    "If you pass `n_top_genes`, all cutoffs are ignored.")
            if min_disp is None:
                min_disp = 0.5
            if max_disp is None:
                max_disp = np.inf
            if min_mean is None:
                min_mean = 0.0125
            if max_mean is None:
                max_mean = 3

            mean[mean == 0] = 1e-12  # set entries equal to zero to small value
            dispersion = var / mean
            if log:  # logarithmized mean as in Seurat
                dispersion[dispersion == 0] = np.nan
                dispersion = np.log(dispersion)
                mean = np.log1p(mean)

            # all of the following quantities are "per-gene" here
            df = pd.DataFrame()
            df["mean"], df["dispersion"] = mean, dispersion

            if flavor == "seurat":
                df["mean_bin"] = pd.cut(df["mean"], bins=n_bins)
                disp_grouped = df.groupby("mean_bin")["dispersion"]
                disp_mean_bin = disp_grouped.mean()
                disp_std_bin = disp_grouped.std(ddof=1)

                # retrieve genes that have nan std (i.e. a single gene fell in a bin)
                # and implicitly set them to have a normalized dispersion of 1
                one_gene_per_bin = disp_std_bin.isnull()

                disp_std_bin[one_gene_per_bin] = disp_mean_bin[
                    one_gene_per_bin].values
                disp_mean_bin[one_gene_per_bin] = 0

                # normalized dispersion
                mu = disp_mean_bin[df["mean_bin"].values].values
                std = disp_std_bin[df["mean_bin"].values].values
                df["dispersion_norm"] = ((df["dispersion"] - mu) /
                                         std).fillna(0)
            elif flavor == "cell_ranger":
                from statsmodels import robust

                cut = np.percentile(df["mean"], np.arange(10, 105, 5))
                df["mean_bin"] = pd.cut(df["mean"], np.r_[-np.inf, cut,
                                                          np.inf])
                disp_grouped = df.groupby("mean_bin")["dispersion"]
                disp_median_bin = disp_grouped.median()
                with warnings.catch_warnings(
                ):  # ignore warning: "Mean of empty slice"
                    warnings.simplefilter("ignore")
                    disp_mad_bin = disp_grouped.apply(robust.mad)
                mu = disp_median_bin[df["mean_bin"].values].values
                std = disp_mad_bin[df["mean_bin"].values].values
                df["dispersion_norm"] = (np.abs(df["dispersion"] - mu) /
                                         std).fillna(0)
            else:
                raise ValueError(
                    '`flavor` needs to be "seurat" or "cell_ranger"')
            dispersion_norm = df["dispersion_norm"].values
            if n_top_genes is not None:
                cut_off = df["dispersion_norm"].nlargest(
                    n_top_genes).values[-1]
                gene_subset = df["dispersion_norm"].values >= cut_off
            else:
                gene_subset = np.logical_and.reduce((
                    mean > min_mean,
                    mean < max_mean,
                    dispersion_norm > min_disp,
                    dispersion_norm < max_disp,
                ))

            adata.var["means"] = df["mean"].values
            adata.var["dispersions"] = df["dispersion"].values
            adata.var["dispersions_norm"] = df["dispersion_norm"].values
            adata.var["highly_variable"] = gene_subset

        if subset:
            gene_subset = adata.var["highly_variable"]
            if retain_genes is not None:
                if isinstance(retain_genes, str):
                    retain_genes = [retain_genes]
                gene_subset = gene_subset | adata.var_names.isin(retain_genes)
            adata._inplace_subset_var(gene_subset)

        logg.info(f"Extracted {np.sum(gene_subset)} highly variable genes.")
    return adata if copy else None
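A minimal usage sketch (hypothetical `adata` holding non-logarithmized counts):

filter_genes_dispersion(adata, n_top_genes=2000, flavor="seurat")
print(adata.n_vars)  # subset to the ~2000 most variable genes (plus any retain_genes)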
Example #5
def _paga_graph(
    adata,
    ax,
    solid_edges=None,
    dashed_edges=None,
    adjacency_solid=None,
    adjacency_dashed=None,
    transitions=None,
    threshold=None,
    root=0,
    colors=None,
    labels=None,
    fontsize=None,
    fontweight=None,
    fontoutline=None,
    text_kwds=None,
    node_size_scale=1.0,
    node_size_power=0.5,
    edge_width_scale=1.0,
    normalize_to_color="reference",
    title=None,
    pos=None,
    cmap=None,
    frameon=True,
    min_edge_width=None,
    max_edge_width=None,
    export_to_gexf=False,
    colorbar=None,
    use_raw=True,
    cb_kwds=None,
    single_component=False,
    arrowsize=30,
):
    """scanpy/_paga_graph with some adjustments for directional graphs.
    To be moved back to scanpy once finalized.
    """
    import warnings
    from pathlib import Path

    import networkx as nx
    import pandas as pd
    import scipy.sparse  # also exposes scipy.sparse.csgraph used below
    from collections import abc as cabc  # for the `Mapping` checks below
    from pandas.api.types import is_categorical_dtype

    from matplotlib import patheffects
    from matplotlib.colors import is_color_like

    from scanpy.plotting._utils import add_colors_for_categorical_sample_annotation

    node_labels = labels  # rename for clarity
    if (node_labels is not None and isinstance(node_labels, str)
            and node_labels != adata.uns["paga"]["groups"]):
        raise ValueError(
            f"Provide a list of group labels for the PAGA "
            f"groups {adata.uns['paga']['groups']}, not {node_labels}.")
    groups_key = adata.uns["paga"]["groups"]
    if node_labels is None:
        node_labels = adata.obs[groups_key].cat.categories

    if (colors is None or colors == groups_key) and groups_key is not None:
        if f"{groups_key}_colors" not in adata.uns or len(
                adata.obs[groups_key].cat.categories) != len(
                    adata.uns[f"{groups_key}_colors"]):
            add_colors_for_categorical_sample_annotation(adata, groups_key)
        colors = adata.uns[f"{groups_key}_colors"]

    nx_g_solid = nx.Graph(adjacency_solid)
    if dashed_edges is not None:
        nx_g_dashed = nx.Graph(adjacency_dashed)

    # convert pos to array and dict
    if not isinstance(pos, (Path, str)):
        pos_array = pos
    else:
        pos = Path(pos)
        if pos.suffix != ".gdf":
            raise ValueError(
                "Currently only supporting reading positions from .gdf files.")
        s = ""  # read the node definition from the file
        with pos.open() as f:
            f.readline()
            for line in f:
                if line.startswith("edgedef>"):
                    break
                s += line
        from io import StringIO

        df = pd.read_csv(StringIO(s), header=None)
        pos_array = df[[4, 5]].values

    # convert to dictionary
    pos = {n: [p[0], p[1]] for n, p in enumerate(pos_array)}

    # uniform color
    if isinstance(colors, str) and is_color_like(colors):
        colors = [colors for c in range(len(node_labels))]

    # color degree of the graph
    if isinstance(colors, str) and colors.startswith("degree"):
        # see also tools.paga.paga_degrees
        if colors == "degree_dashed":
            colors = [d for _, d in nx_g_dashed.degree(weight="weight")]
        elif colors == "degree_solid":
            colors = [d for _, d in nx_g_solid.degree(weight="weight")]
        else:
            raise ValueError(
                '`degree` must be either "degree_dashed" or "degree_solid".')
        colors = (np.array(colors) - np.min(colors)) / (np.max(colors) -
                                                        np.min(colors))

    # plot gene expression
    var_names = adata.var_names if adata.raw is None else adata.raw.var_names
    if isinstance(colors, str) and colors in var_names:
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for cat in cats:
            subset = (cat == adata.obs[groups_key]).values
            if adata.raw is not None and use_raw:
                adata_gene = adata.raw[:, colors]
            else:
                adata_gene = adata[:, colors]
            x_color.append(np.mean(adata_gene.X[subset]))
        colors = x_color

    # plot continuous annotation
    if (isinstance(colors, str) and colors in adata.obs
            and not is_categorical_dtype(adata.obs[colors])):
        x_color = []
        cats = adata.obs[groups_key].cat.categories
        for cat in cats:
            subset = (cat == adata.obs[groups_key]).values
            x_color.append(adata.obs.loc[subset, colors].mean())
        colors = x_color

    # plot categorical annotation
    if (isinstance(colors, str) and colors in adata.obs
            and is_categorical_dtype(adata.obs[colors])):
        from scanpy._utils import (
            compute_association_matrix_of_groups,
            get_associated_colors_of_groups,
        )

        norm = "reference" if normalize_to_color else "prediction"
        _, asso_matrix = compute_association_matrix_of_groups(
            adata, prediction=groups_key, reference=colors, normalization=norm)
        add_colors_for_categorical_sample_annotation(adata, colors)
        asso_colors = get_associated_colors_of_groups(
            adata.uns[f"{colors}_colors"], asso_matrix)
        colors = asso_colors

    if len(colors) < len(node_labels):
        raise ValueError(
            "`color` list needs to be at least as long as `groups`/`node_labels` list."
        )

    # count number of connected components
    n_components, labels = scipy.sparse.csgraph.connected_components(
        adjacency_solid)
    if n_components > 1 and single_component:
        component_sizes = np.bincount(labels)
        largest_component = np.where(
            component_sizes == component_sizes.max())[0][0]
        adjacency_solid = adjacency_solid.tocsr()[labels ==
                                                  largest_component, :]
        adjacency_solid = adjacency_solid.tocsc()[:,
                                                  labels == largest_component]
        colors = np.array(colors)[labels == largest_component]
        node_labels = np.array(node_labels)[labels == largest_component]
        cats_dropped = (adata.obs[groups_key].cat.categories[
            labels != largest_component].tolist())
        logg.info(f"Restricting graph to largest connected component "
                  f"by dropping categories\n{cats_dropped}")
        nx_g_solid = nx.Graph(adjacency_solid)
        if dashed_edges is not None:
            raise ValueError(
                "`single_component` only if `dashed_edges` is `None`.")

    # groups sizes
    if groups_key is not None and f"{groups_key}_sizes" in adata.uns:
        groups_sizes = adata.uns[f"{groups_key}_sizes"]
    else:
        groups_sizes = np.ones(len(node_labels))
    base_scale_scatter = 2000
    base_pie_size = (base_scale_scatter /
                     (np.sqrt(adjacency_solid.shape[0]) + 10) *
                     node_size_scale)
    median_group_size = np.median(groups_sizes)
    groups_sizes = base_pie_size * np.power(groups_sizes / median_group_size,
                                            node_size_power)

    # edge widths
    base_edge_width = edge_width_scale * 5 * rcParams["lines.linewidth"]

    # draw dashed edges
    if dashed_edges is not None:
        widths = [x[-1]["weight"] for x in nx_g_dashed.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if max_edge_width is not None:
            widths = np.clip(widths, None, max_edge_width)
        nx.draw_networkx_edges(
            nx_g_dashed,
            pos,
            ax=ax,
            width=widths,
            edge_color="grey",
            style="dashed",
            alpha=0.5,
        )

    # draw solid edges
    if transitions is None:
        widths = [x[-1]["weight"] for x in nx_g_solid.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nx.draw_networkx_edges(nx_g_solid,
                                   pos,
                                   ax=ax,
                                   width=widths,
                                   edge_color="black")

    # draw directed edges
    else:
        adjacency_transitions = adata.uns["paga"][transitions].copy()
        if threshold is None:
            threshold = 0.01
        adjacency_transitions.data[adjacency_transitions.data < threshold] = 0
        adjacency_transitions.eliminate_zeros()
        g_dir = nx.DiGraph(adjacency_transitions.T)
        widths = [x[-1]["weight"] for x in g_dir.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        nx.draw_networkx_edges(
            g_dir,
            pos,
            ax=ax,
            width=widths,
            edge_color="k",
            arrowsize=arrowsize,
            arrowstyle="-|>",
            node_size=groups_sizes,
        )

    if export_to_gexf:
        if isinstance(colors[0], tuple):
            from matplotlib.colors import rgb2hex

            colors = [rgb2hex(c) for c in colors]
        for count, n in enumerate(nx_g_solid.nodes()):
            nx_g_solid.nodes[count]["label"] = f"{node_labels[count]}"
            nx_g_solid.nodes[count]["color"] = f"{colors[count]}"
            nx_g_solid.nodes[count]["viz"] = dict(position=dict(
                x=1000 * pos[count][0], y=1000 * pos[count][1], z=0))
        filename = settings.writedir / "paga_graph.gexf"
        logg.warn(f"exporting to {filename}")
        settings.writedir.mkdir(parents=True, exist_ok=True)
        nx.write_gexf(nx_g_solid, filename)

    ax.set_frame_on(frameon)
    ax.set_xticks([])
    ax.set_yticks([])

    if fontsize is None:
        fontsize = rcParams["legend.fontsize"]
    text_kwds = {} if text_kwds is None else dict(text_kwds)  # guard **text_kwds below
    if fontoutline is not None:
        text_kwds["path_effects"] = [
            patheffects.withStroke(linewidth=fontoutline, foreground="w")
        ]
    # usual scatter plot
    if not isinstance(colors[0], cabc.Mapping):
        n_groups = len(pos_array)
        sct = ax.scatter(
            pos_array[:, 0],
            pos_array[:, 1],
            s=groups_sizes,
            cmap=cmap,
            c=colors[:n_groups],
            edgecolors="face",
            zorder=2,
        )
        for count, group in enumerate(node_labels):
            ax.text(
                pos_array[count, 0],
                pos_array[count, 1],
                group,
                verticalalignment="center",
                horizontalalignment="center",
                size=fontsize,
                fontweight=fontweight,
                **text_kwds,
            )
    # else pie chart plot
    else:

        def transform_ax_coords(a, b):
            return trans2(trans((a, b)))

        # start with this dummy plot... otherwise strange behavior
        sct = ax.scatter(
            pos_array[:, 0],
            pos_array[:, 1],
            alpha=0,
            linewidths=0,
            c="w",
            edgecolors="face",
            s=groups_sizes,
            cmap=cmap,
        )
        bboxes = getbb(sct,
                       ax)  # bounding boxes around the scatterplot markers

        trans = ax.transData.transform
        bbox = ax.get_position().get_points()
        ax_x_min = bbox[0, 0]
        ax_x_max = bbox[1, 0]
        ax_y_min = bbox[0, 1]
        ax_y_max = bbox[1, 1]
        ax_len_x = ax_x_max - ax_x_min
        ax_len_y = ax_y_max - ax_y_min
        trans2 = ax.transAxes.inverted().transform
        pie_axs = []
        for count, (n, box) in enumerate(zip(nx_g_solid.nodes(), bboxes)):
            x0, y0 = transform_ax_coords(box.x0, box.y0)
            x1, y1 = transform_ax_coords(box.x1, box.y1)
            pie_size = np.sqrt(((x0 - x1)**2) + ((y0 - y1)**2))

            xa, ya = transform_ax_coords(*pos[n])
            xa = ax_x_min + (xa - pie_size / 2) * ax_len_x
            ya = ax_y_min + (ya - pie_size / 2) * ax_len_y
            # clip, the fruchterman layout sometimes places below figure
            if ya < 0:
                ya = 0
            if xa < 0:
                xa = 0
            pie_axs.append(
                pl.axes([xa, ya, pie_size * ax_len_x, pie_size * ax_len_y],
                        frameon=False))
            pie_axs[count].set_xticks([])
            pie_axs[count].set_yticks([])
            if not isinstance(colors[count], cabc.Mapping):
                raise ValueError(
                    f"{colors[count]} is neither a dict of valid "
                    "matplotlib colors nor a valid matplotlib color.")
            color_single = colors[count].keys()
            fracs = [colors[count][c] for c in color_single]
            if sum(fracs) < 1:
                color_single = list(color_single)
                color_single.append("grey")
                fracs.append(1 - sum(fracs))
            wedgeprops = dict(linewidth=0, edgecolor="k", antialiased=True)
            pie_axs[count].pie(fracs,
                               colors=color_single,
                               wedgeprops=wedgeprops,
                               normalize=True)
        if node_labels is not None:
            text_kwds.update(
                dict(verticalalignment="center", fontweight=fontweight))
            text_kwds.update(dict(horizontalalignment="center", size=fontsize))
            for ia, a in enumerate(pie_axs):
                a.text(0.5,
                       0.5,
                       node_labels[ia],
                       transform=a.transAxes,
                       **text_kwds)
    return sct
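A rough, hypothetical invocation sketch: it assumes PAGA has been run and that node positions are available, e.g. stored under `adata.uns['paga']['pos']` by a prior scanpy PAGA plot.

import matplotlib.pyplot as plt
import scanpy as sc

sc.tl.paga(adata, groups="clusters")
sc.pl.paga(adata, show=False)  # also stores node positions in adata.uns["paga"]["pos"]

fig, ax = plt.subplots()
_paga_graph(
    adata,
    ax,
    adjacency_solid=adata.uns["paga"]["connectivities"],
    pos=adata.uns["paga"]["pos"],
)
plt.show()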
Example #6
def filter_genes(
    data,
    min_counts=None,
    min_cells=None,
    max_counts=None,
    max_cells=None,
    min_counts_u=None,
    min_cells_u=None,
    max_counts_u=None,
    max_cells_u=None,
    min_shared_counts=None,
    min_shared_cells=None,
    retain_genes=None,
    copy=False,
):
    """Filter genes based on number of cells or counts.
    Keep genes that have at least `min_counts` counts or are expressed in at
    least `min_cells` cells or have at most `max_counts` counts or are expressed
    in at most `max_cells` cells.
    Only provide one of the optional parameters `min_counts`, `min_cells`,
    `max_counts`, `max_cells` per call.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.spmatrix`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_counts : `int`, optional (default: `None`)
        Minimum number of counts required for a gene to pass filtering.
    min_cells : `int`, optional (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering.
    max_counts : `int`, optional (default: `None`)
        Maximum number of counts required for a gene to pass filtering.
    max_cells : `int`, optional (default: `None`)
        Maximum number of cells expressed required for a gene to pass filtering.
    min_counts_u : `int`, optional (default: `None`)
        Minimum number of unspliced counts required for a gene to pass filtering.
    min_cells_u : `int`, optional (default: `None`)
        Minimum number of unspliced cells expressed required to pass filtering.
    max_counts_u : `int`, optional (default: `None`)
        Maximum number of unspliced counts required for a gene to pass filtering.
    max_cells_u : `int`, optional (default: `None`)
        Maximum number of unspliced cells expressed required to pass filtering.
    min_shared_counts: `int`, optional (default: `None`)
        Minimum number of counts (both unspliced and spliced) required for a gene.
    min_shared_cells: `int`, optional (default: `None`)
        Minimum number of cells required to be expressed (both unspliced and spliced).
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds.
    copy : `bool`, optional (default: `False`)
        Determines whether a copy is returned.

    Returns
    -------
    Filters the object and adds `n_counts` to `adata.var`.
    """

    adata = data.copy() if copy else data

    # set initial cell sizes before filtering
    _set_initial_size(adata)

    layers = [
        layer for layer in ["spliced", "unspliced"]
        if layer in adata.layers.keys()
    ]
    if min_shared_counts is not None or min_shared_cells is not None:
        layers.extend(["shared"])

    for layer in layers:

        if layer == "spliced":
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_counts,
                min_cells,
                max_counts,
                max_cells,
            )
        elif layer == "unspliced":
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_counts_u,
                min_cells_u,
                max_counts_u,
                max_cells_u,
            )
        else:  # shared counts/cells
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_shared_counts,
                min_shared_cells,
                None,
                None,
            )

        if layer in adata.layers.keys():
            X = adata.layers[layer]
        else:  # shared counts/cells
            Xs, Xu = adata.layers["spliced"], adata.layers["unspliced"]
            nonzeros = ((Xs > 0).multiply(Xu > 0) if issparse(Xs) else
                        (Xs > 0) * (Xu > 0))
            X = (nonzeros.multiply(Xs) +
                 nonzeros.multiply(Xu) if issparse(nonzeros) else nonzeros *
                 (Xs + Xu))

        gene_subset = np.ones(adata.n_vars, dtype=bool)

        if _min_counts is not None or _max_counts is not None:
            gene_subset &= _filter(X,
                                   min_counts=_min_counts,
                                   max_counts=_max_counts)[0]

        if _min_cells is not None or _max_cells is not None:
            gene_subset &= _filter(X,
                                   min_cells=_min_cells,
                                   max_cells=_max_cells)[0]

        if retain_genes is not None:
            if isinstance(retain_genes, str):
                retain_genes = [retain_genes]
            gene_subset |= adata.var_names.isin(retain_genes)

        adata._inplace_subset_var(gene_subset)

        s = np.sum(~gene_subset)
        if s > 0:
            logg.info(f"Filtered out {s} genes that are detected", end=" ")
            if _min_cells is not None or _min_counts is not None:
                logg.info(
                    f"in less than {_min_cells} cells ({layer})."
                    if _min_counts is None else
                    f"{_min_counts} counts ({layer}).",
                    no_indent=True,
                )
            if max_cells is not None or max_counts is not None:
                logg.info(
                    f"in more than {_max_cells} cells ({layer})."
                    if _max_counts is None else
                    f"{_max_counts} counts ({layer}).",
                    no_indent=True,
                )

    return adata if copy else None
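A minimal usage sketch (`Ins1`/`Ins2` are placeholder gene names):

filter_genes(adata, min_shared_counts=20, retain_genes=["Ins1", "Ins2"])
print(adata.n_vars)  # genes passing the shared-count threshold, plus retained ones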
Example #7
def rank_velocity_genes(
    data,
    vkey="velocity",
    n_genes=100,
    groupby=None,
    match_with=None,
    resolution=None,
    min_counts=None,
    min_r2=None,
    min_corr=None,
    min_dispersion=None,
    min_likelihood=None,
    copy=False,
):
    """Rank genes for velocity characterizing groups.

    This applies a differential expression test (Welch t-test with overestimated
    variance to be conservative) on velocity expression, to find genes in a cluster
    whose dynamics are transcriptionally regulated differently compared to all other
    clusters (e.g. induction in that cluster and homeostasis in the remaining
    population). If no clusters are given, velocity clusters are first computed by
    applying louvain modularity on velocity expression.

    .. code:: python

        scv.tl.rank_velocity_genes(adata, groupby='clusters')
        scv.pl.scatter(
            adata, basis=adata.uns['rank_velocity_genes']['names']['Beta'][:3]
        )
        pd.DataFrame(adata.uns['rank_velocity_genes']['names']).head()

    .. image:: https://user-images.githubusercontent.com/31883718/69626017-11c47980-1048-11ea-89f4-df3769df5ad5.png
       :width: 600px

    .. image:: https://user-images.githubusercontent.com/31883718/69626572-30774000-1049-11ea-871f-e8a30c42f10e.png
       :width: 600px

    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    n_genes : `int`, optional (default: 100)
        The number of genes that appear in the returned tables.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    match_with: `str` or `None` (default: `None`)
        adata.obs key to separately rank velocities on.
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity.
    min_counts: `float` (default: None)
        Minimum count of genes for consideration.
    min_r2: `float` (default: None)
        Minimum r2 value of genes for consideration.
    min_corr: `float` (default: None)
        Minimum Spearman correlation coefficient between spliced and unspliced.
    min_dispersion: `float` (default: None)
        Minimum dispersion norm value of genes for consideration.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    rank_velocity_genes : `.uns`
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    velocity_score : `.var`
        Storing the score for each gene for each group. Ordered according to scores.
    """  # noqa E501

    adata = data.copy() if copy else data

    if groupby is None or groupby == "velocity_clusters":
        velocity_clusters(
            adata,
            vkey=vkey,
            match_with=match_with,
            resolution=resolution,
            min_likelihood=min_likelihood,
        )
        groupby = f"{vkey}_clusters"

    logg.info("ranking velocity genes", r=True)

    if "spearmans_score" not in adata.var.keys():
        corr = vcorrcoef(
            np.array(adata.layers["Ms"]).T,
            np.array(adata.layers["Mu"].T),
            mode="spearmans",
        )
        adata.var["spearmans_score"] = np.clip(corr, 0, None)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(
            adata.layers["unspliced"]) else n_counts
        min_counts = (min(50, np.percentile(n_counts, 50))
                      if min_counts is None else min_counts)
        tmp_filter &= np.ravel(n_counts > min_counts)

    if f"{vkey}_r2" in adata.var.keys():
        r2 = adata.var[f"{vkey}_r2"]
        min_r2 = 0.1 if min_r2 is None else min_r2  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "spearmans_score" in adata.var.keys():
        corr = adata.var["spearmans_score"]
        min_corr = (0.1 if min_corr is None else min_corr
                    )  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= corr > min_corr

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = 0 if min_dispersion is None else min_dispersion
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys():
        fit_likelihood = adata.var["fit_likelihood"]
        min_likelihood = 0.1 if min_likelihood is None else min_likelihood
        tmp_filter &= fit_likelihood > min_likelihood

    X = adata[:, tmp_filter].layers[vkey]
    groups, groups_masks = select_groups(adata, key=groupby)

    n_groups = groups_masks.shape[0]
    sizes = groups_masks.sum(1)

    mean, var = np.zeros((n_groups, X.shape[1])), np.zeros(
        (n_groups, X.shape[1]))
    for i, mask in enumerate(groups_masks):
        mean[i], var[i] = get_mean_var(X[mask])

    # test each against the union of all other groups
    rankings_gene_names, rankings_gene_scores = [], []
    for i in range(n_groups):
        mask_rest = ~groups_masks[i]
        mean_rest, var_rest = get_mean_var(X[mask_rest])
        size_rest = sizes[i]  # else mask_rest.sum() if method == 't-test'

        scores = (mean[i] - mean_rest) / np.sqrt(var[i] / sizes[i] +
                                                 var_rest / size_rest)
        scores = np.nan_to_num(scores)

        # equivalent to but much faster than np.argsort(scores)[-10:]
        if n_genes > X.shape[1]:
            n_genes = X.shape[1]
        idx = np.argpartition(scores, -n_genes)[-n_genes:]
        idx = idx[np.argsort(scores[idx])[::-1]]

        rankings_gene_names.append(adata[:, tmp_filter].var_names[idx].values)
        rankings_gene_scores.append(scores[idx])

    rankings_gene_names = np.array([list(n) for n in rankings_gene_names])
    rankings_gene_scores = np.array([list(n) for n in rankings_gene_scores])

    all_names = rankings_gene_names.T.flatten()
    all_scores = rankings_gene_scores.T.flatten()
    vscore = np.zeros(adata.n_vars, dtype=int)
    for i, name in enumerate(adata.var_names):
        if name in all_names:
            vscore[i] = all_scores[np.where(name == all_names)[0][0]]
    adata.var["velocity_score"] = vscore

    key = "rank_velocity_genes"
    if key not in adata.uns.keys():
        adata.uns[key] = {}

    adata.uns[key] = {
        "names":
        np.rec.fromarrays([n for n in rankings_gene_names],
                          dtype=[(f"{rn}", "U50") for rn in groups]),
        "scores":
        np.rec.fromarrays(
            [n.round(2) for n in rankings_gene_scores],
            dtype=[(f"{rn}", "float32") for rn in groups],
        ),
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "t-test_overestim_var",
            "use_raw": True,
        },
    }

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{key}', sorted scores by group ids (adata.uns) \n"
        "    'spearmans_score', spearmans correlation scores (adata.var)")

    return adata if copy else None
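The per-gene score used above is a Welch t-statistic with the group size reused for the rest of the cells (the "overestimated variance"); a self-contained sketch on synthetic data:

import numpy as np

rng = np.random.default_rng(0)
V = rng.normal(size=(100, 5))  # velocities: 100 cells x 5 genes
mask = np.zeros(100, dtype=bool)
mask[:30] = True  # cells in the group of interest

mean_g, var_g = V[mask].mean(0), V[mask].var(0, ddof=1)
mean_r, var_r = V[~mask].mean(0), V[~mask].var(0, ddof=1)
n = mask.sum()  # group size, also used for the rest (conservative)
scores = (mean_g - mean_r) / np.sqrt(var_g / n + var_r / n)
print(np.argsort(scores)[::-1])  # gene ranking for this group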
Example #8
def velocity_clusters(
    data,
    vkey="velocity",
    match_with="clusters",
    sort_by="velocity_pseudotime",
    resolution=None,
    min_likelihood=None,
    copy=False,
):
    """Computes velocity clusters via louvain on velocities.

    .. code:: python

        scv.tl.velocity_clusters(adata)
        scv.pl.scatter(adata, color='velocity_clusters')

    .. image:: https://user-images.githubusercontent.com/31883718/69625627-484dc480-1047-11ea-847f-6607a3430427.png
       :width: 600px


    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    match_with: `str` (default: `'clusters'`)
        Match the names of the velocity clusters with the names of this key (.obs).
    sort_by: `str` or `None` (default: `'velocity_pseudotime'`)
        Sort velocity clusters by this key (.obs).
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity; 0.7 is used if `None`.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only consider genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    velocity_clusters : `.obs`
        Clusters obtained from applying louvain modularity on velocity expression.
    """  # noqa E501

    adata = data.copy() if copy else data

    logg.info("computing velocity clusters", r=True)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(
            adata.layers["unspliced"]) else n_counts
        min_counts = min(50, np.percentile(n_counts, 50))
        tmp_filter &= np.ravel(n_counts > min_counts)

    if "r2" in adata.var.keys():
        r2 = adata.var.velocity_r2
        min_r2 = np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = np.percentile(dispersions, 20)
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys() and min_likelihood is not None:
        tmp_filter &= adata.var["fit_likelihood"] > min_likelihood

    from anndata import AnnData

    vdata = AnnData(adata.layers[vkey][:, tmp_filter])
    vdata.obs = adata.obs.copy()
    vdata.var = adata.var[tmp_filter].copy()

    if "highly_variable" in vdata.var.keys():
        vdata.var["highly_variable"] = np.array(vdata.var["highly_variable"],
                                                dtype=bool)

    import scanpy as sc

    logg.switch_verbosity("off", module="scanpy")
    sc.pp.pca(vdata, n_comps=20, svd_solver="arpack")
    sc.pp.neighbors(vdata, n_pcs=20)
    sc.tl.louvain(vdata, resolution=0.7 if resolution is None else resolution)
    logg.switch_verbosity("on", module="scanpy")

    if sort_by == "velocity_pseudotime" and sort_by not in adata.obs.keys():
        velocity_pseudotime(adata, vkey=vkey)
    if sort_by in vdata.obs.keys():
        vc = vdata.obs["louvain"]
        vc_cats = vc.cat.categories
        mean_times = [
            np.mean(vdata.obs[sort_by][vc == cat]) for cat in vc_cats
        ]
        vdata.obs["louvain"].cat.reorder_categories(
            vc_cats[np.argsort(mean_times)], inplace=True)

    if isinstance(match_with, str) and match_with in adata.obs.keys():
        from .utils import most_common_in_list

        vc = vdata.obs["louvain"]
        cats_nums = {cat: 0 for cat in adata.obs[match_with].cat.categories}
        for cat in vc.cat.categories:
            cells_in_cat = np.where(vc == cat)[0]
            new_cat = most_common_in_list(adata.obs[match_with][cells_in_cat])
            cats_nums[new_cat] += 1
            vc = vc.cat.rename_categories(
                {cat: f"{new_cat} ({cats_nums[new_cat]})"})
        vdata.obs["louvain"] = vc
    else:
        vdata.obs["louvain"] = vdata.obs["louvain"].cat.rename_categories(
            np.arange(len(vdata.obs["louvain"].cat.categories)))
    adata.obs[f"{vkey}_clusters"] = vdata.obs["louvain"].copy()

    del vdata

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_clusters', "
        f"clusters based on louvain modularity on velocity vector field (adata.obs)"
    )

    return adata if copy else None
Example #9
def velocity_embedding(
    data,
    basis=None,
    vkey="velocity",
    scale=10,
    self_transitions=True,
    use_negative_cosines=True,
    direct_pca_projection=None,
    retain_scale=False,
    autoscale=True,
    all_comps=True,
    T=None,
    copy=False,
):
    """Projects the single cell velocities into any embedding.

    Given the normalized differences of the embedding positions
    :math:`\\tilde \\delta_{ij} = \\frac{x_j-x_i}{\\left\\lVert x_j-x_i \\right\\rVert}`,
    the projections are obtained as expected displacements with respect to the
    transition matrix :math:`\\tilde \\pi_{ij}` as

    .. math::
        \\tilde \\nu_i = E_{\\tilde \\pi_{i\\cdot}} [\\tilde \\delta_{i \\cdot}]
        = \\sum_{j \\neq i} \\left( \\tilde \\pi_{ij} - \\frac1n \\right)
        \\tilde \\delta_{ij}.


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    basis: `str` (default: `None`)
        Which embedding to use; defaults to the last of 'pca', 'tsne', 'umap'
        found in `adata.obsm`.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `int` (default: 10)
        Scale parameter of gaussian kernel for transition matrix.
    self_transitions: `bool` (default: `True`)
        Whether to allow self transitions, based on the confidences of transitioning to
        neighboring cells.
    use_negative_cosines: `bool` (default: `True`)
        Whether to project cell-to-cell transitions with negative cosines into
        negative/opposite direction.
    direct_pca_projection: `bool` (default: `None`)
        Whether to directly project the velocities into PCA space,
        thus skipping the velocity graph.
    retain_scale: `bool` (default: `False`)
        Whether to retain scale from high dimensional space in embedding.
    autoscale: `bool` (default: `True`)
        Whether to scale the embedded velocities by a scalar multiplier,
        which simply ensures that the arrows in the embedding are properly scaled.
    all_comps: `bool` (default: `True`)
        Whether to compute the velocities on all embedding components.
    T: `csr_matrix` (default: `None`)
        Allows the user to directly pass a transition matrix.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity_umap: `.obsm`
        coordinates of velocity projection on embedding (e.g., basis='umap')
    """

    adata = data.copy() if copy else data

    if basis is None:
        keys = [
            key for key in ["pca", "tsne", "umap"] if f"X_{key}" in adata.obsm.keys()
        ]
        if len(keys) > 0:
            basis = "pca" if direct_pca_projection else keys[-1]
        else:
            raise ValueError("No basis specified")

    if f"X_{basis}" not in adata.obsm_keys():
        raise ValueError("You need to compute the embedding first.")

    if direct_pca_projection and "pca" in basis:
        logg.warn(
            "Directly projecting velocities into PCA space is for exploratory analysis "
            "on principal components.\n"
            "         It does not reflect the actual velocity field from high "
            "dimensional gene expression space.\n"
            "         To visualize velocities, consider applying "
            "`direct_pca_projection=False`.\n"
        )

    logg.info("computing velocity embedding", r=True)

    V = np.array(adata.layers[vkey])
    vgenes = np.ones(adata.n_vars, dtype=bool)
    if f"{vkey}_genes" in adata.var.keys():
        vgenes &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    vgenes &= ~np.isnan(V.sum(0))
    V = V[:, vgenes]

    if direct_pca_projection and "pca" in basis:
        PCs = adata.varm["PCs"] if all_comps else adata.varm["PCs"][:, :2]
        PCs = PCs[vgenes]

        X_emb = adata.obsm[f"X_{basis}"]
        V_emb = (V - V.mean(0)).dot(PCs)

    else:
        X_emb = (
            adata.obsm[f"X_{basis}"] if all_comps else adata.obsm[f"X_{basis}"][:, :2]
        )
        V_emb = np.zeros(X_emb.shape)

        T = (
            transition_matrix(
                adata,
                vkey=vkey,
                scale=scale,
                self_transitions=self_transitions,
                use_negative_cosines=use_negative_cosines,
            )
            if T is None
            else T
        )
        T.setdiag(0)
        T.eliminate_zeros()

        densify = adata.n_obs < 1e4
        TA = T.A if densify else None

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for i in range(adata.n_obs):
                indices = T[i].indices
                dX = X_emb[indices] - X_emb[i, None]  # shape (n_neighbors, n_dims)
                if not retain_scale:
                    dX /= l2_norm(dX)[:, None]
                dX[np.isnan(dX)] = 0  # zero diff in a steady-state
                probs = TA[i, indices] if densify else T[i].data
                V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0)

        if retain_scale:
            X = (
                adata.layers["Ms"]
                if "Ms" in adata.layers.keys()
                else adata.layers["spliced"]
            )
            delta = T.dot(X[:, vgenes]) - X[:, vgenes]
            if issparse(delta):
                delta = delta.A
            cos_proj = (V * delta).sum(1) / l2_norm(delta)
            V_emb *= np.clip(cos_proj[:, None] * 10, 0, 1)

    if autoscale:
        V_emb /= 3 * quiver_autoscale(X_emb, V_emb)

    if f"{vkey}_params" in adata.uns.keys():
        adata.uns[f"{vkey}_params"]["embeddings"] = (
            []
            if "embeddings" not in adata.uns[f"{vkey}_params"]
            else list(adata.uns[f"{vkey}_params"]["embeddings"])
        )
        adata.uns[f"{vkey}_params"]["embeddings"].extend([basis])

    vkey += f"_{basis}"
    adata.obsm[vkey] = V_emb

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added\n" f"    '{vkey}', embedded velocity vectors (adata.obsm)")

    return adata if copy else None
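A self-contained sketch of the projection formula for a single cell on synthetic data; the uniform-correction term matches the loop body above:

import numpy as np

rng = np.random.default_rng(0)
X_emb = rng.normal(size=(6, 2))  # embedding positions of 6 cells
probs = np.array([0.5, 0.3, 0.1, 0.05, 0.05])  # transitions of cell 0 to the others

dX = X_emb[1:] - X_emb[0]  # delta_{0j}
dX /= np.linalg.norm(dX, axis=1)[:, None]  # normalize each difference vector
v_0 = probs.dot(dX) - probs.mean() * dX.sum(0)  # sum_j (pi_0j - 1/n) delta_0j
print(v_0)  # embedded velocity of cell 0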
Example #10
def velocity_graph(
    data,
    vkey="velocity",
    xkey="Ms",
    tkey=None,
    basis=None,
    n_neighbors=None,
    n_recurse_neighbors=None,
    random_neighbors_at_max=None,
    sqrt_transform=None,
    variance_stabilization=None,
    gene_subset=None,
    compute_uncertainties=None,
    approx=None,
    mode_neighbors="distances",
    copy=False,
    n_jobs=None,
    backend="loky",
):
    """Computes velocity graph based on cosine similarities.

    The cosine similarities are computed between velocities and potential cell state
    transitions, i.e. it measures how well a corresponding change in gene expression
    :math:`\\delta_{ij} = x_j - x_i` matches the predicted change according to the
    velocity vector :math:`\\nu_i`,

    .. math::
        \\pi_{ij} = \\cos\\angle(\\delta_{ij}, \\nu_i)
        = \\frac{\\delta_{ij}^T \\nu_i}{\\left\\lVert\\delta_{ij}\\right\\rVert
        \\left\\lVert \\nu_i \\right\\rVert}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    xkey: `str` (default: `'Ms'`)
        Layer key to extract count data from.
    tkey: `str` (default: `None`)
        Observation key to extract time data from.
    basis: `str` (default: `None`)
        Basis / Embedding to use.
    n_neighbors: `int` or `None` (default: None)
        Use fixed number of neighbors or do recursive neighbor search (if `None`).
    n_recurse_neighbors: `int` (default: `None`)
        Number of recursions for neighbors search. Defaults to
        2 if mode_neighbors is 'distances', and 1 if mode_neighbors is 'connectivities'.
    random_neighbors_at_max: `int` or `None` (default: `None`)
        If number of iterative neighbors for an individual cell is higher than this
        threshold, a random selection of such are chosen as reference neighbors.
    sqrt_transform: `bool` (default: `False`)
        Whether to variance-transform the cell state changes
        and velocities before computing cosine similarities.
    gene_subset: `list` of `str`, subset of adata.var_names or `None` (default: `None`)
        Subset of genes to compute velocity graph on exclusively.
    compute_uncertainties: `bool` (default: `None`)
        Whether to compute uncertainties along with cosine correlation.
    approx: `bool` or `None` (default: `None`)
        If True, the first 30 PCs are used instead of the full count matrix.
    mode_neighbors: 'str' (default: `'distances'`)
        Determines the type of KNN graph used. Options are 'distances' or
        'connectivities'. The latter yields a symmetric graph.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.
    n_jobs: `int` or `None` (default: `None`)
        Number of parallel jobs.
    backend: `str` (default: "loky")
        Backend used for multiprocessing. See :class:`joblib.Parallel` for valid
        options.

    Returns
    -------
    velocity_graph: `.uns`
        sparse matrix with correlations of cell state transitions with velocities
    """

    adata = data.copy() if copy else data
    verify_neighbors(adata)
    if vkey not in adata.layers.keys():
        velocity(adata, vkey=vkey)
    if sqrt_transform is None:
        sqrt_transform = variance_stabilization

    vgraph = VelocityGraph(
        adata,
        vkey=vkey,
        xkey=xkey,
        tkey=tkey,
        basis=basis,
        n_neighbors=n_neighbors,
        approx=approx,
        n_recurse_neighbors=n_recurse_neighbors,
        random_neighbors_at_max=random_neighbors_at_max,
        sqrt_transform=sqrt_transform,
        gene_subset=gene_subset,
        compute_uncertainties=compute_uncertainties,
        report=True,
        mode_neighbors=mode_neighbors,
    )

    if isinstance(basis, str):
        logg.warn(
            f"The velocity graph is computed on {basis} embedding coordinates.\n"
            f"        Consider computing the graph in an unbiased manner \n"
            f"        on full expression space by not specifying basis.\n")

    n_jobs = get_n_jobs(n_jobs=n_jobs)
    logg.info(
        f"computing velocity graph (using {n_jobs}/{os.cpu_count()} cores)",
        r=True)
    vgraph.compute_cosines(n_jobs=n_jobs, backend=backend)

    adata.uns[f"{vkey}_graph"] = vgraph.graph
    adata.uns[f"{vkey}_graph_neg"] = vgraph.graph_neg

    if vgraph.uncertainties is not None:
        adata.uns[f"{vkey}_graph_uncertainties"] = vgraph.uncertainties

    adata.obs[f"{vkey}_self_transition"] = vgraph.self_prob

    if f"{vkey}_params" in adata.uns.keys():
        if "embeddings" in adata.uns[f"{vkey}_params"]:
            del adata.uns[f"{vkey}_params"]["embeddings"]
    else:
        adata.uns[f"{vkey}_params"] = {}
    adata.uns[f"{vkey}_params"]["mode_neighbors"] = mode_neighbors
    adata.uns[f"{vkey}_params"][
        "n_recurse_neighbors"] = vgraph.n_recurse_neighbors

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_graph', sparse matrix with cosine correlations (adata.uns)"
    )

    return adata if copy else None
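A self-contained sketch of the cosine measure for a single cell pair (synthetic data):

import numpy as np

rng = np.random.default_rng(0)
x_i = rng.normal(size=20)  # expression state of cell i
x_j = rng.normal(size=20)  # expression state of a candidate transition target j
v_i = rng.normal(size=20)  # velocity vector of cell i

delta_ij = x_j - x_i  # observed change in gene expression
pi_ij = delta_ij @ v_i / (np.linalg.norm(delta_ij) * np.linalg.norm(v_i))
print(pi_ij)  # cosine correlation in [-1, 1]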
Example #11
def normalize_per_cell(
    data,
    counts_per_cell_after=None,
    counts_per_cell=None,
    key_n_counts=None,
    max_proportion_per_cell=None,
    use_initial_size=True,
    layers=None,
    enforce=None,
    copy=False,
):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array`, optional (default: `None`)
        Precomputed counts per cell.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    max_proportion_per_cell : `float` (default: `None`)
        Exclude counts of genes that account for more than
        a given proportion of a cell's total counts, e.g. 0.05.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization.
    enforce : `bool` or `None` (default: `None`)
        Whether to normalize layers that appear to be already normalized.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized counts.
    """

    adata = data.copy() if copy else data
    if layers is None:
        layers = ["spliced", "unspliced"]
    elif layers == "all":
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        layers = [layers]
    layers = ["X"
              ] + [layer for layer in layers if layer in adata.layers.keys()]
    modified_layers = []

    if isinstance(counts_per_cell, str):
        if counts_per_cell not in adata.obs.keys():
            _set_initial_size(adata, layers)
        counts_per_cell = (adata.obs[counts_per_cell].values
                           if counts_per_cell in adata.obs.keys() else None)

    for layer in layers:
        check_if_valid_dtype(adata, layer)
        X = adata.X if layer == "X" else adata.layers[layer]

        if not_yet_normalized(X) or enforce:
            counts = (counts_per_cell if counts_per_cell is not None else
                      _get_initial_size(adata, layer)
                      if use_initial_size else _get_size(adata, layer))
            if max_proportion_per_cell is not None and (
                    0 < max_proportion_per_cell < 1):
                counts = counts_per_cell_quantile(X, max_proportion_per_cell,
                                                  counts)
            # equivalent to sc.pp.normalize_per_cell(X, counts_per_cell_after, counts)
            counts_after = (np.median(counts) if counts_per_cell_after is None
                            else counts_per_cell_after)

            counts_after += counts_after == 0  # avoid a zero normalization target
            counts = counts / counts_after
            counts += counts == 0  # avoid division by zero for empty cells

            if issparse(X):
                sparsefuncs.inplace_row_scale(X, 1 / counts)
            else:
                X /= np.array(counts[:, None])
            modified_layers.append(layer)
            if (layer == "X" and "gene_count_corr" not in adata.var.keys()
                    and X.shape[-1] > 3e3):
                try:
                    adata.var["gene_count_corr"] = np.round(
                        csr_vcorrcoef(X.T, np.ravel((X > 0).sum(1))), 4)
                except Exception:
                    pass
        else:
            logg.warn(
                f"Did not normalize {layer} as it looks processed already. "
                "To enforce normalization, set `enforce=True`.")

    adata.obs["n_counts"
              if key_n_counts is None else key_n_counts] = _get_size(adata)
    if len(modified_layers) > 0:
        logg.info("Normalized count data:", f"{', '.join(modified_layers)}.")

    return adata if copy else None
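# Usage sketch, assuming the function above is exposed as scv.pp.normalize_per_cell:
# normalizing X together with the spliced/unspliced layers of a tiny synthetic AnnData.
import numpy as np
import scvelo as scv
from anndata import AnnData

raw = np.random.poisson(2.0, size=(5, 4)).astype(np.float32)
adata = AnnData(raw)
adata.layers["spliced"] = raw.copy()
adata.layers["unspliced"] = np.random.poisson(1.0, size=(5, 4)).astype(np.float32)

scv.pp.normalize_per_cell(adata)   # scales each cell to the median of total counts
print(adata.obs["n_counts"])       # per-cell totals written by the function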
Beispiel #12
0
def cell_origin(
    data,
    groupby="clusters",
    disconnected_groups=None,
    self_transitions=False,
    n_neighbors=None,
    copy=False,
):
    """Computes individual cell root points

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    groupby: `str` (default: `'clusters'`)
        Key of observations grouping to which the origins are assigned.
    disconnected_groups: list of `str` (default: `None`)
        Which groups to treat as disconnected for fate assignment.
    n_neighbors: `int` (default: `None`)
        Number of neighbors to restrict transitions to.
    self_transitions: `bool` (default: `False`)
        Whether to include self-transitions.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    cell_origin: `.obs`
        most likely cell origin for each individual cell
    cell_origin_confidence: `.obs`
        confidence of coming from assigned origin
    """

    adata = data.copy() if copy else data
    logg.info("computing cell fates", r=True)

    n_neighbors = 10 if n_neighbors is None else n_neighbors
    _adata = adata.copy()
    vgraph = VelocityGraph(_adata,
                           n_neighbors=n_neighbors,
                           approx=True,
                           n_recurse_neighbors=1)
    vgraph.compute_cosines()
    _adata.uns["velocity_graph"] = vgraph.graph
    _adata.uns["velocity_graph_neg"] = vgraph.graph_neg

    T = transition_matrix(_adata,
                          self_transitions=self_transitions,
                          backward=True)
    fate = np.linalg.inv(np.eye(_adata.n_obs) - T)
    if issparse(T):
        fate = fate.A
    cell_fates = np.array(_adata.obs[groupby][fate.argmax(1)])
    if disconnected_groups is not None:
        idx = _adata.obs[groupby].isin(disconnected_groups)
        cell_fates[idx] = _adata.obs[groupby][idx]

    adata.obs["cell_origin"] = cell_fates
    adata.obs["cell_origin_confidence"] = fate.max(1) / fate.sum(1)
    strings_to_categoricals(adata)

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'cell_origin', most likely cell origin (adata.obs)\n"
        "    'cell_origin_confidence', confidence of assigned origin (adata.obs)"
    )
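# Usage sketch, assuming the function above is exposed as scv.tl.cell_origin:
# it mirrors scv.tl.cell_fate but walks the velocity-based transitions backward,
# so velocities (scv.tl.velocity) must be present on `adata`.
import scvelo as scv

scv.tl.cell_origin(adata, groupby="clusters")
print(adata.obs[["cell_origin", "cell_origin_confidence"]].head())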
Beispiel #13
0
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and
    yield the same result as scanpy [Wolf18]_. Connectivities are computed with
    adaptive kernel width as proposed in Haghverdi et al. 2016 (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {{'umap', 'hnsw', 'sklearn'}}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """

    adata = adata.copy() if copy else adata

    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

        n_duplicate_cells = len(get_duplicate_cells(adata))
        if n_duplicate_cells > 0:
            logg.warn(
                f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
                "Consider removing these via pp.remove_duplicate_cells.",
            )

    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )

    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )

    else:
        logg.switch_verbosity("off", module="scanpy")
        with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(
                n_neighbors=n_neighbors,
                knn=knn,
                n_pcs=n_pcs,
                method=method,
                use_rep=use_rep,
                random_state=random_state,
                metric=metric,
                metric_kwds=metric_kwds,
                write_knn_indices=True,
            )
        logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None
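# Usage sketch, assuming the function above is exposed as scv.pp.neighbors:
# the backends yield the same graph and differ only in runtime.
import scvelo as scv

scv.pp.neighbors(adata, n_neighbors=30, method="umap")
# scv.pp.neighbors(adata, n_neighbors=30, method="hnsw")  # fastest, needs `pip install hnswlib`
distances = adata.obsp["distances"]            # sparse kNN distance matrix
connectivities = adata.obsp["connectivities"]  # adaptive-kernel weights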
Beispiel #14
0
def moments(
    data,
    n_neighbors=30,
    n_pcs=None,
    mode="connectivities",
    method="umap",
    use_rep=None,
    use_highly_variable=True,
    copy=False,
):
    """Computes moments for velocity estimation.

    First-/second-order moments are computed for each cell across its nearest neighbors,
    where the neighbor graph is obtained from euclidean distances in PCA space.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    mode: `'connectivities'` or `'distances'`  (default: `'connectivities'`)
        Distance metric to use for moment computation.
    method : {{'umap', 'hnsw', 'sklearn', `None`}}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        Connectivities are computed with adaptive kernel width as proposed in
        Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """

    adata = data.copy() if copy else data

    layers = [
        layer for layer in {"spliced", "unspliced"} if layer in adata.layers
    ]
    if any([not_yet_normalized(adata.layers[layer]) for layer in layers]):
        normalize_per_cell(adata)

    if n_neighbors is not None and n_neighbors > get_n_neighs(adata):
        neighbors(
            adata,
            n_neighbors=n_neighbors,
            use_rep=use_rep,
            use_highly_variable=use_highly_variable,
            n_pcs=n_pcs,
            method=method,
        )
    verify_neighbors(adata)

    if "spliced" not in adata.layers.keys(
    ) or "unspliced" not in adata.layers.keys():
        logg.warn(
            "Skipping moments, because un/spliced counts were not found.")
    else:
        logg.info(f"computing moments based on {mode}", r=True)
        connectivities = get_connectivities(adata,
                                            mode,
                                            n_neighbors=n_neighbors,
                                            recurse_neighbors=False)

        adata.layers["Ms"] = (csr_matrix.dot(
            connectivities,
            csr_matrix(adata.layers["spliced"])).astype(np.float32).A)
        adata.layers["Mu"] = (csr_matrix.dot(
            connectivities,
            csr_matrix(adata.layers["unspliced"])).astype(np.float32).A)
        # if renormalize: normalize_per_cell(adata, layers={'Ms', 'Mu'}, enforce=True)

        logg.info("    finished",
                  time=True,
                  end=" " if settings.verbosity > 2 else "\n")
        logg.hint(
            "added \n"
            "    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)"
        )
    return adata if copy else None
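# Usage sketch, assuming the function above is exposed as scv.pp.moments:
# Ms/Mu are kNN-smoothed (first-order) versions of the spliced/unspliced layers.
import scvelo as scv

scv.pp.moments(adata, n_neighbors=30, n_pcs=30)
print(adata.layers["Ms"].shape)  # dense float32 array, same shape as adata.X
print(adata.layers["Mu"].shape)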
Beispiel #15
0
def velocity(
    data,
    vkey="velocity",
    mode="stochastic",
    fit_offset=False,
    fit_offset2=False,
    filter_genes=False,
    groups=None,
    groupby=None,
    groups_for_fit=None,
    constrain_ratio=None,
    use_raw=False,
    use_latent_time=None,
    perc=[5, 95],
    min_r2=1e-2,
    min_likelihood=1e-3,
    r2_adjusted=None,
    use_highly_variable=True,
    diff_kinetics=None,
    copy=False,
    **kwargs,
):
    """Estimates velocities in a gene-specific manner.

    The steady-state model [Manno18]_ determines velocities by quantifying how
    observations deviate from a presumed steady-state equilibrium ratio of unspliced to
    spliced mRNA levels. This steady-state ratio is obtained by performing a linear
    regression restricting the input data to the extreme quantiles. By including
    second-order moments, the stochastic model [Bergen19]_ exploits not only the balance
    of unspliced to spliced mRNA levels but also their covariation. By contrast, the
    likelihood-based dynamical model [Bergen19]_ solves the full splicing kinetics and
    generalizes RNA velocity estimation to transient systems. It is also capable of
    capturing non-observed steady states.

    .. image:: https://user-images.githubusercontent.com/31883718/69636491-ff057100-1056-11ea-90b7-d04098112ce1.png

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities
        for `velocity_graph` and `velocity_embedding`.
    mode: `'deterministic'`, `'stochastic'` or `'dynamical'` (default: `'stochastic'`)
        Whether to run the estimation using the steady-state/deterministic,
        stochastic or dynamical model of transcriptional dynamics.
        The dynamical model requires to run `tl.recover_dynamics` first.
    fit_offset: `bool` (default: `False`)
        Whether to fit with offset for first order moment dynamics.
    fit_offset2: `bool`, (default: `False`)
        Whether to fit with offset for second order moment dynamics.
    filter_genes: `bool` (default: `False`)
        Whether to remove genes that are not used for further velocity analysis.
    groups: `str`, `list` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which velocity analysis shall be restricted.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    groups_for_fit: `str`, `list` or `np.ndarray` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which steady-state fitting shall be restricted.
    constrain_ratio: `float` or tuple of type `float` or None: (default: `None`)
        Bounds for the steady-state ratio.
    use_raw: `bool` (default: `False`)
        Whether to use raw data for estimation.
    use_latent_time: `bool` or `None` (default: `None`)
        Whether to use latent time as a regularization for velocity estimation.
    perc: `float` or list of `float` (default: `[5, 95]`)
        Percentile(s), e.g. 98 or [5, 95], for the extreme quantile fit.
    min_r2: `float` (default: 0.01)
        Minimum threshold for the coefficient of determination.
    min_likelihood: `float` (default: 1e-3)
        Minimal likelihood for velocity genes to fit the model on.
    r2_adjusted: `bool` (default: `None`)
        Whether to compute the coefficient of determination
        on the full data fit (adjusted) or the extreme quantile fit (`None`).
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    diff_kinetics: `str` or `bool` (default: `None`)
        Whether to zero out velocities in clusters with differential kinetics,
        as stored in `adata.var` (e.g. via tl.differential_kinetic_test).
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity: `.layers`
        velocity vectors for each individual cell
    velocity_genes, velocity_beta, velocity_gamma, velocity_r2: `.var`
        parameters
    """  # noqa E501

    adata = data.copy() if copy else data
    if not use_raw and "Ms" not in adata.layers.keys():
        moments(adata)

    logg.info("computing velocities", r=True)

    strings_to_categoricals(adata)

    if mode is None or (mode == "dynamical"
                        and "fit_alpha" not in adata.var.keys()):
        mode = "stochastic"
        logg.warn("Falling back to stochastic model. "
                  "For the dynamical model run tl.recover_dynamics first.")

    if mode in {"dynamical", "dynamical_residuals"}:
        from .dynamical_model_utils import get_divergence, get_reads, get_vars

        gene_subset = ~np.isnan(adata.var["fit_alpha"].values)
        vdata = adata[:, gene_subset]
        alpha, beta, gamma, scaling, t_ = get_vars(vdata)

        connect = not adata.uns["recover_dynamics"]["use_raw"]
        kwargs_ = {
            "kernel_width": None,
            "normalized": True,
            "var_scale": True,
            "reg_par": None,
            "min_confidence": 1e-2,
            "constraint_time_increments": False,
            "fit_steady_states": True,
            "fit_basal_transcription": None,
            "use_connectivities": connect,
            "time_connectivities": connect,
            "use_latent_time": use_latent_time,
        }
        kwargs_.update(adata.uns["recover_dynamics"])
        kwargs_.update(**kwargs)

        if "residuals" in mode:
            u, s = get_reads(vdata,
                             use_raw=adata.uns["recover_dynamics"]["use_raw"])
            if kwargs_["fit_basal_transcription"]:
                u, s = u - adata.var["fit_u0"], s - adata.var["fit_s0"]
            o = vdata.layers["fit_t"] < t_
            vt = u * beta - s * gamma  # ds/dt
            wt = (alpha * o - beta * u) * scaling  # du/dt
        else:
            vt, wt = get_divergence(vdata, mode="velocity", **kwargs_)

        vgenes = adata.var.fit_likelihood > min_likelihood
        if min_r2 is not None:
            if "fit_r2" not in adata.var.keys():
                velo = Velocity(
                    adata,
                    groups_for_fit=groups_for_fit,
                    groupby=groupby,
                    constrain_ratio=constrain_ratio,
                    min_r2=min_r2,
                    use_highly_variable=use_highly_variable,
                    use_raw=use_raw,
                )
                velo.compute_deterministic(fit_offset=fit_offset, perc=perc)
                adata.var["fit_r2"] = velo._r2
            vgenes &= adata.var.fit_r2 > min_r2

        lb, ub = np.nanpercentile(adata.var.fit_scaling, [10, 90])
        vgenes = (vgenes
                  & (adata.var.fit_scaling > np.min([lb, 0.03]))
                  & (adata.var.fit_scaling < np.max([ub, 3])))

        adata.var[f"{vkey}_genes"] = vgenes

        adata.layers[vkey] = np.ones(adata.shape) * np.nan
        adata.layers[vkey][:, gene_subset] = vt

        adata.layers[f"{vkey}_u"] = np.ones(adata.shape) * np.nan
        adata.layers[f"{vkey}_u"][:, gene_subset] = wt

        if filter_genes and len(set(vgenes)) > 1:
            adata._inplace_subset_var(vgenes)

    elif mode in {"steady_state", "deterministic", "stochastic"}:
        categories = (adata.obs[groupby].cat.categories
                      if groupby is not None and groups is None
                      and groups_for_fit is None else [None])

        for cat in categories:
            groups = cat if cat is not None else groups

            cell_subset = groups_to_bool(adata, groups, groupby)
            _adata = adata if groups is None else adata[cell_subset]
            velo = Velocity(
                _adata,
                groups_for_fit=groups_for_fit,
                groupby=groupby,
                constrain_ratio=constrain_ratio,
                min_r2=min_r2,
                r2_adjusted=r2_adjusted,
                use_highly_variable=use_highly_variable,
                use_raw=use_raw,
            )
            velo.compute_deterministic(fit_offset=fit_offset, perc=perc)

            if mode == "stochastic":
                if filter_genes and len(set(velo._velocity_genes)) > 1:
                    adata._inplace_subset_var(velo._velocity_genes)
                    residual = velo._residual[:, velo._velocity_genes]
                    _adata = adata if groups is None else adata[cell_subset]
                    velo = Velocity(
                        _adata,
                        residual=residual,
                        groups_for_fit=groups_for_fit,
                        groupby=groupby,
                        constrain_ratio=constrain_ratio,
                        use_highly_variable=use_highly_variable,
                    )
                velo.compute_stochastic(fit_offset,
                                        fit_offset2,
                                        mode,
                                        perc=perc)

            write_residuals(adata, vkey, velo._residual, cell_subset)
            write_residuals(adata, f"variance_{vkey}", velo._residual2,
                            cell_subset)
            write_pars(adata,
                       vkey,
                       velo.get_pars(),
                       velo.get_pars_names(),
                       add_key=cat)

            if filter_genes and len(set(velo._velocity_genes)) > 1:
                adata._inplace_subset_var(velo._velocity_genes)

    else:
        raise ValueError(
            "Mode can only be one of these: steady_state, deterministic, "
            "stochastic, dynamical, or dynamical_residuals."
        )

    if f"{vkey}_genes" in adata.var.keys() and np.sum(
            adata.var[f"{vkey}_genes"]) < 10:
        logg.warn(
            "Too few genes are selected as velocity genes. "
            "Consider setting a lower threshold for min_r2 or min_likelihood.")

    if diff_kinetics:
        if not isinstance(diff_kinetics, str):
            diff_kinetics = "fit_diff_kinetics"
        if diff_kinetics in adata.var.keys():
            if diff_kinetics in adata.uns["recover_dynamics"]:
                groupby = adata.uns["recover_dynamics"]["fit_diff_kinetics"]
            else:
                groupby = "clusters"
            clusters = adata.obs[groupby]
            for i, v in enumerate(
                    np.array(adata.var[diff_kinetics].values, dtype=str)):
                if len(v) > 0 and v != "nan":
                    idx = 1 - clusters.isin([a.strip() for a in v.split(",")])
                    adata.layers[vkey][:, i] *= idx
                    if mode == "dynamical":
                        adata.layers[f"{vkey}_u"][:, i] *= idx

    adata.uns[f"{vkey}_params"] = {
        "mode": mode,
        "fit_offset": fit_offset,
        "perc": perc
    }

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}', velocity vectors for each individual cell (adata.layers)"
    )

    return adata if copy else None
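# Usage sketch, assuming the function above is exposed as scv.tl.velocity:
# all modes write to the same keys, so downstream steps stay unchanged.
import scvelo as scv

scv.tl.velocity(adata, mode="stochastic")       # default, uses second-order moments
# scv.tl.velocity(adata, mode="deterministic")  # plain steady-state model
# scv.tl.recover_dynamics(adata)                # prerequisite for the dynamical model
# scv.tl.velocity(adata, mode="dynamical")
print(int(adata.var["velocity_genes"].sum()), "velocity genes selected")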
Beispiel #16
0
def paga(
    adata,
    groups=None,
    vkey="velocity",
    use_time_prior=True,
    root_key=None,
    end_key=None,
    threshold_root_end_prior=None,
    minimum_spanning_tree=True,
    copy=False,
):
    """PAGA graph with velocity-directed edges.

    Mapping out the coarse-grained connectivity structures of complex manifolds
    [Wolf19]_. By quantifying the connectivity of partitions (groups, clusters) of the
    single-cell graph, partition-based graph abstraction (PAGA) generates a much
    simpler abstracted graph (*PAGA graph*) of partitions, in which edge weights
    represent confidence in the presence of connections.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An annotated data matrix.
    groups : key for categorical in `adata.obs`, optional (default: 'clusters' or 'louvain')
        You can pass your predefined groups by choosing any categorical
        annotation of observations (`adata.obs`).
    vkey: `str` (default: `'velocity'`)
        Key for the velocity estimates used to direct the graph's edges.
    use_time_prior : `str` or bool, optional (default: True)
        Obs key for pseudo-time values.
        If True, 'velocity_pseudotime' is used if available.
    root_key : `str` or bool, optional (default: None)
        Obs key for root states.
    end_key : `str` or bool, optional (default: None)
        Obs key for end states.
    threshold_root_end_prior : `float` (default: 0.9)
        Threshold for root and final states priors, to be in the range of [0,1].
        Values above the threshold will be considered as terminal and included as prior.
    minimum_spanning_tree : bool, optional (default: True)
        Whether to prune the tree such that a path from A-to-B
        is removed if another more confident path exists.
    copy : `bool`, optional (default: `False`)
        Copy `adata` before computation and return a copy.
        Otherwise, perform computation inplace and return `None`.

    Returns
    -------
    connectivities: `.uns`
        The full adjacency matrix of the abstracted graph, weights correspond to
        confidence in the connectivities of partitions.
    connectivities_tree: `.uns`
        The adjacency matrix of the tree-like subgraph that best explains the topology.
    transitions_confidence: `.uns`
        The adjacency matrix of the abstracted directed graph, weights correspond to
        confidence in the transitions between partitions.
    """

    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )

    adata = adata.copy() if copy else adata
    strings_to_categoricals(adata)

    if groups is None:
        groups = ("clusters" if "clusters" in adata.obs.keys() else
                  "louvain" if "louvain" in adata.obs.keys() else None)
    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys(
    ):
        velocity_clusters(adata)
    if use_time_prior and not isinstance(use_time_prior, str):
        use_time_prior = "velocity_pseudotime"
        if use_time_prior not in adata.obs.keys():
            velocity_pseudotime(adata,
                                vkey=vkey,
                                root_key=root_key,
                                end_key=end_key)

    priors = [
        p for p in [use_time_prior, root_key, end_key]
        if p in adata.obs.keys()
    ]
    logg.info(
        "running PAGA",
        f"using priors: {priors}" if len(priors) > 0 else "",
        r=True,
    )
    paga = PAGA_tree(
        adata,
        groups,
        vkey=vkey,
        use_time_prior=use_time_prior,
        root_key=root_key,
        end_key=end_key,
        threshold_root_end_prior=threshold_root_end_prior,
        minimum_spanning_tree=minimum_spanning_tree,
    )

    if "paga" not in adata.uns:
        adata.uns["paga"] = {}

    paga.compute_connectivities()
    adata.uns["paga"]["connectivities"] = paga.connectivities
    adata.uns["paga"]["connectivities_tree"] = paga.connectivities_tree
    adata.uns[f"{groups}_sizes"] = np.array(paga.ns)

    paga.compute_transitions()
    adata.uns["paga"]["transitions_confidence"] = paga.transitions_confidence
    adata.uns["paga"]["threshold"] = paga.threshold
    adata.uns["paga"]["groups"] = groups

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n" +
        "    'paga/connectivities', connectivities adjacency (adata.uns)\n"
        "    'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
        "    'paga/transitions_confidence', velocity transitions (adata.uns)")

    return adata if copy else None
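# Usage sketch, assuming the function above is exposed as scv.tl.paga:
# requires a neighbor graph and, for directed transitions, a velocity graph.
import scvelo as scv

scv.tl.velocity_graph(adata)           # on a velocity-annotated AnnData
scv.tl.paga(adata, groups="clusters")
df = scv.get_df(adata, "paga/transitions_confidence", precision=2).T
print(df)  # cluster-to-cluster transition confidences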
Beispiel #17
0
def velocity_genes(
    data,
    vkey="velocity",
    min_r2=0.01,
    min_ratio=0.01,
    use_highly_variable=True,
    copy=False,
):
    """Estimates velocities in a gene-specific manner

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities.
    min_r2: `float` (default: 0.01)
        Minimum threshold for the coefficient of determination.
    min_ratio: `float` (default: 0.01)
        Minimum threshold for quantile regression un/spliced ratio.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    Updates `adata` attributes
    velocity_genes: `.var`
        genes to be used for further velocity analysis (velocity graph and embedding)
    """

    adata = data.copy() if copy else data
    if f"{vkey}_genes" not in adata.var.keys():
        velocity(adata, vkey)
    vgenes = np.ones(adata.n_vars, dtype=bool)

    if "Ms" in adata.layers.keys() and "Mu" in adata.layers.keys():
        vgenes &= np.max(adata.layers["Ms"] > 0, 0) > 0
        vgenes &= np.max(adata.layers["Mu"] > 0, 0) > 0

    if min_r2 is not None and f"{vkey}_r2" in adata.var.keys():
        vgenes &= adata.var[f"{vkey}_r2"] > min_r2

    if min_ratio is not None and f"{vkey}_qreg_ratio" in adata.var.keys():
        vgenes &= adata.var[f"{vkey}_qreg_ratio"] > min_ratio

    if use_highly_variable and "highly_variable" in adata.var.keys():
        vgenes &= adata.var["highly_variable"].values

    if np.sum(vgenes) < 2:
        logg.warn(
            "You seem to have very low signal in splicing dynamics.\n"
            "Consider reducing the thresholds and be cautious with interpretations.\n"
        )

    adata.var[f"{vkey}_genes"] = vgenes

    logg.info("Number of obtained velocity_genes:",
              np.sum(adata.var[f"{vkey}_genes"]))

    return adata if copy else None
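# Usage sketch, assuming the function above is exposed as scv.tl.velocity_genes:
# re-select velocity genes with stricter thresholds after running scv.tl.velocity.
import scvelo as scv

scv.tl.velocity_genes(adata, min_r2=0.05, min_ratio=0.05)
top_genes = adata.var_names[adata.var["velocity_genes"]][:10]
print(list(top_genes))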
Beispiel #18
0
def filter_and_normalize(
    data,
    min_counts=None,
    min_counts_u=None,
    min_cells=None,
    min_cells_u=None,
    min_shared_counts=None,
    min_shared_cells=None,
    n_top_genes=None,
    retain_genes=None,
    subset_highly_variable=True,
    flavor="seurat",
    log=True,
    layers_normalize=None,
    copy=False,
    **kwargs,
):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required to pass filtering (unspliced).
    min_shared_counts: `int`, optional (default: `None`)
        Minimum number of counts (both unspliced and spliced) required for a gene.
    min_shared_cells: `int`, optional (default: `None`)
        Minimum number of cells required to be expressed (both unspliced and spliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep.
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds.
    subset_highly_variable: `bool` (default: True)
        Whether to subset highly variable genes or to store in .var['highly_variable'].
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion.
        If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    layers_normalize: list of `str` (default: None)
        List of layers to be normalized.
        If set to None, the layers {'X', 'spliced', 'unspliced'} are considered for
        normalization upon testing whether they have already been normalized
        (by checking type of entries: int -> unprocessed, float -> processed).
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.
    **kwargs:
        Keyword arguments passed to pp.normalize_per_cell (e.g. counts_per_cell).

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """

    adata = data.copy() if copy else data

    if "spliced" not in adata.layers.keys(
    ) or "unspliced" not in adata.layers.keys():
        logg.warn("Could not find spliced / unspliced counts.")

    filter_genes(
        adata,
        min_counts=min_counts,
        min_counts_u=min_counts_u,
        min_cells=min_cells,
        min_cells_u=min_cells_u,
        min_shared_counts=min_shared_counts,
        min_shared_cells=min_shared_cells,
        retain_genes=retain_genes,
    )

    if layers_normalize is not None and "enforce" not in kwargs:
        kwargs["enforce"] = True
    normalize_per_cell(adata, layers=layers_normalize, **kwargs)

    if n_top_genes is not None:
        filter_genes_dispersion(
            adata,
            n_top_genes=n_top_genes,
            retain_genes=retain_genes,
            flavor=flavor,
            subset=subset_highly_variable,
        )

    log_advised = (
        np.allclose(adata.X[:10].sum(), adata.layers["spliced"][:10].sum())
        if "spliced" in adata.layers.keys()
        else True
    )

    if log and log_advised:
        log1p(adata)
        logg.info("Logarithmized X.")
    elif log and not log_advised:
        logg.warn("Did not modify X as it looks preprocessed already.")
    elif log_advised and not log:
        logg.warn("Consider logarithmizing X with `scv.pp.log1p` for better results.")

    return adata if copy else None
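# Usage sketch, assuming the function above is exposed as scv.pp.filter_and_normalize:
# one call covering the filtering, normalization and log-transform steps listed above.
import scvelo as scv

adata = scv.datasets.pancreas()
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)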
Beispiel #19
0
def terminal_states(
    data,
    vkey="velocity",
    modality="Ms",
    groupby=None,
    groups=None,
    self_transitions=False,
    eps=1e-3,
    random_state=0,
    copy=False,
    **kwargs,
):
    """Computes terminal states (root and end points).

    The end points and root cells are obtained as stationary states of the
    velocity-inferred transition matrix and its transposed, respectively,
    which is given by left eigenvectors corresponding to an eigenvalue of 1, i.e.

    .. math::
        μ^{\\textrm{end}}=μ^{\\textrm{end}} \\pi, \\quad
        μ^{\\textrm{root}}=μ^{\\textrm{root}} \\pi^{\\small \\textrm{T}}.

    .. code:: python

        scv.tl.terminal_states(adata)
        scv.pl.scatter(adata, color=['root_cells', 'end_points'])

    .. image:: https://user-images.githubusercontent.com/31883718/69496183-bcfdf300-0ecf-11ea-9aae-685300a0b1ba.png

    Alternatively, we recommend using :func:`cellrank.tl.terminal_states`,
    which provides an improved/generalized approach to identifying terminal states.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    modality: `str` (default: `'Ms'`)
        Layer used to calculate terminal states.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider. Only to be set, if each group is
        assumed to have a distinct lineage with an independent root and end point.
    groups: `str`, `list` or `np.ndarray` (default: `None`)
        Groups selected to find terminal states on. Must be an element of .obs[groupby].
        To be specified only for very distinct/disconnected clusters.
    self_transitions: `bool` (default: `False`)
        Allow transitions from one node to itself.
    eps: `float` (default: 1e-3)
        Tolerance for eigenvalue selection.
    random_state: `int` or None (default: 0)
        Seed used by the random number generator.
        If `None`, the global `RandomState` instance of `np.random` is used.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.
    **kwargs:
        Passed to scvelo.tl.transition_matrix(), e.g. basis, weight_diffusion.

    Returns
    -------
    root_cells: `.obs`
        score for each cell to be a root state.
    end_points: `.obs`
        score for each cell to be an end point.
    """  # noqa E501

    adata = data.copy() if copy else data
    verify_neighbors(adata)

    logg.info("computing terminal states", r=True)

    strings_to_categoricals(adata)
    if groupby is not None:
        logg.warn(
            "Only set groupby, when you have evident distinct clusters/lineages,"
            " each with an own root and end point.")

    kwargs.update({"self_transitions": self_transitions})
    categories = [None]
    if groupby is not None and groups is None:
        categories = adata.obs[groupby].cat.categories
    for cat in categories:
        groups = cat if cat is not None else groups
        cell_subset = groups_to_bool(adata, groups=groups, groupby=groupby)
        _adata = adata if groups is None else adata[cell_subset]
        connectivities = get_connectivities(_adata, "distances")

        T = transition_matrix(_adata, vkey=vkey, backward=True, **kwargs)
        eigvecs_roots = eigs(T,
                             eps=eps,
                             perc=[2, 98],
                             random_state=random_state)[1]
        roots = csr_matrix.dot(connectivities, eigvecs_roots).sum(1)
        roots = scale(np.clip(roots, 0, np.percentile(roots, 98)))
        roots = verify_roots(_adata, roots, modality)
        write_to_obs(adata, "root_cells", roots, cell_subset)

        T = transition_matrix(_adata, vkey=vkey, backward=False, **kwargs)
        eigvecs_ends = eigs(T,
                            eps=eps,
                            perc=[2, 98],
                            random_state=random_state)[1]
        ends = csr_matrix.dot(connectivities, eigvecs_ends).sum(1)
        ends = scale(np.clip(ends, 0, np.percentile(ends, 98)))
        write_to_obs(adata, "end_points", ends, cell_subset)

        n_roots, n_ends = eigvecs_roots.shape[1], eigvecs_ends.shape[1]
        groups_str = f" ({groups})" if isinstance(groups, str) else ""
        roots_str = f"{n_roots} {'regions' if n_roots > 1 else 'region'}"
        ends_str = f"{n_ends} {'regions' if n_ends > 1 else 'region'}"

        logg.info(f"    identified {roots_str} of root cells "
                  f"and {ends_str} of end points {groups_str}.")

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'root_cells', root cells of Markov diffusion process (adata.obs)\n"
        "    'end_points', end points of Markov diffusion process (adata.obs)")
    return adata if copy else None
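# Usage sketch, assuming the function above is exposed as scv.tl.terminal_states:
# root/end scores are written to adata.obs and can be ranked or plotted directly.
import scvelo as scv

scv.tl.terminal_states(adata)                # on a velocity-annotated AnnData
print(adata.obs["root_cells"].nlargest(5))   # most likely root cells
print(adata.obs["end_points"].nlargest(5))   # most likely end points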