Example #1
def score_genes_cell_cycle(adata, s_genes=None, g2m_genes=None, copy=False, **kwargs):
    """\
    Score cell cycle genes.

    Calculates scores and assigns a cell cycle phase (G1, S or G2M) using the list of
    cell cycle genes defined in Tirosh et al., 2016 (https://doi.org/10.1126/science.aad0501).

    Parameters
    ----------
    adata
        The annotated data matrix.
    s_genes
        List of genes associated with S phase.
    g2m_genes
        List of genes associated with G2M phase.
    copy
        Copy `adata` or modify it inplace.
    **kwargs
        Are passed to :func:`~scanpy.tl.score_genes`. `ctrl_size` cannot be
        overridden, as it is set to `min(len(s_genes), len(g2m_genes))`.

    Returns
    -------
    S_score: `adata.obs`, dtype `float64`
        The score for S phase for each cell.
    G2M_score: `adata.obs`, dtype `float64`
        The score for G2M phase for each cell.
    phase: `adata.obs`, dtype `object`
        The cell cycle phase (`S`, `G2M` or `G1`) for each cell.
    """

    logg.info("calculating cell cycle phase")
    from scanpy.tools._score_genes import score_genes

    adata = adata.copy() if copy else adata

    s_genes_, g2m_genes_ = get_phase_marker_genes(adata)
    if s_genes is None:
        s_genes = s_genes_
    if g2m_genes is None:
        g2m_genes = g2m_genes_

    ctrl_size = min(len(s_genes), len(g2m_genes))

    kwargs.update({"ctrl_size": ctrl_size})
    score_genes(adata, gene_list=s_genes, score_name="S_score", **kwargs)
    score_genes(adata, gene_list=g2m_genes, score_name="G2M_score", **kwargs)
    scores = adata.obs[["S_score", "G2M_score"]]

    phase = pd.Series("S", index=scores.index)  # default phase is S
    phase[scores.G2M_score > scores.S_score] = "G2M"  # G2M, if G2M is higher than S
    phase[np.all(scores < 0, axis=1)] = "G1"  # G1, if all scores are negative

    adata.obs["phase"] = phase
    logg.hint("    'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)")
    return adata if copy else None
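A minimal sketch of the phase-assignment rule implemented above, on hand-made scores (assumption: in practice the scores come from scanpy's `score_genes` applied to the Tirosh et al. gene lists):

import numpy as np
import pandas as pd

scores = pd.DataFrame(
    {"S_score": [0.8, -0.2, 0.1], "G2M_score": [0.1, -0.3, 0.9]},
    index=["cell_1", "cell_2", "cell_3"],
)
phase = pd.Series("S", index=scores.index)        # default phase is S
phase[scores.G2M_score > scores.S_score] = "G2M"  # G2M if it dominates
phase[np.all(scores < 0, axis=1)] = "G1"          # G1 if all scores negative
print(phase.tolist())  # ['S', 'G1', 'G2M']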
Example #2
def velocity_confidence_transition(data,
                                   vkey="velocity",
                                   scale=10,
                                   copy=False):
    """Computes confidences of velocity transitions.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `float` (default: 10)
        Scale parameter of gaussian kernel.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    velocity_confidence_transition: `.obs`
        Confidence of transition for each cell
    """

    adata = data.copy() if copy else data
    if vkey not in adata.layers.keys():
        raise ValueError("You need to run `tl.velocity` first.")

    X = np.array(adata.layers["Ms"])
    V = np.array(adata.layers[vkey])

    tmp_filter = np.invert(np.isnan(np.sum(V, axis=0)))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    if "spearmans_score" in adata.var.keys():
        tmp_filter &= adata.var["spearmans_score"].values > 0.1

    V = V[:, tmp_filter]
    X = X[:, tmp_filter]

    T = transition_matrix(adata, vkey=vkey, scale=scale)
    dX = T.dot(X) - X
    dX -= dX.mean(1)[:, None]
    V -= V.mean(1)[:, None]

    norms = l2_norm(dX, axis=1) * l2_norm(V, axis=1)
    norms += norms == 0

    adata.obs[f"{vkey}_confidence_transition"] = prod_sum(dX, V,
                                                          axis=1) / norms

    logg.hint(f"added '{vkey}_confidence_transition' (adata.obs)")

    return adata if copy else None
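A numpy sketch of the score computed above: the per-cell cosine between the transition-predicted displacement dX = T·X - X and the velocity V (assumption: a dense toy transition matrix; the real T comes from `transition_matrix(adata, vkey=vkey, scale=scale)`):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 3))             # cells x genes, smoothed expression
V = rng.normal(size=(5, 3))             # velocities
T = np.full((5, 5), 1 / 5)              # toy row-stochastic transition matrix

dX = T @ X - X                          # predicted change in expression
dX -= dX.mean(1, keepdims=True)         # center per cell, as above
Vc = V - V.mean(1, keepdims=True)

norms = np.linalg.norm(dX, axis=1) * np.linalg.norm(Vc, axis=1)
norms[norms == 0] = 1                   # guard against division by zero
confidence = (dX * Vc).sum(1) / norms   # cosine in [-1, 1]
print(confidence.round(3))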
Example #3
def velocity_clusters(
    data,
    vkey="velocity",
    match_with="clusters",
    sort_by="velocity_pseudotime",
    resolution=None,
    min_likelihood=None,
    copy=False,
):
    """Computes velocity clusters via louvain on velocities.

    .. code:: python

        scv.tl.velocity_clusters(adata)
        scv.pl.scatter(adata, color='velocity_clusters')

    .. image:: https://user-images.githubusercontent.com/31883718/69625627-484dc480-1047-11ea-847f-6607a3430427.png
       :width: 600px


    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    match_with: `str` (default: `'clusters'`)
        Match the names of the velocity clusters with the names of this key (.obs).
    sort_by: `str` or `None` (default: `'velocity_pseudotime'`)
        Sort velocity clusters by this key (.obs).
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity; 0.7 is used if `None`.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    velocity_clusters : `.obs`
        Clusters obtained from applying louvain modularity on velocity expression.
    """  # noqa E501

    adata = data.copy() if copy else data

    logg.info("computing velocity clusters", r=True)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(
            adata.layers["unspliced"]) else n_counts
        min_counts = min(50, np.percentile(n_counts, 50))
        tmp_filter &= np.ravel(n_counts > min_counts)

    if "r2" in adata.var.keys():
        r2 = adata.var.velocity_r2
        min_r2 = np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = np.percentile(dispersions, 20)
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys() and min_likelihood is not None:
        tmp_filter &= adata.var["fit_likelihood"] > min_likelihood

    from anndata import AnnData

    vdata = AnnData(adata.layers[vkey][:, tmp_filter])
    vdata.obs = adata.obs.copy()
    vdata.var = adata.var[tmp_filter].copy()

    if "highly_variable" in vdata.var.keys():
        vdata.var["highly_variable"] = np.array(vdata.var["highly_variable"],
                                                dtype=bool)

    import scanpy as sc

    logg.switch_verbosity("off", module="scanpy")
    sc.pp.pca(vdata, n_comps=20, svd_solver="arpack")
    sc.pp.neighbors(vdata, n_pcs=20)
    sc.tl.louvain(vdata, resolution=0.7 if resolution is None else resolution)
    logg.switch_verbosity("on", module="scanpy")

    if sort_by == "velocity_pseudotime" and sort_by not in adata.obs.keys():
        velocity_pseudotime(adata, vkey=vkey)
    if sort_by in vdata.obs.keys():
        vc = vdata.obs["louvain"]
        vc_cats = vc.cat.categories
        mean_times = [
            np.mean(vdata.obs[sort_by][vc == cat]) for cat in vc_cats
        ]
        vdata.obs["louvain"].cat.reorder_categories(
            vc_cats[np.argsort(mean_times)], inplace=True)

    if isinstance(match_with, str) and match_with in adata.obs.keys():
        from .utils import most_common_in_list

        vc = vdata.obs["louvain"]
        cats_nums = {cat: 0 for cat in adata.obs[match_with].cat.categories}
        for cat in vc.cat.categories:
            cells_in_cat = np.where(vc == cat)[0]
            new_cat = most_common_in_list(adata.obs[match_with][cells_in_cat])
            cats_nums[new_cat] += 1
            vc = vc.cat.rename_categories(
                {cat: f"{new_cat} ({cats_nums[new_cat]})"})
        vdata.obs["louvain"] = vc
    else:
        vdata.obs["louvain"].cat.categories = np.arange(
            len(vdata.obs["louvain"].cat.categories))
    adata.obs[f"{vkey}_clusters"] = vdata.obs["louvain"].copy()

    del vdata

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_clusters', "
        f"clusters based on louvain modularity on velocity vector field (adata.obs)"
    )

    return adata if copy else None
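A usage sketch, assuming the scvelo package (this function corresponds to `scv.tl.velocity_clusters`) and its bundled pancreas dataset:

import scvelo as scv

adata = scv.datasets.pancreas()
scv.pp.filter_and_normalize(adata, n_top_genes=2000)
scv.pp.moments(adata)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)   # needed for the velocity_pseudotime sorting
scv.tl.velocity_clusters(adata, match_with="clusters")
print(adata.obs["velocity_clusters"].value_counts())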
Example #4
def rank_velocity_genes(
    data,
    vkey="velocity",
    n_genes=100,
    groupby=None,
    match_with=None,
    resolution=None,
    min_counts=None,
    min_r2=None,
    min_corr=None,
    min_dispersion=None,
    min_likelihood=None,
    copy=False,
):
    """Rank genes for velocity characterizing groups.

    This applies a differential expression test (Welch t-test with overestimated
    variance to be conservative) on velocity expression, to find genes in a cluster that
    show dynamics that is transcriptionally regulated differently compared to all other
    clusters (e.g. induction in that cluster and homeostasis in remaining population).
    If no clusters are given, it priorly computes velocity clusters by applying louvain
    modularity on velocity expression.

    .. code:: python

        scv.tl.rank_velocity_genes(adata, groupby='clusters')
        scv.pl.scatter(
            adata, basis=adata.uns['rank_velocity_genes']['names']['Beta'][:3]
        )
        pd.DataFrame(adata.uns['rank_velocity_genes']['names']).head()

    .. image:: https://user-images.githubusercontent.com/31883718/69626017-11c47980-1048-11ea-89f4-df3769df5ad5.png
       :width: 600px

    .. image:: https://user-images.githubusercontent.com/31883718/69626572-30774000-1049-11ea-871f-e8a30c42f10e.png
       :width: 600px

    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    n_genes : `int`, optional (default: 100)
        The number of genes that appear in the returned tables.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    match_with: `str` or `None` (default: `None`)
        adata.obs key to separatively rank velocities on.
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity.
    min_counts: `float` (default: None)
        Minimum count of genes for consideration.
    min_r2: `float` (default: None)
        Minimum r2 value of genes for consideration.
    min_corr: `float` (default: None)
        Minimum Spearmans correlation coefficient between spliced and unspliced.
    min_dispersion: `float` (default: None)
        Minimum dispersion norm value of genes for consideration.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    rank_velocity_genes : `.uns`
        Structured array to be indexed by group id storing the gene
        names. Ordered according to scores.
    velocity_score : `.var`
        Storing the score for each gene for each group. Ordered according to scores.
    """  # noqa E501

    adata = data.copy() if copy else data

    if groupby is None or groupby == "velocity_clusters":
        velocity_clusters(
            adata,
            vkey=vkey,
            match_with=match_with,
            resolution=resolution,
            min_likelihood=min_likelihood,
        )
        groupby = f"{vkey}_clusters"

    logg.info("ranking velocity genes", r=True)

    if "spearmans_score" not in adata.var.keys():
        corr = vcorrcoef(
            np.array(adata.layers["Ms"]).T,
            np.array(adata.layers["Mu"]).T,
            mode="spearmans",
        )
        adata.var["spearmans_score"] = np.clip(corr, 0, None)

    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(
            adata.layers["unspliced"]) else n_counts
        min_counts = (min(50, np.percentile(n_counts, 50))
                      if min_counts is None else min_counts)
        tmp_filter &= np.ravel(n_counts > min_counts)

    if f"{vkey}_r2" in adata.var.keys():
        r2 = adata.var[f"{vkey}_r2"]
        min_r2 = 0.1 if min_r2 is None else min_r2  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "spearmans_score" in adata.var.keys():
        corr = adata.var["spearmans_score"]
        min_corr = (0.1 if min_corr is None else min_corr
                    )  # np.percentile(r2[r2 > 0], 50)
        tmp_filter &= corr > min_corr

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = 0 if min_dispersion is None else min_dispersion
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys():
        fit_likelihood = adata.var["fit_likelihood"]
        min_likelihood = 0.1 if min_likelihood is None else min_likelihood
        tmp_filter &= fit_likelihood > min_likelihood

    X = adata[:, tmp_filter].layers[vkey]
    groups, groups_masks = select_groups(adata, key=groupby)

    n_groups = groups_masks.shape[0]
    sizes = groups_masks.sum(1)

    mean, var = np.zeros((n_groups, X.shape[1])), np.zeros(
        (n_groups, X.shape[1]))
    for i, mask in enumerate(groups_masks):
        mean[i], var[i] = get_mean_var(X[mask])

    # test each against the union of all other groups
    rankings_gene_names, rankings_gene_scores = [], []
    for i in range(n_groups):
        mask_rest = ~groups_masks[i]
        mean_rest, var_rest = get_mean_var(X[mask_rest])
        size_rest = sizes[i]  # overestimated variance: reuse the group size for the rest

        scores = (mean[i] - mean_rest) / np.sqrt(var[i] / sizes[i] +
                                                 var_rest / size_rest)
        scores = np.nan_to_num(scores)

        # equivalent to, but much faster than, np.argsort(scores)[-n_genes:]
        if n_genes > X.shape[1]:
            n_genes = X.shape[1]
        idx = np.argpartition(scores, -n_genes)[-n_genes:]
        idx = idx[np.argsort(scores[idx])[::-1]]

        rankings_gene_names.append(adata[:, tmp_filter].var_names[idx].values)
        rankings_gene_scores.append(scores[idx])

    rankings_gene_names = np.array([list(n) for n in rankings_gene_names])
    rankings_gene_scores = np.array([list(n) for n in rankings_gene_scores])

    all_names = rankings_gene_names.T.flatten()
    all_scores = rankings_gene_scores.T.flatten()
    vscore = np.zeros(adata.n_vars)
    for i, name in enumerate(adata.var_names):
        if name in all_names:
            vscore[i] = all_scores[np.where(name == all_names)[0][0]]
    adata.var["velocity_score"] = vscore

    key = "rank_velocity_genes"
    if key not in adata.uns.keys():
        adata.uns[key] = {}

    adata.uns[key] = {
        "names":
        np.rec.fromarrays([n for n in rankings_gene_names],
                          dtype=[(f"{rn}", "U50") for rn in groups]),
        "scores":
        np.rec.fromarrays(
            [n.round(2) for n in rankings_gene_scores],
            dtype=[(f"{rn}", "float32") for rn in groups],
        ),
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "t-test_overestim_var",
            "use_raw": True,
        },
    }

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{key}', sorted scores by group ids (adata.uns) \n"
        "    'spearmans_score', spearmans correlation scores (adata.var)")

    return adata if copy else None
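A minimal numpy sketch of the ranking statistic used above: a t-score with overestimated variance, where the "rest" group reuses the in-group size, enlarging the denominator and making the test conservative (assumption: toy velocity expression):

import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 4))                 # cells x genes (velocities)
mask = np.zeros(100, dtype=bool)
mask[:40] = True                              # one group vs the rest

mean_g, var_g = X[mask].mean(0), X[mask].var(0)
mean_r, var_r = X[~mask].mean(0), X[~mask].var(0)
size_g = mask.sum()
size_rest = size_g                            # overestimated variance: reuse group size

scores = (mean_g - mean_r) / np.sqrt(var_g / size_g + var_r / size_rest)
top = np.argsort(scores)[::-1]                # highest score first
print(top, scores[top].round(2))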
Example #5
def velocity_graph(
    data,
    vkey="velocity",
    xkey="Ms",
    tkey=None,
    basis=None,
    n_neighbors=None,
    n_recurse_neighbors=None,
    random_neighbors_at_max=None,
    sqrt_transform=None,
    variance_stabilization=None,
    gene_subset=None,
    compute_uncertainties=None,
    approx=None,
    mode_neighbors="distances",
    copy=False,
    n_jobs=None,
    backend="loky",
):
    """Computes velocity graph based on cosine similarities.

    The cosine similarities are computed between velocities and potential cell state
    transitions, i.e. it measures how well a corresponding change in gene expression
    :math:`\\delta_{ij} = x_j - x_i` matches the predicted change according to the
    velocity vector :math:`\\nu_i`,

    .. math::
        \\pi_{ij} = \\cos\\angle(\\delta_{ij}, \\nu_i)
        = \\frac{\\delta_{ij}^T \\nu_i}{\\left\\lVert\\delta_{ij}\\right\\rVert
        \\left\\lVert \\nu_i \\right\\rVert}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    xkey: `str` (default: `'Ms'`)
        Layer key to extract count data from.
    tkey: `str` (default: `None`)
        Observation key to extract time data from.
    basis: `str` (default: `None`)
        Basis / Embedding to use.
    n_neighbors: `int` or `None` (default: None)
        Use fixed number of neighbors or do recursive neighbor search (if `None`).
    n_recurse_neighbors: `int` (default: `None`)
        Number of recursions for neighbors search. Defaults to
        2 if mode_neighbors is 'distances', and 1 if mode_neighbors is 'connectivities'.
    random_neighbors_at_max: `int` or `None` (default: `None`)
        If number of iterative neighbors for an individual cell is higher than this
        threshold, a random selection of such are chosen as reference neighbors.
    sqrt_transform: `bool` (default: `False`)
        Whether to variance-transform the cell state changes
        and velocities before computing cosine similarities.
    variance_stabilization: `bool` or `None` (default: `None`)
        Alias for `sqrt_transform`; used if `sqrt_transform` is not set.
    gene_subset: `list` of `str`, subset of adata.var_names or `None` (default: `None`)
        Subset of genes to compute velocity graph on exclusively.
    compute_uncertainties: `bool` (default: `None`)
        Whether to compute uncertainties along with cosine correlation.
    approx: `bool` or `None` (default: `None`)
        If `True`, the first 30 PCs are used instead of the full count matrix.
    mode_neighbors: `str` (default: `'distances'`)
        Determines the type of KNN graph used. Options are 'distances' or
        'connectivities'. The latter yields a symmetric graph.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.
    n_jobs: `int` or `None` (default: `None`)
        Number of parallel jobs.
    backend: `str` (default: "loky")
        Backend used for multiprocessing. See :class:`joblib.Parallel` for valid
        options.

    Returns
    -------
    velocity_graph: `.uns`
        sparse matrix with correlations of cell state transitions with velocities
    """

    adata = data.copy() if copy else data
    verify_neighbors(adata)
    if vkey not in adata.layers.keys():
        velocity(adata, vkey=vkey)
    if sqrt_transform is None:
        sqrt_transform = variance_stabilization

    vgraph = VelocityGraph(
        adata,
        vkey=vkey,
        xkey=xkey,
        tkey=tkey,
        basis=basis,
        n_neighbors=n_neighbors,
        approx=approx,
        n_recurse_neighbors=n_recurse_neighbors,
        random_neighbors_at_max=random_neighbors_at_max,
        sqrt_transform=sqrt_transform,
        gene_subset=gene_subset,
        compute_uncertainties=compute_uncertainties,
        report=True,
        mode_neighbors=mode_neighbors,
    )

    if isinstance(basis, str):
        logg.warn(
            f"The velocity graph is computed on {basis} embedding coordinates.\n"
            f"        Consider computing the graph in an unbiased manner \n"
            f"        on full expression space by not specifying basis.\n")

    n_jobs = get_n_jobs(n_jobs=n_jobs)
    logg.info(
        f"computing velocity graph (using {n_jobs}/{os.cpu_count()} cores)",
        r=True)
    vgraph.compute_cosines(n_jobs=n_jobs, backend=backend)

    adata.uns[f"{vkey}_graph"] = vgraph.graph
    adata.uns[f"{vkey}_graph_neg"] = vgraph.graph_neg

    if vgraph.uncertainties is not None:
        adata.uns[f"{vkey}_graph_uncertainties"] = vgraph.uncertainties

    adata.obs[f"{vkey}_self_transition"] = vgraph.self_prob

    if f"{vkey}_params" in adata.uns.keys():
        if "embeddings" in adata.uns[f"{vkey}_params"]:
            del adata.uns[f"{vkey}_params"]["embeddings"]
    else:
        adata.uns[f"{vkey}_params"] = {}
    adata.uns[f"{vkey}_params"]["mode_neighbors"] = mode_neighbors
    adata.uns[f"{vkey}_params"][
        "n_recurse_neighbors"] = vgraph.n_recurse_neighbors

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}_graph', sparse matrix with cosine correlations (adata.uns)"
    )

    return adata if copy else None
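A numpy sketch of the cosine similarity defined above, for a single cell-to-neighbor transition (assumption: hand-made vectors):

import numpy as np

x_i = np.array([1.0, 0.0, 2.0])     # current cell state
x_j = np.array([1.5, 0.5, 2.0])     # candidate neighbor state
nu_i = np.array([0.4, 0.3, -0.1])   # velocity of cell i

delta = x_j - x_i                   # potential cell state transition
pi_ij = delta @ nu_i / (np.linalg.norm(delta) * np.linalg.norm(nu_i))
print(round(pi_ij, 3))              # close to 1: transition matches velocity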
Example #6
def velocity_confidence(data, vkey="velocity", copy=False):
    """Computes confidences of velocities.

    .. code:: python

        scv.tl.velocity_confidence(adata)
        scv.pl.scatter(adata, color='velocity_confidence', perc=[2,98])

    .. image:: https://user-images.githubusercontent.com/31883718/69626334-b6df5200-1048-11ea-9171-495845c5bc7a.png
       :width: 600px


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    velocity_length: `.obs`
        Length of the velocity vectors for each individual cell
    velocity_confidence: `.obs`
        Confidence for each cell
    """  # noqa E501

    adata = data.copy() if copy else data
    if vkey not in adata.layers.keys():
        raise ValueError("You need to run `tl.velocity` first.")

    V = np.array(adata.layers[vkey])

    tmp_filter = np.invert(np.isnan(np.sum(V, axis=0)))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    if "spearmans_score" in adata.var.keys():
        tmp_filter &= adata.var["spearmans_score"].values > 0.1

    V = V[:, tmp_filter]

    V -= V.mean(1)[:, None]
    V_norm = l2_norm(V, axis=1)
    R = np.zeros(adata.n_obs)

    indices = get_indices(dist=get_neighs(adata, "distances"))[0]
    for i in range(adata.n_obs):
        Vi_neighs = V[indices[i]]
        Vi_neighs -= Vi_neighs.mean(1)[:, None]
        R[i] = np.mean(
            np.einsum("ij, j", Vi_neighs, V[i]) /
            (l2_norm(Vi_neighs, axis=1) * V_norm[i])[None, :])

    adata.obs[f"{vkey}_length"] = V_norm.round(2)
    adata.obs[f"{vkey}_confidence"] = np.clip(R, 0, None)

    logg.hint(f"added '{vkey}_length' (adata.obs)")
    logg.hint(f"added '{vkey}_confidence' (adata.obs)")

    if f"{vkey}_confidence_transition" not in adata.obs.keys():
        velocity_confidence_transition(adata, vkey)

    return adata if copy else None
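A numpy sketch of the confidence above: the mean cosine similarity between a cell's (centered) velocity and the velocities of its nearest neighbors, clipped at zero (assumption: hand-made velocities and kNN indices):

import numpy as np

rng = np.random.default_rng(2)
V = rng.normal(size=(6, 3))                     # cells x genes
V -= V.mean(1, keepdims=True)                   # center per cell, as above
V_norm = np.linalg.norm(V, axis=1)

neighbors_of_0 = np.array([1, 2, 3])            # toy kNN indices for cell 0
Vn = V[neighbors_of_0]
cos = Vn @ V[0] / (np.linalg.norm(Vn, axis=1) * V_norm[0])
confidence_0 = max(cos.mean(), 0.0)             # clipped at 0, as above
print(round(confidence_0, 3))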
Example #7
def velocity_embedding(
    data,
    basis=None,
    vkey="velocity",
    scale=10,
    self_transitions=True,
    use_negative_cosines=True,
    direct_pca_projection=None,
    retain_scale=False,
    autoscale=True,
    all_comps=True,
    T=None,
    copy=False,
):
    """Projects the single cell velocities into any embedding.

    Given the normalized difference of the embedding positions
    :math:`\\tilde \\delta_{ij} = \\frac{x_j-x_i}{\\left\\lVert x_j-x_i \\right\\rVert}`,
    the projections are obtained as expected displacements with respect to the
    transition matrix :math:`\\tilde \\pi_{ij}` as

    .. math::
        \\tilde \\nu_i = E_{\\tilde \\pi_{i\\cdot}} [\\tilde \\delta_{i \\cdot}]
        = \\sum_{j \\neq i} \\left( \\tilde \\pi_{ij} - \\frac1n \\right)
        \\tilde \\delta_{ij}.


    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    basis: `str` (default: `None`)
        Which embedding to use; if `None`, the last computed embedding
        among 'pca', 'tsne' and 'umap' is used.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `int` (default: 10)
        Scale parameter of gaussian kernel for transition matrix.
    self_transitions: `bool` (default: `True`)
        Whether to allow self transitions, based on the confidences of transitioning to
        neighboring cells.
    use_negative_cosines: `bool` (default: `True`)
        Whether to project cell-to-cell transitions with negative cosines into
        negative/opposite direction.
    direct_pca_projection: `bool` (default: `None`)
        Whether to directly project the velocities into PCA space,
        thus skipping the velocity graph.
    retain_scale: `bool` (default: `False`)
        Whether to retain scale from high dimensional space in embedding.
    autoscale: `bool` (default: `True`)
        Whether to scale the embedded velocities by a scalar multiplier,
        which simply ensures that the arrows in the embedding are properly scaled.
    all_comps: `bool` (default: `True`)
        Whether to compute the velocities on all embedding components.
    T: `csr_matrix` (default: `None`)
        Allows the user to directly pass a transition matrix.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity_umap: `.obsm`
        coordinates of velocity projection on embedding (e.g., basis='umap')
    """

    adata = data.copy() if copy else data

    if basis is None:
        keys = [
            key for key in ["pca", "tsne", "umap"] if f"X_{key}" in adata.obsm.keys()
        ]
        if len(keys) > 0:
            basis = "pca" if direct_pca_projection else keys[-1]
        else:
            raise ValueError("No basis specified")

    if f"X_{basis}" not in adata.obsm_keys():
        raise ValueError("You need to compute the embedding first.")

    if direct_pca_projection and "pca" in basis:
        logg.warn(
            "Directly projecting velocities into PCA space is for exploratory analysis "
            "on principal components.\n"
            "         It does not reflect the actual velocity field from high "
            "dimensional gene expression space.\n"
            "         To visualize velocities, consider applying "
            "`direct_pca_projection=False`.\n"
        )

    logg.info("computing velocity embedding", r=True)

    V = np.array(adata.layers[vkey])
    vgenes = np.ones(adata.n_vars, dtype=bool)
    if f"{vkey}_genes" in adata.var.keys():
        vgenes &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    vgenes &= ~np.isnan(V.sum(0))
    V = V[:, vgenes]

    if direct_pca_projection and "pca" in basis:
        PCs = adata.varm["PCs"] if all_comps else adata.varm["PCs"][:, :2]
        PCs = PCs[vgenes]

        X_emb = adata.obsm[f"X_{basis}"]
        V_emb = (V - V.mean(0)).dot(PCs)

    else:
        X_emb = (
            adata.obsm[f"X_{basis}"] if all_comps else adata.obsm[f"X_{basis}"][:, :2]
        )
        V_emb = np.zeros(X_emb.shape)

        T = (
            transition_matrix(
                adata,
                vkey=vkey,
                scale=scale,
                self_transitions=self_transitions,
                use_negative_cosines=use_negative_cosines,
            )
            if T is None
            else T
        )
        T.setdiag(0)
        T.eliminate_zeros()

        densify = adata.n_obs < 1e4
        TA = T.A if densify else None

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for i in range(adata.n_obs):
                indices = T[i].indices
                dX = X_emb[indices] - X_emb[i, None]  # shape (n_neighbors, 2)
                if not retain_scale:
                    dX /= l2_norm(dX)[:, None]
                dX[np.isnan(dX)] = 0  # zero diff in a steady-state
                probs = TA[i, indices] if densify else T[i].data
                V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0)

        if retain_scale:
            X = (
                adata.layers["Ms"]
                if "Ms" in adata.layers.keys()
                else adata.layers["spliced"]
            )
            delta = T.dot(X[:, vgenes]) - X[:, vgenes]
            if issparse(delta):
                delta = delta.A
            cos_proj = (V * delta).sum(1) / l2_norm(delta)
            V_emb *= np.clip(cos_proj[:, None] * 10, 0, 1)

    if autoscale:
        V_emb /= 3 * quiver_autoscale(X_emb, V_emb)

    if f"{vkey}_params" in adata.uns.keys():
        adata.uns[f"{vkey}_params"]["embeddings"] = (
            []
            if "embeddings" not in adata.uns[f"{vkey}_params"]
            else list(adata.uns[f"{vkey}_params"]["embeddings"])
        )
        adata.uns[f"{vkey}_params"]["embeddings"].extend([basis])

    vkey += f"_{basis}"
    adata.obsm[vkey] = V_emb

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added\n" f"    '{vkey}', embedded velocity vectors (adata.obsm)")

    return adata if copy else None
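A numpy sketch of the projection formula above for a single cell: the embedded velocity is the expected, direction-normalized displacement towards the cell's graph neighbors (assumption: hand-made embedding coordinates and transition probabilities):

import numpy as np

X_emb = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
indices = np.array([1, 2, 3])               # neighbors of cell 0 in the graph
probs = np.array([0.5, 0.3, 0.2])           # transition probabilities T[0, indices]

dX = X_emb[indices] - X_emb[0]              # displacements in the embedding
dX /= np.linalg.norm(dX, axis=1)[:, None]   # keep directions only
v_emb_0 = probs @ dX - probs.mean() * dX.sum(0)
print(v_emb_0.round(3))                     # expected displacement of cell 0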
Example #8
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and
    yield the same result as scanpy [Wolf18]_. Connectivities are computed with
    adaptive kernel width as proposed in Haghverdi et al. 2016 (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors`-th neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {'umap', 'hnsw', 'sklearn'}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric’s name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """

    adata = adata.copy() if copy else adata

    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

        n_duplicate_cells = len(get_duplicate_cells(adata))
        if n_duplicate_cells > 0:
            logg.warn(
                f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
                "Consider removing these via pp.remove_duplicate_cells.",
            )

    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )

    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )

    else:
        logg.switch_verbosity("off", module="scanpy")
        with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
            warnings.simplefilter("ignore")
            neighbors = Neighbors(adata)
            neighbors.compute_neighbors(
                n_neighbors=n_neighbors,
                knn=knn,
                n_pcs=n_pcs,
                method=method,
                use_rep=use_rep,
                random_state=random_state,
                metric=metric,
                metric_kwds=metric_kwds,
                write_knn_indices=True,
            )
        logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info("    finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None
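A minimal sketch of the 'sklearn' branch above (assumption: scikit-learn is installed; the real function additionally converts the distances and indices into UMAP-style connectivities):

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(3)
X = rng.normal(size=(50, 10))              # cells x PCs

nn = NearestNeighbors(n_neighbors=29)      # n_neighbors - 1: self is excluded
nn.fit(X)
knn_distances, knn_indices = nn.kneighbors()
print(knn_distances.shape, knn_indices.shape)   # (50, 29) each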
Example #9
def moments(
    data,
    n_neighbors=30,
    n_pcs=None,
    mode="connectivities",
    method="umap",
    use_rep=None,
    use_highly_variable=True,
    copy=False,
):
    """Computes moments for velocity estimation.

    First-/second-order moments are computed for each cell across its nearest neighbors,
    where the neighbor graph is obtained from euclidean distances in PCA space.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use.
    n_pcs: `int` (default: None)
        Number of principal components to use.
        If not specified, the full space of a pre-computed PCA is used,
        or 30 components are used when PCA is computed internally.
    mode: `'connectivities'` or `'distances'`  (default: `'connectivities'`)
        Whether to use neighbor connectivities or distances for moment computation.
    method : {'umap', 'hnsw', 'sklearn', `None`}  (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        Connectivities are computed with adaptive kernel width as proposed in
        Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for `.n_vars` < 50, `.X` is used, otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """

    adata = data.copy() if copy else data

    layers = [
        layer for layer in {"spliced", "unspliced"} if layer in adata.layers
    ]
    if any([not_yet_normalized(adata.layers[layer]) for layer in layers]):
        normalize_per_cell(adata)

    if n_neighbors is not None and n_neighbors > get_n_neighs(adata):
        neighbors(
            adata,
            n_neighbors=n_neighbors,
            use_rep=use_rep,
            use_highly_variable=use_highly_variable,
            n_pcs=n_pcs,
            method=method,
        )
    verify_neighbors(adata)

    if "spliced" not in adata.layers.keys(
    ) or "unspliced" not in adata.layers.keys():
        logg.warn(
            "Skipping moments, because un/spliced counts were not found.")
    else:
        logg.info(f"computing moments based on {mode}", r=True)
        connectivities = get_connectivities(adata,
                                            mode,
                                            n_neighbors=n_neighbors,
                                            recurse_neighbors=False)

        adata.layers["Ms"] = (csr_matrix.dot(
            connectivities,
            csr_matrix(adata.layers["spliced"])).astype(np.float32).A)
        adata.layers["Mu"] = (csr_matrix.dot(
            connectivities,
            csr_matrix(adata.layers["unspliced"])).astype(np.float32).A)
        # if renormalize: normalize_per_cell(adata, layers={'Ms', 'Mu'}, enforce=True)

        logg.info("    finished",
                  time=True,
                  end=" " if settings.verbosity > 2 else "\n")
        logg.hint(
            "added \n"
            "    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)"
        )
    return adata if copy else None
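A numpy/scipy sketch of the moment computation above: the first-order moment is the neighbor-smoothed count matrix, connectivities · counts (assumption: a hand-made row-stochastic connectivity matrix that includes self-weights):

import numpy as np
from scipy.sparse import csr_matrix

spliced = csr_matrix(np.array([[1.0, 0.0], [3.0, 1.0], [5.0, 2.0]]))
C = csr_matrix(np.array([                 # row-stochastic neighbor weights
    [0.5, 0.5, 0.0],
    [0.25, 0.5, 0.25],
    [0.0, 0.5, 0.5],
]))
Ms = csr_matrix.dot(C, spliced).astype(np.float32).toarray()  # as `.A` above
print(Ms)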
Example #10
def paga(
    adata,
    groups=None,
    vkey="velocity",
    use_time_prior=True,
    root_key=None,
    end_key=None,
    threshold_root_end_prior=None,
    minimum_spanning_tree=True,
    copy=False,
):
    """PAGA graph with velocity-directed edges.

    Mapping out the coarse-grained connectivity structures of complex manifolds
    [Wolf19]_. By quantifying the connectivity of partitions (groups, clusters) of the
    single-cell graph, partition-based graph abstraction (PAGA) generates a much
    simpler abstracted graph (*PAGA graph*) of partitions, in which edge weights
    represent confidence in the presence of connections.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An annotated data matrix.
    groups : key for categorical in `adata.obs`, optional (default: `None`)
        You can pass your predefined groups by choosing any categorical
        annotation of observations (`adata.obs`). If `None`, 'clusters' is
        used if present, otherwise 'louvain'.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`.
    use_time_prior : `str` or bool, optional (default: True)
        Obs key for pseudo-time values.
        If True, 'velocity_pseudotime' is used if available.
    root_key : `str` or bool, optional (default: None)
        Obs key for root states.
    end_key : `str` or bool, optional (default: None)
        Obs key for end states.
    threshold_root_end_prior : `float` (default: 0.9)
        Threshold for root and final states priors, to be in the range of [0,1].
        Values above the threshold will be considered as terminal and included as prior.
    minimum_spanning_tree : bool, optional (default: True)
        Whether to prune the tree such that a path from A-to-B
        is removed if another more confident path exists.
    copy : `bool`, optional (default: `False`)
        Copy `adata` before computation and return a copy.
        Otherwise, perform computation inplace and return `None`.

    Returns
    -------
    connectivities: `.uns`
        The full adjacency matrix of the abstracted graph, weights correspond to
        confidence in the connectivities of partitions.
    connectivities_tree: `.uns`
        The adjacency matrix of the tree-like subgraph that best explains the topology.
    transitions_confidence: `.uns`
        The adjacency matrix of the abstracted directed graph, weights correspond to
        confidence in the transitions between partitions.
    """

    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )

    adata = adata.copy() if copy else adata
    strings_to_categoricals(adata)

    if groups is None:
        groups = ("clusters" if "clusters" in adata.obs.keys() else
                  "louvain" if "louvain" in adata.obs.keys() else None)
    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys(
    ):
        velocity_clusters(adata)
    if use_time_prior and not isinstance(use_time_prior, str):
        use_time_prior = "velocity_pseudotime"
        if use_time_prior not in adata.obs.keys():
            velocity_pseudotime(adata,
                                vkey=vkey,
                                root_key=root_key,
                                end_key=end_key)

    priors = [
        p for p in [use_time_prior, root_key, end_key]
        if p in adata.obs.keys()
    ]
    logg.info(
        "running PAGA",
        f"using priors: {priors}" if len(priors) > 0 else "",
        r=True,
    )
    paga = PAGA_tree(
        adata,
        groups,
        vkey=vkey,
        use_time_prior=use_time_prior,
        root_key=root_key,
        end_key=end_key,
        threshold_root_end_prior=threshold_root_end_prior,
        minimum_spanning_tree=minimum_spanning_tree,
    )

    if "paga" not in adata.uns:
        adata.uns["paga"] = {}

    paga.compute_connectivities()
    adata.uns["paga"]["connectivities"] = paga.connectivities
    adata.uns["paga"]["connectivities_tree"] = paga.connectivities_tree
    adata.uns[f"{groups}_sizes"] = np.array(paga.ns)

    paga.compute_transitions()
    adata.uns["paga"]["transitions_confidence"] = paga.transitions_confidence
    adata.uns["paga"]["threshold"] = paga.threshold
    adata.uns["paga"]["groups"] = groups

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n" +
        "    'paga/connectivities', connectivities adjacency (adata.uns)\n"
        "    'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
        "    'paga/transitions_confidence', velocity transitions (adata.uns)")

    return adata if copy else None
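A usage sketch, assuming the scvelo package (this function corresponds to `scv.tl.paga`) and that velocities and the velocity graph have already been computed:

import scvelo as scv

adata = scv.datasets.pancreas()
scv.pp.filter_and_normalize(adata, n_top_genes=2000)
scv.pp.moments(adata)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)
scv.tl.paga(adata, groups="clusters")
print(adata.uns["paga"]["transitions_confidence"])   # directed, velocity-based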
Example #11
def velocity(
    data,
    vkey="velocity",
    mode="stochastic",
    fit_offset=False,
    fit_offset2=False,
    filter_genes=False,
    groups=None,
    groupby=None,
    groups_for_fit=None,
    constrain_ratio=None,
    use_raw=False,
    use_latent_time=None,
    perc=[5, 95],
    min_r2=1e-2,
    min_likelihood=1e-3,
    r2_adjusted=None,
    use_highly_variable=True,
    diff_kinetics=None,
    copy=False,
    **kwargs,
):
    """Estimates velocities in a gene-specific manner.

    The steady-state model [Manno18]_ determines velocities by quantifying how
    observations deviate from a presumed steady-state equilibrium ratio of unspliced to
    spliced mRNA levels. This steady-state ratio is obtained by performing a linear
    regression restricting the input data to the extreme quantiles. By including
    second-order moments, the stochastic model [Bergen19]_ exploits not only the balance
    of unspliced to spliced mRNA levels but also their covariation. By contrast, the
    likelihood-based dynamical model [Bergen19]_ solves the full splicing kinetics and
    generalizes RNA velocity estimation to transient systems. It is also capable of
    capturing non-observed steady states.

    .. image:: https://user-images.githubusercontent.com/31883718/69636491-ff057100-1056-11ea-90b7-d04098112ce1.png

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities
        for `velocity_graph` and `velocity_embedding`.
    mode: `'deterministic'`, `'stochastic'` or `'dynamical'` (default: `'stochastic'`)
        Whether to run the estimation using the steady-state/deterministic,
        stochastic or dynamical model of transcriptional dynamics.
        The dynamical model requires to run `tl.recover_dynamics` first.
    fit_offset: `bool` (default: `False`)
        Whether to fit with offset for first order moment dynamics.
    fit_offset2: `bool`, (default: `False`)
        Whether to fit with offset for second order moment dynamics.
    filter_genes: `bool` (default: `False`)
        Whether to remove genes that are not used for further velocity analysis.
    groups: `str`, `list` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which velocity analysis shall be restricted.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    groups_for_fit: `str`, `list` or `np.ndarray` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which steady-state fitting shall be restricted.
    constrain_ratio: `float` or tuple of type `float` or `None` (default: `None`)
        Bounds for the steady-state ratio.
    use_raw: `bool` (default: `False`)
        Whether to use raw data for estimation.
    use_latent_time: `bool` or `None` (default: `None`)
        Whether to use latent time as a regularization for velocity estimation.
    perc: `float` or pair of `float`s (default: `[5, 95]`)
        Percentile(s), e.g. 98, for the extreme quantile fit.
    min_r2: `float` (default: 0.01)
        Minimum threshold for the coefficient of determination.
    min_likelihood: `float` (default: 0.001)
        Minimal likelihood for velocity genes to fit the model on.
    r2_adjusted: `bool` (default: `None`)
        Whether to compute coefficient of determination
        on full data fit (adjusted) or extreme quantile fit (None)
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    diff_kinetics: `str` or `bool` (default: `None`)
        Whether to account for genes with differential kinetics; requires
        `tl.differential_kinetic_test` to have been run.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity: `.layers`
        velocity vectors for each individual cell
    velocity_genes, velocity_beta, velocity_gamma, velocity_r2: `.var`
        parameters
    """  # noqa E501

    adata = data.copy() if copy else data
    if not use_raw and "Ms" not in adata.layers.keys():
        moments(adata)

    logg.info("computing velocities", r=True)

    strings_to_categoricals(adata)

    if mode is None or (mode == "dynamical"
                        and "fit_alpha" not in adata.var.keys()):
        mode = "stochastic"
        logg.warn("Falling back to stochastic model. "
                  "For the dynamical model run tl.recover_dynamics first.")

    if mode in {"dynamical", "dynamical_residuals"}:
        from .dynamical_model_utils import get_divergence, get_reads, get_vars

        gene_subset = ~np.isnan(adata.var["fit_alpha"].values)
        vdata = adata[:, gene_subset]
        alpha, beta, gamma, scaling, t_ = get_vars(vdata)

        connect = not adata.uns["recover_dynamics"]["use_raw"]
        kwargs_ = {
            "kernel_width": None,
            "normalized": True,
            "var_scale": True,
            "reg_par": None,
            "min_confidence": 1e-2,
            "constraint_time_increments": False,
            "fit_steady_states": True,
            "fit_basal_transcription": None,
            "use_connectivities": connect,
            "time_connectivities": connect,
            "use_latent_time": use_latent_time,
        }
        kwargs_.update(adata.uns["recover_dynamics"])
        kwargs_.update(**kwargs)

        if "residuals" in mode:
            u, s = get_reads(vdata,
                             use_raw=adata.uns["recover_dynamics"]["use_raw"])
            if kwargs_["fit_basal_transcription"]:
                u, s = u - adata.var["fit_u0"], s - adata.var["fit_s0"]
            o = vdata.layers["fit_t"] < t_
            vt = u * beta - s * gamma  # ds/dt
            wt = (alpha * o - beta * u) * scaling  # du/dt
        else:
            vt, wt = get_divergence(vdata, mode="velocity", **kwargs_)

        vgenes = adata.var.fit_likelihood > min_likelihood
        if min_r2 is not None:
            if "fit_r2" not in adata.var.keys():
                velo = Velocity(
                    adata,
                    groups_for_fit=groups_for_fit,
                    groupby=groupby,
                    constrain_ratio=constrain_ratio,
                    min_r2=min_r2,
                    use_highly_variable=use_highly_variable,
                    use_raw=use_raw,
                )
                velo.compute_deterministic(fit_offset=fit_offset, perc=perc)
                adata.var["fit_r2"] = velo._r2
            vgenes &= adata.var.fit_r2 > min_r2

        lb, ub = np.nanpercentile(adata.var.fit_scaling, [10, 90])
        vgenes = (vgenes
                  & (adata.var.fit_scaling > np.min([lb, 0.03]))
                  & (adata.var.fit_scaling < np.max([ub, 3])))

        adata.var[f"{vkey}_genes"] = vgenes

        adata.layers[vkey] = np.ones(adata.shape) * np.nan
        adata.layers[vkey][:, gene_subset] = vt

        adata.layers[f"{vkey}_u"] = np.ones(adata.shape) * np.nan
        adata.layers[f"{vkey}_u"][:, gene_subset] = wt

        if filter_genes and len(set(vgenes)) > 1:
            adata._inplace_subset_var(vgenes)

    elif mode in {"steady_state", "deterministic", "stochastic"}:
        categories = (adata.obs[groupby].cat.categories
                      if groupby is not None and groups is None
                      and groups_for_fit is None else [None])

        for cat in categories:
            groups = cat if cat is not None else groups

            cell_subset = groups_to_bool(adata, groups, groupby)
            _adata = adata if groups is None else adata[cell_subset]
            velo = Velocity(
                _adata,
                groups_for_fit=groups_for_fit,
                groupby=groupby,
                constrain_ratio=constrain_ratio,
                min_r2=min_r2,
                r2_adjusted=r2_adjusted,
                use_highly_variable=use_highly_variable,
                use_raw=use_raw,
            )
            velo.compute_deterministic(fit_offset=fit_offset, perc=perc)

            if mode == "stochastic":
                if filter_genes and len(set(velo._velocity_genes)) > 1:
                    adata._inplace_subset_var(velo._velocity_genes)
                    residual = velo._residual[:, velo._velocity_genes]
                    _adata = adata if groups is None else adata[cell_subset]
                    velo = Velocity(
                        _adata,
                        residual=residual,
                        groups_for_fit=groups_for_fit,
                        groupby=groupby,
                        constrain_ratio=constrain_ratio,
                        use_highly_variable=use_highly_variable,
                    )
                velo.compute_stochastic(fit_offset,
                                        fit_offset2,
                                        mode,
                                        perc=perc)

            write_residuals(adata, vkey, velo._residual, cell_subset)
            write_residuals(adata, f"variance_{vkey}", velo._residual2,
                            cell_subset)
            write_pars(adata,
                       vkey,
                       velo.get_pars(),
                       velo.get_pars_names(),
                       add_key=cat)

            if filter_genes and len(set(velo._velocity_genes)) > 1:
                adata._inplace_subset_var(velo._velocity_genes)

    else:
        raise ValueError(
            "Mode can only be one of these: deterministic, stochastic or dynamical."
        )

    if f"{vkey}_genes" in adata.var.keys() and np.sum(
            adata.var[f"{vkey}_genes"]) < 10:
        logg.warn(
            "Too few genes are selected as velocity genes. "
            "Consider setting a lower threshold for min_r2 or min_likelihood.")

    if diff_kinetics:
        if not isinstance(diff_kinetics, str):
            diff_kinetics = "fit_diff_kinetics"
        if diff_kinetics in adata.var.keys():
            if diff_kinetics in adata.uns["recover_dynamics"]:
                groupby = adata.uns["recover_dynamics"]["fit_diff_kinetics"]
            else:
                groupby = "clusters"
            clusters = adata.obs[groupby]
            for i, v in enumerate(
                    np.array(adata.var[diff_kinetics].values, dtype=str)):
                if len(v) > 0 and v != "nan":
                    idx = 1 - clusters.isin([a.strip() for a in v.split(",")])
                    adata.layers[vkey][:, i] *= idx
                    if mode == "dynamical":
                        adata.layers[f"{vkey}_u"][:, i] *= idx

    adata.uns[f"{vkey}_params"] = {
        "mode": mode,
        "fit_offset": fit_offset,
        "perc": perc
    }

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f"    '{vkey}', velocity vectors for each individual cell (adata.layers)"
    )

    return adata if copy else None
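A minimal numpy sketch of the steady-state model described in the docstring: the steady-state ratio gamma is fit by least squares on the extreme quantiles (no offset), and velocity is the residual u - gamma * s (assumption: toy data for a single gene):

import numpy as np

rng = np.random.default_rng(4)
s = rng.gamma(2.0, 1.0, size=500)              # spliced abundance of one gene
u = 0.3 * s + rng.normal(0.0, 0.05, size=500)  # unspliced, true ratio 0.3

lo, hi = np.percentile(s, [5, 95])
extreme = (s <= lo) | (s >= hi)                # restrict fit to extreme quantiles
gamma = (u[extreme] @ s[extreme]) / (s[extreme] @ s[extreme])  # slope, no offset

velocity = u - gamma * s                       # deviation from steady state
print(round(gamma, 3))                         # close to 0.3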
Example #12
def cell_origin(
    data,
    groupby="clusters",
    disconnected_groups=None,
    self_transitions=False,
    n_neighbors=None,
    copy=False,
):
    """Computes individual cell root points

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    groupby: `str` (default: `'clusters'`)
        Key to which to assign the origins.
    disconnected_groups: list of `str` (default: `None`)
        Which groups to treat as disconnected for fate assignment.
    n_neighbors: `int` (default: `None`)
        Number of neighbors to restrict transitions to.
    self_transitions: `bool` (default: `False`)
        Whether to include self-transitions.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    cell_origin: `.obs`
        most likely cell origin for each individual cell
    cell_origin_confidence: `.obs`
        confidence of coming from assigned origin
    """

    adata = data.copy() if copy else data
    logg.info("computing cell fates", r=True)

    n_neighbors = 10 if n_neighbors is None else n_neighbors
    _adata = adata.copy()
    vgraph = VelocityGraph(_adata,
                           n_neighbors=n_neighbors,
                           approx=True,
                           n_recurse_neighbors=1)
    vgraph.compute_cosines()
    _adata.uns["velocity_graph"] = vgraph.graph
    _adata.uns["velocity_graph_neg"] = vgraph.graph_neg

    T = transition_matrix(_adata,
                          self_transitions=self_transitions,
                          backward=True)
    fate = np.linalg.inv(np.eye(_adata.n_obs) - T)
    if issparse(T):
        fate = fate.A
    cell_fates = np.array(_adata.obs[groupby][fate.argmax(1)])
    if disconnected_groups is not None:
        idx = _adata.obs[groupby].isin(disconnected_groups)
        cell_fates[idx] = _adata.obs[groupby][idx]

    adata.obs["cell_origin"] = cell_fates
    adata.obs["cell_origin_confidence"] = fate.max(1) / fate.sum(1)
    strings_to_categoricals(adata)

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'cell_origin', most likely cell origin (adata.obs)\n"
        "    'cell_origin_confidence', confidence of assigned origin (adata.obs)"
    )

    return adata if copy else None
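A numpy sketch of the origin computation above: backward transition probabilities are accumulated via the resolvent (I - T)^-1 and each cell is assigned the group of its highest-scoring state (assumption: a toy sub-stochastic backward matrix, so that I - T is invertible):

import numpy as np

T = 0.9 * np.array([            # toy backward transition matrix, scaled so
    [0.0, 0.8, 0.2],            # that I - T is invertible
    [0.1, 0.0, 0.9],
    [0.2, 0.8, 0.0],
])
fate = np.linalg.inv(np.eye(3) - T)    # accumulated transition probabilities
origin_idx = fate.argmax(1)            # most likely origin state per cell
confidence = fate.max(1) / fate.sum(1)
print(origin_idx, confidence.round(2))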
Example #13
def terminal_states(
    data,
    vkey="velocity",
    modality="Ms",
    groupby=None,
    groups=None,
    self_transitions=False,
    eps=1e-3,
    random_state=0,
    copy=False,
    **kwargs,
):
    """Computes terminal states (root and end points).

    The end points and root cells are obtained as stationary states of the
    velocity-inferred transition matrix and its transposed, respectively,
    which is given by left eigenvectors corresponding to an eigenvalue of 1, i.e.

    .. math::
        \\mu^{\\textrm{end}}=\\mu^{\\textrm{end}} \\pi, \\quad
        \\mu^{\\textrm{root}}=\\mu^{\\textrm{root}} \\pi^{\\small \\textrm{T}}.

    .. code:: python

        scv.tl.terminal_states(adata)
        scv.pl.scatter(adata, color=['root_cells', 'end_points'])

    .. image:: https://user-images.githubusercontent.com/31883718/69496183-bcfdf300-0ecf-11ea-9aae-685300a0b1ba.png

    Alternatively, we recommend using :func:`cellrank.tl.terminal_states`,
    which provides an improved/generalized approach to identifying terminal states.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    modality: `str` (default: `'Ms'`)
        Layer used to calculate terminal states.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider. Only to be set, if each group is
        assumed to have a distinct lineage with an independent root and end point.
    groups: `str`, `list` or `np.ndarray` (default: `None`)
        Groups selected to find terminal states on. Must be an element of .obs[groupby].
        To be specified only for very distinct/disconnected clusters.
    self_transitions: `bool` (default: `False`)
        Allow transitions from one node to itself.
    eps: `float` (default: 1e-3)
        Tolerance for eigenvalue selection.
    random_state: `int` or None (default: 0)
        Seed used by the random number generator.
        If `None`, the global `RandomState` instance of `np.random` is used.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.
    **kwargs:
        Passed to scvelo.tl.transition_matrix(), e.g. basis, weight_diffusion.

    Returns
    -------
    root_cells: `.obs`
        Likelihood of each cell being a root state.
    end_points: `.obs`
        Likelihood of each cell being an end point.
    """  # noqa E501

    adata = data.copy() if copy else data
    verify_neighbors(adata)

    logg.info("computing terminal states", r=True)

    strings_to_categoricals(adata)
    if groupby is not None:
        logg.warn(
            "Only set groupby, when you have evident distinct clusters/lineages,"
            " each with an own root and end point.")

    kwargs.update({"self_transitions": self_transitions})
    categories = [None]
    if groupby is not None and groups is None:
        categories = adata.obs[groupby].cat.categories
    for cat in categories:
        groups = cat if cat is not None else groups
        cell_subset = groups_to_bool(adata, groups=groups, groupby=groupby)
        _adata = adata if groups is None else adata[cell_subset]
        connectivities = get_connectivities(_adata, "distances")

        T = transition_matrix(_adata, vkey=vkey, backward=True, **kwargs)
        eigvecs_roots = eigs(T,
                             eps=eps,
                             perc=[2, 98],
                             random_state=random_state)[1]
        roots = csr_matrix.dot(connectivities, eigvecs_roots).sum(1)
        roots = scale(np.clip(roots, 0, np.percentile(roots, 98)))
        roots = verify_roots(_adata, roots, modality)
        write_to_obs(adata, "root_cells", roots, cell_subset)

        T = transition_matrix(_adata, vkey=vkey, backward=False, **kwargs)
        eigvecs_ends = eigs(T,
                            eps=eps,
                            perc=[2, 98],
                            random_state=random_state)[1]
        ends = csr_matrix.dot(connectivities, eigvecs_ends).sum(1)
        ends = scale(np.clip(ends, 0, np.percentile(ends, 98)))
        write_to_obs(adata, "end_points", ends, cell_subset)

        n_roots, n_ends = eigvecs_roots.shape[1], eigvecs_ends.shape[1]
        groups_str = f" ({groups})" if isinstance(groups, str) else ""
        roots_str = f"{n_roots} {'regions' if n_roots > 1 else 'region'}"
        ends_str = f"{n_ends} {'regions' if n_ends > 1 else 'region'}"

        logg.info(f"    identified {roots_str} of root cells "
                  f"and {ends_str} of end points {groups_str}.")

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        "    'root_cells', root cells of Markov diffusion process (adata.obs)\n"
        "    'end_points', end points of Markov diffusion process (adata.obs)")
    return adata if copy else None
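A numpy sketch of the stationary-state computation above: a left eigenvector of the transition matrix for the eigenvalue closest to 1 yields the end-point distribution (assumption: a small row-stochastic toy matrix):

import numpy as np

T = np.array([
    [0.7, 0.3, 0.0],
    [0.0, 0.5, 0.5],
    [0.0, 0.1, 0.9],
])
eigvals, eigvecs = np.linalg.eig(T.T)    # left eigenvectors of T
idx = np.argmin(np.abs(eigvals - 1.0))   # eigenvalue closest to 1
mu = np.real(eigvecs[:, idx])
mu /= mu.sum()                           # normalize to a distribution
print(mu.round(3))                       # mass concentrates on end states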