def magic_impute(adata, knn=5, t=2, verbose=0, **kwargs):
    """Impute the spliced and unspliced layers with MAGIC.

    A single ``magic.MAGIC`` operator is fitted on ``adata.layers["spliced"]``
    and then re-used (without refitting) to transform
    ``adata.layers["unspliced"]``.

    Parameters
    ----------
    adata
        Object exposing a ``layers`` mapping with ``"spliced"`` and
        ``"unspliced"`` entries.
    knn, t, verbose
        Forwarded as keyword arguments to ``magic.MAGIC``.
    **kwargs
        Any further keyword arguments for ``magic.MAGIC``.

    Returns
    -------
    None. Writes ``adata.layers["Ms"]`` (from spliced) and
    ``adata.layers["Mu"]`` (from unspliced).
    """
    logg.info(
        "To be used carefully. Magic has not yet been tested for this application."
    )
    # Imported lazily so that `magic` is only needed when this function is used.
    import magic

    imputer = magic.MAGIC(knn=knn, t=t, verbose=verbose, **kwargs)

    # Fit on the spliced layer ...
    spliced_imputed = imputer.fit_transform(adata.layers["spliced"])
    adata.layers["Ms"] = spliced_imputed

    # ... and apply the already fitted operator to the unspliced layer.
    unspliced_imputed = imputer.transform(adata.layers["unspliced"])
    adata.layers["Mu"] = unspliced_imputed
def score_genes_cell_cycle(adata, s_genes=None, g2m_genes=None, copy=False, **kwargs):
    """\
    Score cell cycle genes.

    Computes an S-phase and a G2M-phase score per cell and derives a cell
    cycle phase (G1, S, G2M) from them. The reference gene lists are those of
    Tirosh et al, 2015 (https://doi.org/10.1126/science.aad0501).

    Parameters
    ----------
    adata
        The annotated data matrix.
    s_genes
        List of genes associated with S phase. If `None`, the list returned
        by `get_phase_marker_genes(adata)` is used.
    g2m_genes
        List of genes associated with G2M phase. If `None`, the list returned
        by `get_phase_marker_genes(adata)` is used.
    copy
        Copy `adata` or modify it inplace.
    **kwargs
        Are passed to :func:`~scanpy.tl.score_genes`. `ctrl_size` is not
        possible, as it's set as `min(len(s_genes), len(g2m_genes))`.

    Returns
    -------
    S_score: `adata.obs`
        The score for S phase for each cell.
    G2M_score: `adata.obs`
        The score for G2M phase for each cell.
    phase: `adata.obs`
        The cell cycle phase (`S`, `G2M` or `G1`) for each cell.

    `adata` itself is returned only if `copy` is set, otherwise `None`.
    """
    logg.info("calculating cell cycle phase")

    from scanpy.tools._score_genes import score_genes

    if copy:
        adata = adata.copy()

    # Fall back to the marker genes found for this dataset where the caller
    # did not supply a list.
    default_s, default_g2m = get_phase_marker_genes(adata)
    s_genes = default_s if s_genes is None else s_genes
    g2m_genes = default_g2m if g2m_genes is None else g2m_genes

    # Both scores use the same control size (always overrides a user value).
    kwargs["ctrl_size"] = min(len(s_genes), len(g2m_genes))

    for gene_list, score_name in ((s_genes, "S_score"), (g2m_genes, "G2M_score")):
        score_genes(adata, gene_list=gene_list, score_name=score_name, **kwargs)

    scores = adata.obs[["S_score", "G2M_score"]]
    g2m_dominates = scores.G2M_score > scores.S_score
    all_negative = np.all(scores < 0, axis=1)

    # Start from S, switch to G2M where that score is higher, and finally
    # override with G1 wherever both scores are negative.
    phase = pd.Series("S", index=scores.index)
    phase[g2m_dominates] = "G2M"
    phase[all_negative] = "G1"
    adata.obs["phase"] = phase

    logg.hint(" 'S_score' and 'G2M_score', scores of cell cycle phases (adata.obs)")
    if copy:
        return adata
    return None
def remove_duplicate_cells(adata):
    """Drop duplicate cells from `adata` in place.

    Runs `pca` first if `adata.obsm` has no ``"X_pca"`` entry, asks
    `get_duplicate_cells` for the indices of duplicates and removes those
    observations. Returns `None`.
    """
    if "X_pca" not in adata.obsm.keys():
        pca(adata)

    duplicate_idx = get_duplicate_cells(adata)
    n_duplicates = len(duplicate_idx)
    if n_duplicates == 0:
        # Nothing to remove; leave `adata` untouched.
        return

    keep = np.ones(adata.n_obs, bool)
    keep[duplicate_idx] = 0
    logg.info("Removed", n_duplicates, "duplicate cells.")
    adata._inplace_subset_obs(keep)

    # Only reached when cells were actually dropped: an existing neighbor
    # graph is recomputed for the reduced set of cells.
    if "neighbors" in adata.uns.keys():
        neighbors(adata)
def filter_genes_dispersion(
    data,
    flavor="seurat",
    min_disp=None,
    max_disp=None,
    min_mean=None,
    max_mean=None,
    n_bins=20,
    n_top_genes=None,
    retain_genes=None,
    log=True,
    subset=True,
    copy=False,
):
    """Extract highly variable genes.

    Expects non-logarithmized data.

    The normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression,
    highly variable genes are selected.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    flavor : {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. If choosing
        'seurat', this expects non-logarithmized data - the logarithm of mean
        and dispersion is taken internally when `log` is at its default value
        `True`. For 'cell_ranger', this is usually called for logarithmized
        data - in this case you should set `log` to `False`. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`. 'svr' ranks genes by the residual of a support vector
        regression of log2(CV) on log2(mean) and requires `n_top_genes`.
    min_mean=0.0125, max_mean=3, min_disp=0.5, max_disp=`None` : `float`, optional
        If `n_top_genes` unequals `None`, these cutoffs for the means and the
        normalized dispersions are ignored. `max_disp=None` means no upper
        bound.
    n_bins : `int` (default: 20)
        Number of bins for binning the mean gene expression ('seurat' flavor).
        Normalization is done with respect to each bin. If just a single gene
        falls into a bin, the normalized dispersion is artificially set to 1.
    n_top_genes : `int` or `None` (default: `None`)
        Number of highly-variable genes to keep.
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds
        (only applied when `subset=True`).
    log : `bool`, optional (default: `True`)
        Use the logarithm of the mean to variance ratio.
    subset : `bool`, optional (default: `True`)
        Keep highly-variable genes only (if True) else write a bool array for
        highly-variable genes while keeping all genes.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns the filtered copy if `copy=True`, otherwise updates `data` in place
    and returns `None`. Writes `highly_variable` to `.var`; the 'seurat' and
    'cell_ranger' flavors additionally write `means`, `dispersions` and
    `dispersions_norm`.

    Raises
    ------
    ValueError
        If `flavor` is unknown, or if `flavor='svr'` is used without
        `n_top_genes`.
    """
    adata = data.copy() if copy else data
    _set_initial_size(adata)

    mean, var = materialize_as_ndarray(get_mean_var(adata.X))

    if n_top_genes is not None and adata.n_vars < n_top_genes:
        logg.info(
            "Skip filtering by dispersion since number "
            "of variables are less than `n_top_genes`."
        )
        return adata if copy else None

    if flavor == "svr":
        if n_top_genes is None:
            # Previously failed with an opaque TypeError on `None - 1`.
            raise ValueError("`n_top_genes` needs to be set for `flavor='svr'`.")
        from sklearn.svm import SVR

        log_mu = np.log2(mean)
        log_cv = np.log2(np.sqrt(var) / mean)
        clf = SVR(gamma=150.0 / len(mean))
        clf.fit(log_mu[:, None], log_cv)
        # Genes whose variation exceeds what the fit predicts for their mean.
        score = log_cv - clf.predict(log_mu[:, None])
        nth_score = np.sort(score)[::-1][n_top_genes - 1]
        adata.var["highly_variable"] = score >= nth_score

    else:
        cut_disp = [min_disp, max_disp, min_mean, max_mean]
        if n_top_genes is not None and not all(x is None for x in cut_disp):
            logg.info("If you pass `n_top_genes`, all cutoffs are ignored.")
        if min_disp is None:
            min_disp = 0.5
        if max_disp is None:
            max_disp = np.inf
        if min_mean is None:
            min_mean = 0.0125
        if max_mean is None:
            max_mean = 3

        mean[mean == 0] = 1e-12  # set entries equal to zero to small value
        dispersion = var / mean
        if log:  # logarithmized mean as in Seurat
            dispersion[dispersion == 0] = np.nan
            dispersion = np.log(dispersion)
            mean = np.log1p(mean)

        # all of the following quantities are "per-gene" here
        df = pd.DataFrame()
        df["mean"], df["dispersion"] = mean, dispersion

        if flavor == "seurat":
            df["mean_bin"] = pd.cut(df["mean"], bins=n_bins)
            disp_grouped = df.groupby("mean_bin")["dispersion"]
            disp_mean_bin = disp_grouped.mean()
            disp_std_bin = disp_grouped.std(ddof=1)

            # Bins with nan std hold a single gene; setting std to the bin
            # mean and the mean to 0 yields a normalized dispersion of 1.
            one_gene_per_bin = disp_std_bin.isnull()
            disp_std_bin[one_gene_per_bin] = disp_mean_bin[one_gene_per_bin].values
            disp_mean_bin[one_gene_per_bin] = 0

            # normalized dispersion
            mu = disp_mean_bin[df["mean_bin"].values].values
            std = disp_std_bin[df["mean_bin"].values].values
            df["dispersion_norm"] = ((df["dispersion"] - mu) / std).fillna(0)

        elif flavor == "cell_ranger":
            from statsmodels import robust

            cut = np.percentile(df["mean"], np.arange(10, 105, 5))
            df["mean_bin"] = pd.cut(df["mean"], np.r_[-np.inf, cut, np.inf])
            disp_grouped = df.groupby("mean_bin")["dispersion"]
            disp_median_bin = disp_grouped.median()

            with warnings.catch_warnings():
                # ignore warning: "Mean of empty slice"
                warnings.simplefilter("ignore")
                disp_mad_bin = disp_grouped.apply(robust.mad)
            mu = disp_median_bin[df["mean_bin"].values].values
            std = disp_mad_bin[df["mean_bin"].values].values
            df["dispersion_norm"] = (np.abs(df["dispersion"] - mu) / std).fillna(0)

        else:
            raise ValueError('`flavor` needs to be "seurat", "cell_ranger" or "svr"')

        dispersion_norm = df["dispersion_norm"].values
        if n_top_genes is not None:
            cut_off = df["dispersion_norm"].nlargest(n_top_genes).values[-1]
            highly_variable = df["dispersion_norm"].values >= cut_off
        else:
            highly_variable = np.logical_and.reduce(
                (
                    mean > min_mean,
                    mean < max_mean,
                    dispersion_norm > min_disp,
                    dispersion_norm < max_disp,
                )
            )

        adata.var["means"] = df["mean"].values
        adata.var["dispersions"] = df["dispersion"].values
        adata.var["dispersions_norm"] = df["dispersion_norm"].values
        adata.var["highly_variable"] = highly_variable

    # Defined for every flavor so the final log message cannot hit an
    # undefined name (was a NameError for flavor="svr" with subset=False).
    gene_subset = adata.var["highly_variable"]
    if subset:
        if retain_genes is not None:
            if isinstance(retain_genes, str):
                retain_genes = [retain_genes]
            gene_subset = gene_subset | adata.var_names.isin(retain_genes)
        adata._inplace_subset_var(gene_subset)

    logg.info(f"Extracted {np.sum(gene_subset)} highly variable genes.")
    return adata if copy else None
def _paga_graph(
    adata,
    ax,
    solid_edges=None,
    dashed_edges=None,
    adjacency_solid=None,
    adjacency_dashed=None,
    transitions=None,
    threshold=None,
    root=0,
    colors=None,
    labels=None,
    fontsize=None,
    fontweight=None,
    fontoutline=None,
    text_kwds=None,
    node_size_scale=1.0,
    node_size_power=0.5,
    edge_width_scale=1.0,
    normalize_to_color="reference",
    title=None,
    pos=None,
    cmap=None,
    frameon=True,
    min_edge_width=None,
    max_edge_width=None,
    export_to_gexf=False,
    colorbar=None,
    use_raw=True,
    cb_kwds=None,
    single_component=False,
    arrowsize=30,
):
    """scanpy/_paga_graph with some adjustments for directional graphs.

    To be moved back to scanpy once finalized.

    Draws the PAGA graph stored in `adata.uns["paga"]` onto `ax`: edges first
    (dashed, then solid or directed), then one marker - or one pie chart if
    the colors are mappings - per group, and finally the group labels.

    Parameters
    ----------
    adata
        Annotated data with `adata.uns["paga"]["groups"]` naming the grouping
        in `adata.obs`.
    ax
        Matplotlib axes to draw on.
    solid_edges, root, title, colorbar, cb_kwds
        Accepted for signature compatibility; not used in this function body.
    dashed_edges
        If not `None`, `adjacency_dashed` is drawn as dashed grey edges.
    adjacency_solid, adjacency_dashed
        Adjacency matrices of the solid and dashed graph.
    transitions
        Key in `adata.uns["paga"]` holding a sparse transition matrix. If
        given, directed edges (arrows) are drawn instead of solid edges.
    threshold
        Transition weights below this value are dropped (default 0.01).
    colors
        `None`/the groups key (group colors), a single color, `"degree_solid"`,
        `"degree_dashed"`, a gene name, an `.obs` key, or a per-node list of
        colors or of `{color: fraction}` mappings (pie charts).
    labels
        Per-node labels; defaults to the group categories.
    fontsize, fontweight, fontoutline, text_kwds
        Label styling. `text_kwds` is copied, so the caller's dict is never
        modified.
    node_size_scale, node_size_power, edge_width_scale
        Scaling of node sizes and edge widths.
    normalize_to_color
        Selects the normalization of the association matrix used for
        categorical `colors`.
    pos
        Node positions as an array, or a path to a `.gdf` file.
    cmap, frameon
        Passed to the scatter call / `ax.set_frame_on`.
    min_edge_width, max_edge_width
        Clip bounds for edge widths.
    export_to_gexf
        Additionally write the solid graph to `paga_graph.gexf` in
        `settings.writedir`.
    use_raw
        Read gene expression from `adata.raw` if available.
    single_component
        Restrict the graph to its largest connected component.
    arrowsize
        Arrow size for directed edges.

    Returns
    -------
    The `PathCollection` returned by the `ax.scatter` call.

    Raises
    ------
    ValueError
        For a string `labels` not matching the groups key, an unsupported
        position file, an unknown `degree_*` color, too few colors,
        `single_component` combined with `dashed_edges`, or pie-chart colors
        that are not mappings.
    """
    import warnings
    from io import StringIO
    from pathlib import Path

    import networkx as nx
    import pandas as pd
    from scipy.sparse.csgraph import connected_components
    from pandas.api.types import is_categorical_dtype
    from matplotlib import patheffects
    from matplotlib.colors import is_color_like
    from scanpy.plotting._utils import add_colors_for_categorical_sample_annotation

    # Work on a private copy: `None` (the default) becomes an empty dict and a
    # dict passed by the caller is no longer mutated further below.
    text_kwds = {} if text_kwds is None else dict(text_kwds)

    node_labels = labels  # rename for clarity
    if (
        node_labels is not None
        and isinstance(node_labels, str)
        and node_labels != adata.uns["paga"]["groups"]
    ):
        raise ValueError(
            f"Provide a list of group labels for the PAGA "
            f"groups {adata.uns['paga']['groups']}, not {node_labels}."
        )
    groups_key = adata.uns["paga"]["groups"]
    if node_labels is None:
        node_labels = adata.obs[groups_key].cat.categories

    # The isinstance guard keeps array-valued `colors` from being compared
    # element-wise against the key (ambiguous truth value).
    colors_is_groups_key = isinstance(colors, str) and colors == groups_key
    if (colors is None or colors_is_groups_key) and groups_key is not None:
        if f"{groups_key}_colors" not in adata.uns or len(
            adata.obs[groups_key].cat.categories
        ) != len(adata.uns[f"{groups_key}_colors"]):
            add_colors_for_categorical_sample_annotation(adata, groups_key)
        colors = adata.uns[f"{groups_key}_colors"]

    nx_g_solid = nx.Graph(adjacency_solid)
    if dashed_edges is not None:
        nx_g_dashed = nx.Graph(adjacency_dashed)

    # convert pos to array and dict
    if not isinstance(pos, (Path, str)):
        pos_array = pos
    else:
        pos = Path(pos)
        if pos.suffix != ".gdf":
            raise ValueError(
                "Currently only supporting reading positions from .gdf files."
            )
        # read the node definition from the file (everything after the header
        # line and before the edge definition)
        node_lines = []
        with pos.open() as f:
            f.readline()
            for line in f:
                if line.startswith("edgedef>"):
                    break
                node_lines.append(line)
        # `header=None`: the node rows carry no header line (`header=-1` is
        # rejected by pandas >= 1.0).
        df = pd.read_csv(StringIO("".join(node_lines)), header=None)
        pos_array = df[[4, 5]].values

    # convert to dictionary
    pos = {n: [p[0], p[1]] for n, p in enumerate(pos_array)}

    # uniform color
    if isinstance(colors, str) and is_color_like(colors):
        colors = [colors for _ in range(len(node_labels))]

    # color degree of the graph
    if isinstance(colors, str) and colors.startswith("degree"):
        # see also tools.paga.paga_degrees
        if colors == "degree_dashed":
            colors = [d for _, d in nx_g_dashed.degree(weight="weight")]
        elif colors == "degree_solid":
            colors = [d for _, d in nx_g_solid.degree(weight="weight")]
        else:
            raise ValueError('`degree` either "degree_dashed" or "degree_solid".')
        colors = (np.array(colors) - np.min(colors)) / (
            np.max(colors) - np.min(colors)
        )

    # plot gene expression
    var_names = adata.var_names if adata.raw is None else adata.raw.var_names
    if isinstance(colors, str) and colors in var_names:
        # The gene slice does not depend on the category, so take it once.
        if adata.raw is not None and use_raw:
            adata_gene = adata.raw[:, colors]
        else:
            adata_gene = adata[:, colors]
        x_color = []
        for cat in adata.obs[groups_key].cat.categories:
            subset = (cat == adata.obs[groups_key]).values
            x_color.append(np.mean(adata_gene.X[subset]))
        colors = x_color

    # plot continuous annotation
    if (
        isinstance(colors, str)
        and colors in adata.obs
        and not is_categorical_dtype(adata.obs[colors])
    ):
        x_color = []
        for cat in adata.obs[groups_key].cat.categories:
            subset = (cat == adata.obs[groups_key]).values
            x_color.append(adata.obs.loc[subset, colors].mean())
        colors = x_color

    # plot categorical annotation
    if (
        isinstance(colors, str)
        and colors in adata.obs
        and is_categorical_dtype(adata.obs[colors])
    ):
        from scanpy._utils import (
            compute_association_matrix_of_groups,
            get_associated_colors_of_groups,
        )

        norm = "reference" if normalize_to_color else "prediction"
        _, asso_matrix = compute_association_matrix_of_groups(
            adata, prediction=groups_key, reference=colors, normalization=norm
        )
        add_colors_for_categorical_sample_annotation(adata, colors)
        asso_colors = get_associated_colors_of_groups(
            adata.uns[f"{colors}_colors"], asso_matrix
        )
        colors = asso_colors

    if len(colors) < len(node_labels):
        raise ValueError(
            "`color` list need to be at least as long as `groups`/`node_labels` list."
        )

    # count number of connected components
    n_components, component_labels = connected_components(adjacency_solid)
    if n_components > 1 and single_component:
        component_sizes = np.bincount(component_labels)
        largest_component = np.where(component_sizes == component_sizes.max())[0][0]
        in_largest = component_labels == largest_component
        adjacency_solid = adjacency_solid.tocsr()[in_largest, :]
        adjacency_solid = adjacency_solid.tocsc()[:, in_largest]
        colors = np.array(colors)[in_largest]
        node_labels = np.array(node_labels)[in_largest]
        cats_dropped = (
            adata.obs[groups_key].cat.categories[~in_largest].tolist()
        )
        logg.info(
            f"Restricting graph to largest connected component "
            f"by dropping categories\n{cats_dropped}"
        )
        nx_g_solid = nx.Graph(adjacency_solid)
        if dashed_edges is not None:
            raise ValueError("`single_component` only if `dashed_edges` is `None`.")

    # groups sizes
    if groups_key is not None and f"{groups_key}_sizes" in adata.uns:
        groups_sizes = adata.uns[f"{groups_key}_sizes"]
    else:
        groups_sizes = np.ones(len(node_labels))
    base_scale_scatter = 2000
    base_pie_size = (
        base_scale_scatter / (np.sqrt(adjacency_solid.shape[0]) + 10) * node_size_scale
    )
    median_group_size = np.median(groups_sizes)
    groups_sizes = base_pie_size * np.power(
        groups_sizes / median_group_size, node_size_power
    )

    # edge widths
    base_edge_width = edge_width_scale * 5 * rcParams["lines.linewidth"]

    # draw dashed edges
    if dashed_edges is not None:
        widths = [x[-1]["weight"] for x in nx_g_dashed.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if max_edge_width is not None:
            widths = np.clip(widths, None, max_edge_width)
        nx.draw_networkx_edges(
            nx_g_dashed,
            pos,
            ax=ax,
            width=widths,
            edge_color="grey",
            style="dashed",
            alpha=0.5,
        )

    # draw solid edges
    if transitions is None:
        widths = [x[-1]["weight"] for x in nx_g_solid.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nx.draw_networkx_edges(
                nx_g_solid, pos, ax=ax, width=widths, edge_color="black"
            )
    # draw directed edges
    else:
        adjacency_transitions = adata.uns["paga"][transitions].copy()
        if threshold is None:
            threshold = 0.01
        adjacency_transitions.data[adjacency_transitions.data < threshold] = 0
        adjacency_transitions.eliminate_zeros()
        g_dir = nx.DiGraph(adjacency_transitions.T)
        widths = [x[-1]["weight"] for x in g_dir.edges(data=True)]
        widths = base_edge_width * np.array(widths)
        if min_edge_width is not None or max_edge_width is not None:
            widths = np.clip(widths, min_edge_width, max_edge_width)
        nx.draw_networkx_edges(
            g_dir,
            pos,
            ax=ax,
            width=widths,
            edge_color="k",
            arrowsize=arrowsize,
            arrowstyle="-|>",
            node_size=groups_sizes,
        )

    if export_to_gexf:
        if isinstance(colors[0], tuple):
            from matplotlib.colors import rgb2hex

            colors = [rgb2hex(c) for c in colors]
        # `Graph.nodes[...]` replaces the `Graph.node[...]` attribute that was
        # removed in networkx 2.4.
        for count, _ in enumerate(nx_g_solid.nodes()):
            nx_g_solid.nodes[count]["label"] = f"{node_labels[count]}"
            nx_g_solid.nodes[count]["color"] = f"{colors[count]}"
            nx_g_solid.nodes[count]["viz"] = dict(
                position=dict(x=1000 * pos[count][0], y=1000 * pos[count][1], z=0)
            )
        filename = settings.writedir / "paga_graph.gexf"
        logg.warn(f"exporting to {filename}")
        settings.writedir.mkdir(parents=True, exist_ok=True)
        nx.write_gexf(nx_g_solid, filename)

    ax.set_frame_on(frameon)
    ax.set_xticks([])
    ax.set_yticks([])

    if fontsize is None:
        fontsize = rcParams["legend.fontsize"]
    if fontoutline is not None:
        text_kwds["path_effects"] = [
            patheffects.withStroke(linewidth=fontoutline, foreground="w")
        ]

    # usual scatter plot
    if not isinstance(colors[0], cabc.Mapping):
        n_groups = len(pos_array)
        sct = ax.scatter(
            pos_array[:, 0],
            pos_array[:, 1],
            s=groups_sizes,
            cmap=cmap,
            c=colors[:n_groups],
            edgecolors="face",
            zorder=2,
        )
        for count, group in enumerate(node_labels):
            ax.text(
                pos_array[count, 0],
                pos_array[count, 1],
                group,
                verticalalignment="center",
                horizontalalignment="center",
                size=fontsize,
                fontweight=fontweight,
                **text_kwds,
            )
    # else pie chart plot
    else:

        def transform_ax_coords(a, b):
            # data coordinates -> display coordinates -> axes coordinates
            return trans2(trans((a, b)))

        # start with this dummy plot... otherwise strange behavior
        sct = ax.scatter(
            pos_array[:, 0],
            pos_array[:, 1],
            alpha=0,
            linewidths=0,
            c="w",
            edgecolors="face",
            s=groups_sizes,
            cmap=cmap,
        )
        bboxes = getbb(sct, ax)  # bounding boxes around the scatterplot markers
        trans = ax.transData.transform
        bbox = ax.get_position().get_points()
        ax_x_min = bbox[0, 0]
        ax_x_max = bbox[1, 0]
        ax_y_min = bbox[0, 1]
        ax_y_max = bbox[1, 1]
        ax_len_x = ax_x_max - ax_x_min
        ax_len_y = ax_y_max - ax_y_min
        trans2 = ax.transAxes.inverted().transform
        pie_axs = []
        for count, (n, box) in enumerate(zip(nx_g_solid.nodes(), bboxes)):
            x0, y0 = transform_ax_coords(box.x0, box.y0)
            x1, y1 = transform_ax_coords(box.x1, box.y1)
            pie_size = np.sqrt(((x0 - x1) ** 2) + ((y0 - y1) ** 2))

            xa, ya = transform_ax_coords(*pos[n])
            xa = ax_x_min + (xa - pie_size / 2) * ax_len_x
            ya = ax_y_min + (ya - pie_size / 2) * ax_len_y
            # clip, the fruchterman layout sometimes places below figure
            if ya < 0:
                ya = 0
            if xa < 0:
                xa = 0

            pie_ax = pl.axes(
                [xa, ya, pie_size * ax_len_x, pie_size * ax_len_y], frameon=False
            )
            pie_axs.append(pie_ax)
            pie_ax.set_xticks([])
            pie_ax.set_yticks([])
            if not isinstance(colors[count], cabc.Mapping):
                raise ValueError(
                    f"{colors[count]} is neither a dict of valid "
                    "matplotlib colors nor a valid matplotlib color."
                )
            color_single = colors[count].keys()
            fracs = [colors[count][c] for c in color_single]
            if sum(fracs) < 1:
                # fill the remainder of the pie with grey
                color_single = list(color_single)
                color_single.append("grey")
                fracs.append(1 - sum(fracs))
            wedgeprops = dict(linewidth=0, edgecolor="k", antialiased=True)
            pie_ax.pie(
                fracs, colors=color_single, wedgeprops=wedgeprops, normalize=True
            )
        if node_labels is not None:
            text_kwds.update(dict(verticalalignment="center", fontweight=fontweight))
            text_kwds.update(dict(horizontalalignment="center", size=fontsize))
            for ia, a in enumerate(pie_axs):
                a.text(0.5, 0.5, node_labels[ia], transform=a.transAxes, **text_kwds)
    return sct
def filter_genes(
    data,
    min_counts=None,
    min_cells=None,
    max_counts=None,
    max_cells=None,
    min_counts_u=None,
    min_cells_u=None,
    max_counts_u=None,
    max_cells_u=None,
    min_shared_counts=None,
    min_shared_cells=None,
    retain_genes=None,
    copy=False,
):
    """Filter genes based on number of cells or counts.

    Keep genes that have at least `min_counts` counts or are expressed in at
    least `min_cells` cells or have at most `max_counts` counts or are expressed
    in at most `max_cells` cells.

    The thresholds are applied layer by layer: the plain parameters to the
    `spliced` layer, the `*_u` parameters to the `unspliced` layer and the
    `min_shared_*` parameters to counts in entries where both layers are
    non-zero. Genes are removed after each layer.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_counts : `int`, optional (default: `None`)
        Minimum number of counts required for a gene to pass filtering.
    min_cells : `int`, optional (default: `None`)
        Minimum number of cells expressed required for a gene to pass filtering.
    max_counts : `int`, optional (default: `None`)
        Maximum number of counts allowed for a gene to pass filtering.
    max_cells : `int`, optional (default: `None`)
        Maximum number of cells expressed allowed for a gene to pass filtering.
    min_counts_u : `int`, optional (default: `None`)
        Minimum number of unspliced counts required for a gene to pass filtering.
    min_cells_u : `int`, optional (default: `None`)
        Minimum number of unspliced cells expressed required to pass filtering.
    max_counts_u : `int`, optional (default: `None`)
        Maximum number of unspliced counts allowed for a gene to pass filtering.
    max_cells_u : `int`, optional (default: `None`)
        Maximum number of unspliced cells expressed allowed to pass filtering.
    min_shared_counts: `int`, optional (default: `None`)
        Minimum number of counts (both unspliced and spliced) required for a gene.
    min_shared_cells: `int`, optional (default: `None`)
        Minimum number of cells required to be expressed (both unspliced and
        spliced).
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds.
    copy : `bool`, optional (default: `False`)
        Determines whether a copy is returned.

    Returns
    -------
    The filtered copy if `copy=True`, otherwise `None` (`data` is filtered in
    place).
    """
    adata = data.copy() if copy else data

    # set initial cell sizes before filtering
    _set_initial_size(adata)

    if isinstance(retain_genes, str):
        retain_genes = [retain_genes]

    layers = [
        layer for layer in ["spliced", "unspliced"] if layer in adata.layers.keys()
    ]
    if min_shared_counts is not None or min_shared_cells is not None:
        layers.extend(["shared"])

    for layer in layers:
        if layer == "spliced":
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_counts,
                min_cells,
                max_counts,
                max_cells,
            )
        elif layer == "unspliced":
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_counts_u,
                min_cells_u,
                max_counts_u,
                max_cells_u,
            )
        else:  # shared counts/cells
            _min_counts, _min_cells, _max_counts, _max_cells = (
                min_shared_counts,
                min_shared_cells,
                None,
                None,
            )

        if layer in adata.layers.keys():
            X = adata.layers[layer]
        else:  # shared counts/cells
            # Sum of spliced and unspliced counts, restricted to entries where
            # both layers are non-zero.
            Xs, Xu = adata.layers["spliced"], adata.layers["unspliced"]
            nonzeros = (
                (Xs > 0).multiply(Xu > 0) if issparse(Xs) else (Xs > 0) * (Xu > 0)
            )
            X = (
                nonzeros.multiply(Xs) + nonzeros.multiply(Xu)
                if issparse(nonzeros)
                else nonzeros * (Xs + Xu)
            )

        gene_subset = np.ones(adata.n_vars, dtype=bool)

        if _min_counts is not None or _max_counts is not None:
            gene_subset &= _filter(X, min_counts=_min_counts, max_counts=_max_counts)[
                0
            ]

        if _min_cells is not None or _max_cells is not None:
            gene_subset &= _filter(X, min_cells=_min_cells, max_cells=_max_cells)[0]

        if retain_genes is not None:
            gene_subset |= adata.var_names.isin(retain_genes)

        adata._inplace_subset_var(gene_subset)

        n_filtered = np.sum(~gene_subset)
        if n_filtered > 0:
            logg.info(f"Filtered out {n_filtered} genes that are detected", end=" ")
            if _min_cells is not None or _min_counts is not None:
                logg.info(
                    f"in less than {_min_cells} cells ({layer})."
                    if _min_counts is None
                    else f"in less than {_min_counts} counts ({layer}).",
                    no_indent=True,
                )
            # Use the thresholds of the *current* layer; the spliced-only
            # `max_cells`/`max_counts` mis-reported the unspliced layer.
            if _max_cells is not None or _max_counts is not None:
                logg.info(
                    f"in more than {_max_cells} cells ({layer})."
                    if _max_counts is None
                    else f"in more than {_max_counts} counts ({layer}).",
                    no_indent=True,
                )

    return adata if copy else None
def rank_velocity_genes(
    data,
    vkey="velocity",
    n_genes=100,
    groupby=None,
    match_with=None,
    resolution=None,
    min_counts=None,
    min_r2=None,
    min_corr=None,
    min_dispersion=None,
    min_likelihood=None,
    copy=False,
):
    """Rank genes for velocity characterizing groups.

    This applies a differential expression test (Welch t-test with overestimated
    variance to be conservative) on velocity expression, to find genes in a
    cluster that show dynamics that is transcriptionally regulated differently
    compared to all other clusters (e.g. induction in that cluster and
    homeostasis in remaining population). If no clusters are given, it priorly
    computes velocity clusters by applying louvain modularity on velocity
    expression.

    .. code:: python

        scv.tl.rank_velocity_genes(adata, groupby='clusters')
        scv.pl.scatter(
            adata, basis=adata.uns['rank_velocity_genes']['names']['Beta'][:3]
        )
        pd.DataFrame(adata.uns['rank_velocity_genes']['names']).head()

    .. image:: https://user-images.githubusercontent.com/31883718/69626017-11c47980-1048-11ea-89f4-df3769df5ad5.png
       :width: 600px

    .. image:: https://user-images.githubusercontent.com/31883718/69626572-30774000-1049-11ea-871f-e8a30c42f10e.png
       :width: 600px

    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    n_genes : `int`, optional (default: 100)
        The number of genes that appear in the returned tables. Capped at the
        number of genes that pass the filters.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider. If `None` or
        `'velocity_clusters'`, velocity clusters are computed first.
    match_with: `str` or `None` (default: `None`)
        adata.obs key to separatively rank velocities on.
    resolution: `float` or `None` (default: `None`)
        Resolution for louvain modularity.
    min_counts: `float` (default: None)
        Minimum count of genes for consideration.
    min_r2: `float` (default: None)
        Minimum r2 value of genes for consideration.
    min_corr: `float` (default: None)
        Minimum Spearmans correlation coefficient between spliced and unspliced.
    min_dispersion: `float` (default: None)
        Minimum dispersion norm value of genes for consideration.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    rank_velocity_genes : `.uns`
        Structured array to be indexed by group id storing the gene names.
        Ordered according to scores.
    velocity_score : `.var`
        Storing the (float) score of each ranked gene; `0` for genes that
        appear in no ranking. For a gene ranked in several groups, the score
        of its best rank position is stored.
    """  # noqa E501

    adata = data.copy() if copy else data

    # The isinstance guard avoids an element-wise (ambiguous) comparison when
    # `groupby` is passed as a list or array.
    use_velocity_clusters = groupby is None or (
        isinstance(groupby, str) and groupby == "velocity_clusters"
    )
    if use_velocity_clusters:
        velocity_clusters(
            adata,
            vkey=vkey,
            match_with=match_with,
            resolution=resolution,
            min_likelihood=min_likelihood,
        )
        groupby = f"{vkey}_clusters"

    logg.info("ranking velocity genes", r=True)

    if "spearmans_score" not in adata.var.keys():
        corr = vcorrcoef(
            np.array(adata.layers["Ms"]).T,
            np.array(adata.layers["Mu"].T),
            mode="spearmans",
        )
        adata.var["spearmans_score"] = np.clip(corr, 0, None)

    # Successively narrow down the genes to rank.
    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(adata.layers["unspliced"]) else n_counts
        min_counts = (
            min(50, np.percentile(n_counts, 50)) if min_counts is None else min_counts
        )
        tmp_filter &= np.ravel(n_counts > min_counts)

    if f"{vkey}_r2" in adata.var.keys():
        r2 = adata.var[f"{vkey}_r2"]
        min_r2 = 0.1 if min_r2 is None else min_r2
        tmp_filter &= r2 > min_r2

    if "spearmans_score" in adata.var.keys():
        corr = adata.var["spearmans_score"]
        min_corr = 0.1 if min_corr is None else min_corr
        tmp_filter &= corr > min_corr

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = 0 if min_dispersion is None else min_dispersion
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys():
        fit_likelihood = adata.var["fit_likelihood"]
        min_likelihood = 0.1 if min_likelihood is None else min_likelihood
        tmp_filter &= fit_likelihood > min_likelihood

    # Slice once; both the velocities and the gene names are needed below.
    adata_filtered = adata[:, tmp_filter]
    X = adata_filtered.layers[vkey]
    filtered_var_names = adata_filtered.var_names
    groups, groups_masks = select_groups(adata, key=groupby)

    n_groups = groups_masks.shape[0]
    sizes = groups_masks.sum(1)

    mean, var = np.zeros((n_groups, X.shape[1])), np.zeros((n_groups, X.shape[1]))
    for i, mask in enumerate(groups_masks):
        mean[i], var[i] = get_mean_var(X[mask])

    # Cannot return more genes than passed the filters.
    n_genes = min(n_genes, X.shape[1])

    # test each against the union of all other groups
    rankings_gene_names, rankings_gene_scores = [], []
    for i in range(n_groups):
        mask_rest = ~groups_masks[i]
        mean_rest, var_rest = get_mean_var(X[mask_rest])
        # Deliberately the group's own size (not `mask_rest.sum()` as in a
        # plain t-test): this overestimates the variance of the rest.
        size_rest = sizes[i]

        scores = (mean[i] - mean_rest) / np.sqrt(
            var[i] / sizes[i] + var_rest / size_rest
        )
        scores = np.nan_to_num(scores)

        # equivalent to but much faster than np.argsort(scores)[-n_genes:]
        idx = np.argpartition(scores, -n_genes)[-n_genes:]
        idx = idx[np.argsort(scores[idx])[::-1]]

        rankings_gene_names.append(filtered_var_names[idx].values)
        rankings_gene_scores.append(scores[idx])

    rankings_gene_names = np.array([list(n) for n in rankings_gene_names])
    rankings_gene_scores = np.array([list(n) for n in rankings_gene_scores])

    all_names = rankings_gene_names.T.flatten()
    all_scores = rankings_gene_scores.T.flatten()

    # Keep the score of the first occurrence of each gene (rank-major order).
    first_score = {}
    for gene_name, gene_score in zip(all_names, all_scores):
        first_score.setdefault(gene_name, gene_score)

    # float, not int: integer storage silently truncated the t-scores.
    vscore = np.array(
        [first_score.get(name, 0.0) for name in adata.var_names], dtype=float
    )
    adata.var["velocity_score"] = vscore

    key = "rank_velocity_genes"
    adata.uns[key] = {
        "names": np.rec.fromarrays(
            [n for n in rankings_gene_names],
            dtype=[(f"{rn}", "U50") for rn in groups],
        ),
        "scores": np.rec.fromarrays(
            [n.round(2) for n in rankings_gene_scores],
            dtype=[(f"{rn}", "float32") for rn in groups],
        ),
        "params": {
            "groupby": groupby,
            "reference": "rest",
            "method": "t-test_overestim_var",
            "use_raw": True,
        },
    }

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f" '{key}', sorted scores by group ids (adata.uns) \n"
        " 'spearmans_score', spearmans correlation scores (adata.var)"
    )

    return adata if copy else None
def velocity_clusters(
    data,
    vkey="velocity",
    match_with="clusters",
    sort_by="velocity_pseudotime",
    resolution=None,
    min_likelihood=None,
    copy=False,
):
    """Computes velocity clusters via louvain on velocities.

    .. code:: python

        scv.tl.velocity_clusters(adata)
        scv.pl.scatter(adata, color='velocity_clusters')

    .. image:: https://user-images.githubusercontent.com/31883718/69625627-484dc480-1047-11ea-847f-6607a3430427.png
       :width: 600px

    Arguments
    ----------
    data : :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Key of velocities computed in `tl.velocity`
    match_with: `str` (default: `'clusters'`)
        Match the names of the velocity clusters with the names of this key (.obs).
        If the key is missing, clusters are simply numbered.
    sort_by: `str` or `None` (default: `'velocity_pseudotime'`)
        Sort velocity clusters by the mean of this key (.obs). The velocity
        pseudotime is computed if it is requested but not yet available.
    resolution: `float` (default: 0.7)
        Resolution for louvain modularity.
    min_likelihood: `float` between `0` and `1` or `None` (default: `None`)
        Only rank velocity of genes with a likelihood higher than min_likelihood.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.

    Returns
    -------
    velocity_clusters : `.obs`
        Clusters obtained from applying louvain modularity on velocity expression.
    """  # noqa E501

    adata = data.copy() if copy else data

    logg.info("computing velocity clusters", r=True)

    # Select the genes whose velocities enter the clustering.
    tmp_filter = ~np.isnan(adata.layers[vkey].sum(0))
    if f"{vkey}_genes" in adata.var.keys():
        tmp_filter &= np.array(adata.var[f"{vkey}_genes"].values, dtype=bool)

    if "unspliced" in adata.layers.keys():
        n_counts = (adata.layers["unspliced"] > 0).sum(0)
        n_counts = n_counts.A1 if issparse(adata.layers["unspliced"]) else n_counts
        min_counts = min(50, np.percentile(n_counts, 50))
        tmp_filter &= np.ravel(n_counts > min_counts)

    # NOTE(review): this checks for a column named "r2" but reads
    # `velocity_r2`; presumably f"{vkey}_r2" was intended. Left unchanged on
    # purpose, since enabling the filter would alter existing clusterings.
    if "r2" in adata.var.keys():
        r2 = adata.var.velocity_r2
        min_r2 = np.percentile(r2[r2 > 0], 50)
        tmp_filter &= r2 > min_r2

    if "dispersions_norm" in adata.var.keys():
        dispersions = adata.var.dispersions_norm
        min_dispersion = np.percentile(dispersions, 20)
        tmp_filter &= dispersions > min_dispersion

    if "fit_likelihood" in adata.var.keys() and min_likelihood is not None:
        tmp_filter &= adata.var["fit_likelihood"] > min_likelihood

    from anndata import AnnData

    # Temporary object holding the velocities as its data matrix.
    vdata = AnnData(adata.layers[vkey][:, tmp_filter])
    vdata.obs = adata.obs.copy()
    vdata.var = adata.var[tmp_filter].copy()

    if "highly_variable" in vdata.var.keys():
        vdata.var["highly_variable"] = np.array(
            vdata.var["highly_variable"], dtype=bool
        )

    import scanpy as sc

    logg.switch_verbosity("off", module="scanpy")
    try:
        sc.pp.pca(vdata, n_comps=20, svd_solver="arpack")
        sc.pp.neighbors(vdata, n_pcs=20)
        sc.tl.louvain(vdata, resolution=0.7 if resolution is None else resolution)
    finally:
        # Restore scanpy's verbosity even if one of the steps above raises.
        logg.switch_verbosity("on", module="scanpy")

    if sort_by == "velocity_pseudotime" and sort_by not in adata.obs.keys():
        velocity_pseudotime(adata, vkey=vkey)
        # `vdata.obs` was copied before the pseudotime existed; without this
        # the freshly computed values would never be used for sorting.
        if sort_by in adata.obs.keys():
            vdata.obs[sort_by] = adata.obs[sort_by]

    if sort_by in vdata.obs.keys():
        vc = vdata.obs["louvain"]
        vc_cats = vc.cat.categories
        mean_times = [np.mean(vdata.obs[sort_by][vc == cat]) for cat in vc_cats]
        # Assign the result: the `inplace` argument no longer exists in
        # pandas >= 2.0.
        vdata.obs["louvain"] = vc.cat.reorder_categories(
            vc_cats[np.argsort(mean_times)]
        )

    if isinstance(match_with, str) and match_with in adata.obs.keys():
        from .utils import most_common_in_list

        vc = vdata.obs["louvain"]
        reference = adata.obs[match_with]
        cats_nums = {cat: 0 for cat in reference.cat.categories}
        for cat in vc.cat.categories:
            cells_in_cat = np.where(vc == cat)[0]
            # `cells_in_cat` holds positions, hence `.iloc`.
            new_cat = most_common_in_list(reference.iloc[cells_in_cat])
            cats_nums[new_cat] += 1
            vc = vc.cat.rename_categories({cat: f"{new_cat} ({cats_nums[new_cat]})"})
        vdata.obs["louvain"] = vc
    else:
        # Number the clusters 0..n-1 (categories can no longer be assigned
        # directly in pandas >= 2.0).
        louvain = vdata.obs["louvain"]
        vdata.obs["louvain"] = louvain.cat.rename_categories(
            np.arange(len(louvain.cat.categories))
        )
    adata.obs[f"{vkey}_clusters"] = vdata.obs["louvain"].copy()

    del vdata

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f" '{vkey}_clusters', "
        f"clusters based on louvain modularity on velocity vector field (adata.obs)"
    )

    return adata if copy else None
def velocity_embedding(
    data,
    basis=None,
    vkey="velocity",
    scale=10,
    self_transitions=True,
    use_negative_cosines=True,
    direct_pca_projection=None,
    retain_scale=False,
    autoscale=True,
    all_comps=True,
    T=None,
    copy=False,
):
    """Projects the single cell velocities into any embedding.

    Given normalized difference of the embedding positions
    :math:`\\tilde \\delta_{ij} = \\frac{x_j-x_i}{\\left\\lVert x_j-x_i
    \\right\\rVert}`,
    the projections are obtained as expected displacements with respect to the
    transition matrix :math:`\\tilde \\pi_{ij}` as

    .. math::
        \\tilde \\nu_i = E_{\\tilde \\pi_{i\\cdot}} [\\tilde \\delta_{i \\cdot}]
        = \\sum_{j \\neq i} \\left( \\tilde \\pi_{ij} - \\frac1n \\right)
        \\tilde \\delta_{ij}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    basis: `str` or `None` (default: `None`)
        Which embedding to use. If `None`, `'pca'` is used when
        `direct_pca_projection` is set, otherwise the last available one of
        `'pca'`, `'tsne'`, `'umap'` found in `.obsm`.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    scale: `int` (default: 10)
        Scale parameter of gaussian kernel for transition matrix.
    self_transitions: `bool` (default: `True`)
        Whether to allow self transitions, based on the confidences of transitioning
        to neighboring cells.
    use_negative_cosines: `bool` (default: `True`)
        Whether to project cell-to-cell transitions with negative cosines into
        negative/opposite direction.
    direct_pca_projection: `bool` (default: `None`)
        Whether to directly project the velocities into PCA space,
        thus skipping the velocity graph.
    retain_scale: `bool` (default: `False`)
        Whether to retain scale from high dimensional space in embedding.
    autoscale: `bool` (default: `True`)
        Whether to scale the embedded velocities by a scalar multiplier,
        which simply ensures that the arrows in the embedding are properly scaled.
    all_comps: `bool` (default: `True`)
        Whether to compute the velocities on all embedding components
        (otherwise only the first two are used).
    T: `csr_matrix` (default: `None`)
        Allows the user to directly pass a transition matrix. The passed matrix
        is copied before its diagonal is cleared, so it is left untouched.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    velocity_umap: `.obsm`
        coordinates of velocity projection on embedding (e.g., basis='umap')

    Raises
    ------
    ValueError
        If no basis is given and none can be found, or if the requested
        embedding is not present in `.obsm`.
    """
    adata = data.copy() if copy else data

    if basis is None:
        keys = [
            key for key in ["pca", "tsne", "umap"] if f"X_{key}" in adata.obsm.keys()
        ]
        if len(keys) > 0:
            basis = "pca" if direct_pca_projection else keys[-1]
        else:
            raise ValueError("No basis specified")

    if f"X_{basis}" not in adata.obsm_keys():
        raise ValueError("You need to compute the embedding first.")

    if direct_pca_projection and "pca" in basis:
        logg.warn(
            "Directly projecting velocities into PCA space is for exploratory analysis "
            "on principal components.\n"
            " It does not reflect the actual velocity field from high "
            "dimensional gene expression space.\n"
            " To visualize velocities, consider applying "
            "`direct_pca_projection=False`.\n"
        )

    logg.info("computing velocity embedding", r=True)

    # Restrict to velocity genes (if annotated) whose velocities contain no NaN.
    V = np.array(adata.layers[vkey])
    vgenes = np.ones(adata.n_vars, dtype=bool)
    if f"{vkey}_genes" in adata.var.keys():
        vgenes &= np.array(adata.var[f"{vkey}_genes"], dtype=bool)
    vgenes &= ~np.isnan(V.sum(0))
    V = V[:, vgenes]

    if direct_pca_projection and "pca" in basis:
        PCs = adata.varm["PCs"] if all_comps else adata.varm["PCs"][:, :2]
        PCs = PCs[vgenes]

        X_emb = adata.obsm[f"X_{basis}"]
        V_emb = (V - V.mean(0)).dot(PCs)

    else:
        X_emb = (
            adata.obsm[f"X_{basis}"] if all_comps else adata.obsm[f"X_{basis}"][:, :2]
        )
        V_emb = np.zeros(X_emb.shape)

        if T is None:
            T = transition_matrix(
                adata,
                vkey=vkey,
                scale=scale,
                self_transitions=self_transitions,
                use_negative_cosines=use_negative_cosines,
            )
        else:
            # The diagonal is cleared in place below; copy so that the
            # caller's transition matrix is not modified.
            T = T.copy()
        T.setdiag(0)
        T.eliminate_zeros()

        # Dense row access is only used for smaller data sets.
        densify = adata.n_obs < 1e4
        TA = T.toarray() if densify else None

        with warnings.catch_warnings():
            # divisions by zero norms are expected and handled via the NaN reset
            warnings.simplefilter("ignore")
            for i in range(adata.n_obs):
                indices = T[i].indices
                dX = X_emb[indices] - X_emb[i, None]  # shape (n_neighbors, 2)
                if not retain_scale:
                    dX /= l2_norm(dX)[:, None]
                dX[np.isnan(dX)] = 0  # zero diff in a steady-state
                probs = TA[i, indices] if densify else T[i].data
                V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0)

        if retain_scale:
            X = (
                adata.layers["Ms"]
                if "Ms" in adata.layers.keys()
                else adata.layers["spliced"]
            )
            delta = T.dot(X[:, vgenes]) - X[:, vgenes]
            if issparse(delta):
                delta = delta.toarray()
            cos_proj = (V * delta).sum(1) / l2_norm(delta)
            V_emb *= np.clip(cos_proj[:, None] * 10, 0, 1)

    if autoscale:
        V_emb /= 3 * quiver_autoscale(X_emb, V_emb)

    # Keep track of the embeddings for which velocities have been projected.
    if f"{vkey}_params" in adata.uns.keys():
        adata.uns[f"{vkey}_params"]["embeddings"] = (
            []
            if "embeddings" not in adata.uns[f"{vkey}_params"]
            else list(adata.uns[f"{vkey}_params"]["embeddings"])
        )
        adata.uns[f"{vkey}_params"]["embeddings"].extend([basis])

    vkey += f"_{basis}"
    adata.obsm[vkey] = V_emb

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint("added\n" f" '{vkey}', embedded velocity vectors (adata.obsm)")

    return adata if copy else None
def velocity_graph(
    data,
    vkey="velocity",
    xkey="Ms",
    tkey=None,
    basis=None,
    n_neighbors=None,
    n_recurse_neighbors=None,
    random_neighbors_at_max=None,
    sqrt_transform=None,
    variance_stabilization=None,
    gene_subset=None,
    compute_uncertainties=None,
    approx=None,
    mode_neighbors="distances",
    copy=False,
    n_jobs=None,
    backend="loky",
):
    """Computes velocity graph based on cosine similarities.

    The cosine similarities are computed between velocities and potential cell state
    transitions, i.e. it measures how well a corresponding change in gene expression
    :math:`\\delta_{ij} = x_j - x_i` matches the predicted change according to the
    velocity vector :math:`\\nu_i`,

    .. math::
        \\pi_{ij} = \\cos\\angle(\\delta_{ij}, \\nu_i)
        = \\frac{\\delta_{ij}^T \\nu_i}{\\left\\lVert\\delta_{ij}\\right\\rVert
        \\left\\lVert \\nu_i \\right\\rVert}.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used. If the layer is missing,
        velocities are computed first.
    xkey: `str` (default: `'Ms'`)
        Layer key to extract count data from.
    tkey: `str` (default: `None`)
        Observation key to extract time data from.
    basis: `str` (default: `None`)
        Basis / Embedding to use. A warning is issued when one is given.
    n_neighbors: `int` or `None` (default: None)
        Use fixed number of neighbors or do recursive neighbor search (if `None`).
    n_recurse_neighbors: `int` (default: `None`)
        Number of recursions for neighbors search. Defaults to
        2 if mode_neighbors is 'distances', and 1 if mode_neighbors is 'connectivities'.
    random_neighbors_at_max: `int` or `None` (default: `None`)
        If number of iterative neighbors for an individual cell is higher than this
        threshold, a random selection of such are chosen as reference neighbors.
    sqrt_transform: `bool` or `None` (default: `None`)
        Whether to variance-transform the cell states changes
        and velocities before computing cosine similarities.
        If `None`, the value of `variance_stabilization` is used.
    variance_stabilization: `bool` or `None` (default: `None`)
        Fallback value for `sqrt_transform` when that is `None`.
    gene_subset: `list` of `str`, subset of adata.var_names or `None` (default: `None`)
        Subset of genes to compute velocity graph on exclusively.
    compute_uncertainties: `bool` (default: `None`)
        Whether to compute uncertainties along with cosine correlation.
    approx: `bool` or `None` (default: `None`)
        If True, first 30 pc's are used instead of the full count matrix
    mode_neighbors: 'str' (default: `'distances'`)
        Determines the type of KNN graph used. Options are 'distances' or
        'connectivities'. The latter yields a symmetric graph.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.
    n_jobs: `int` or `None` (default: `None`)
        Number of parallel jobs.
    backend: `str` (default: "loky")
        Backend used for multiprocessing. See :class:`joblib.Parallel` for valid
        options.

    Returns
    -------
    velocity_graph: `.uns`
        sparse matrix with correlations of cell state transitions with velocities
    """
    adata = data.copy() if copy else data
    verify_neighbors(adata)

    # Velocities are a prerequisite; estimate them on the fly if absent.
    if vkey not in adata.layers.keys():
        velocity(adata, vkey=vkey)
    if sqrt_transform is None:
        sqrt_transform = variance_stabilization

    graph_options = dict(
        vkey=vkey,
        xkey=xkey,
        tkey=tkey,
        basis=basis,
        n_neighbors=n_neighbors,
        approx=approx,
        n_recurse_neighbors=n_recurse_neighbors,
        random_neighbors_at_max=random_neighbors_at_max,
        sqrt_transform=sqrt_transform,
        gene_subset=gene_subset,
        compute_uncertainties=compute_uncertainties,
        report=True,
        mode_neighbors=mode_neighbors,
    )
    vgraph = VelocityGraph(adata, **graph_options)

    if isinstance(basis, str):
        logg.warn(
            f"The velocity graph is computed on {basis} embedding coordinates.\n"
            f" Consider computing the graph in an unbiased manner \n"
            f" on full expression space by not specifying basis.\n"
        )

    n_jobs = get_n_jobs(n_jobs=n_jobs)
    logg.info(
        f"computing velocity graph (using {n_jobs}/{os.cpu_count()} cores)", r=True
    )
    vgraph.compute_cosines(n_jobs=n_jobs, backend=backend)

    # Store positive and negative cosine graphs, plus optional extras.
    graph_key = f"{vkey}_graph"
    adata.uns[graph_key] = vgraph.graph
    adata.uns[f"{graph_key}_neg"] = vgraph.graph_neg
    if vgraph.uncertainties is not None:
        adata.uns[f"{graph_key}_uncertainties"] = vgraph.uncertainties
    adata.obs[f"{vkey}_self_transition"] = vgraph.self_prob

    # Record graph parameters; previously listed embeddings are dropped
    # from the parameter record.
    params_key = f"{vkey}_params"
    if params_key not in adata.uns.keys():
        adata.uns[params_key] = {}
    elif "embeddings" in adata.uns[params_key]:
        del adata.uns[params_key]["embeddings"]
    adata.uns[params_key]["mode_neighbors"] = mode_neighbors
    adata.uns[params_key]["n_recurse_neighbors"] = vgraph.n_recurse_neighbors

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f" '{vkey}_graph', sparse matrix with cosine correlations (adata.uns)"
    )

    if copy:
        return adata
    return None
def normalize_per_cell(
    data,
    counts_per_cell_after=None,
    counts_per_cell=None,
    key_n_counts=None,
    max_proportion_per_cell=None,
    use_initial_size=True,
    layers=None,
    enforce=None,
    copy=False,
):
    """Normalize each cell by total counts over all genes.

    Parameters
    ----------
    data : :class:`~anndata.AnnData`
        The annotated data matrix of shape `n_obs` x `n_vars`. Rows correspond
        to cells and columns to genes.
    counts_per_cell_after : `float` or `None`, optional (default: `None`)
        If `None`, after normalization, each cell has a total count equal
        to the median of the *counts_per_cell* before normalization.
    counts_per_cell : `np.array` or `str`, optional (default: `None`)
        Precomputed counts per cell, or the name of a column in `adata.obs`
        holding them.
    key_n_counts : `str`, optional (default: `'n_counts'`)
        Name of the field in `adata.obs` where the total counts per cell are
        stored.
    max_proportion_per_cell : `float` (default: `None`)
        Exclude genes counts that account for more than
        a specific proportion of cell size, e.g. 0.05. Only values strictly
        between 0 and 1 take effect.
    use_initial_size : `bool` (default: `True`)
        Whether to use initial cell sizes or actual cell sizes.
    layers : `str` or `list` (default: `['spliced', 'unspliced']`)
        Keys for layers to be also considered for normalization. `'all'`
        selects every layer. `X` is always included.
    enforce : `bool` or `None` (default: `None`)
        Normalize even if the data looks processed already.
    copy : `bool`, optional (default: `False`)
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned.

    Returns
    -------
    Returns or updates `adata` with normalized counts.
    """
    adata = data.copy() if copy else data

    # Resolve the layers to process; "X" always comes first.
    if layers is None:
        layers = ["spliced", "unspliced"]
    elif layers == "all":
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        layers = [layers]
    layer_keys = ["X"] + [key for key in layers if key in adata.layers.keys()]
    normalized = []

    # A string refers to an obs column, created on demand from initial sizes.
    if isinstance(counts_per_cell, str):
        if counts_per_cell not in adata.obs.keys():
            _set_initial_size(adata, layer_keys)
        if counts_per_cell in adata.obs.keys():
            counts_per_cell = adata.obs[counts_per_cell].values
        else:
            counts_per_cell = None

    for layer in layer_keys:
        check_if_valid_dtype(adata, layer)
        X = adata.X if layer == "X" else adata.layers[layer]

        if not (not_yet_normalized(X) or enforce):
            logg.warn(
                f"Did not normalize {layer} as it looks processed already. "
                "To enforce normalization, set `enforce=True`."
            )
            continue

        if counts_per_cell is not None:
            counts = counts_per_cell
        elif use_initial_size:
            counts = _get_initial_size(adata, layer)
        else:
            counts = _get_size(adata, layer)

        if max_proportion_per_cell is not None and (0 < max_proportion_per_cell < 1):
            counts = counts_per_cell_quantile(X, max_proportion_per_cell, counts)

        # equivalent to sc.pp.normalize_per_cell(X, counts_per_cell_after, counts)
        if counts_per_cell_after is None:
            counts_after = np.median(counts)
        else:
            counts_after = counts_per_cell_after
        counts_after += counts_after == 0
        counts = counts / counts_after
        counts += counts == 0  # to avoid division by zero

        if issparse(X):
            sparsefuncs.inplace_row_scale(X, 1 / counts)
        else:
            X /= np.array(counts[:, None])
        normalized.append(layer)

        wants_gene_corr = (
            layer == "X"
            and "gene_count_corr" not in adata.var.keys()
            and X.shape[-1] > 3e3
        )
        if wants_gene_corr:
            # best effort only: failures are deliberately ignored
            try:
                adata.var["gene_count_corr"] = np.round(
                    csr_vcorrcoef(X.T, np.ravel((X > 0).sum(1))), 4
                )
            except Exception:
                pass

    obs_key = "n_counts" if key_n_counts is None else key_n_counts
    adata.obs[obs_key] = _get_size(adata)

    if normalized:
        logg.info("Normalized count data:", f"{', '.join(normalized)}.")

    if copy:
        return adata
    return None
def cell_origin(
    data,
    groupby="clusters",
    disconnected_groups=None,
    self_transitions=False,
    n_neighbors=None,
    copy=False,
):
    """Computes individual cell root points

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    groupby: `str` (default: `'clusters'`)
        Key to which to assign the fates.
    disconnected_groups: list of `str` (default: `None`)
        Which groups to treat as disconnected for fate assignment.
        Cells of these groups keep their own group as origin.
    n_neighbors: `int` (default: `None`)
        Number of neighbors to restrict transitions to (10 if `None`).
    self_transitions: `bool` (default: `False`)
        Whether to include self-transitions.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.

    Returns
    -------
    cell_origin: `.obs`
        most likely cell origin for each individual cell
    cell_origin_confidence: `.obs`
        confidence of coming from assigned origin

    The annotated copy is returned if `copy=True`, otherwise `None`.
    """
    adata = data.copy() if copy else data
    logg.info("computing cell fates", r=True)

    n_neighbors = 10 if n_neighbors is None else n_neighbors

    # The velocity graph is recomputed on a scratch copy so that the graph
    # stored on `adata` is left untouched.
    _adata = adata.copy()
    vgraph = VelocityGraph(
        _adata, n_neighbors=n_neighbors, approx=True, n_recurse_neighbors=1
    )
    vgraph.compute_cosines()
    _adata.uns["velocity_graph"] = vgraph.graph
    _adata.uns["velocity_graph_neg"] = vgraph.graph_neg

    # Backward transition matrix; (I - T)^-1 is dense with shape (n_obs, n_obs).
    T = transition_matrix(_adata, self_transitions=self_transitions, backward=True)
    fate = np.linalg.inv(np.eye(_adata.n_obs) - T)
    if issparse(T):
        fate = fate.A
    cell_fates = np.array(_adata.obs[groupby][fate.argmax(1)])
    if disconnected_groups is not None:
        idx = _adata.obs[groupby].isin(disconnected_groups)
        cell_fates[idx] = _adata.obs[groupby][idx]

    adata.obs["cell_origin"] = cell_fates
    adata.obs["cell_origin_confidence"] = fate.max(1) / fate.sum(1)
    strings_to_categoricals(adata)

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        " 'cell_origin', most likely cell origin (adata.obs)\n"
        " 'cell_origin_confidence', confidence of assigned origin (adata.obs)"
    )

    # Without this return, `copy=True` would discard the annotated copy.
    return adata if copy else None
def neighbors(
    adata,
    n_neighbors=30,
    n_pcs=None,
    use_rep=None,
    use_highly_variable=True,
    knn=True,
    random_state=0,
    method="umap",
    metric="euclidean",
    metric_kwds=None,
    num_threads=-1,
    copy=False,
):
    """
    Compute a neighborhood graph of observations.

    The neighbor graph methods (umap, hnsw, sklearn) only differ in runtime and
    yield the same result as scanpy [Wolf18]_. Connectivities are computed with
    adaptive kernel width as proposed in Haghverdi et al. 2016 (doi:10.1038/nmeth.3971).

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_neighbors
        The size of local neighborhood (in terms of number of neighboring data
        points) used for manifold approximation. Larger values result in more
        global views of the manifold, while smaller values result in more local
        data being preserved. In general values should be in the range 2 to 100.
        If `knn` is `True`, number of nearest neighbors to be searched. If `knn`
        is `False`, a Gaussian kernel width is set to the distance of the
        `n_neighbors` neighbor.
    n_pcs : `int` or `None` (default: None)
        Number of principal components to use.
        If not specified, the full space is used of a pre-computed PCA,
        or 30 components are used when PCA is computed internally.
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for .n_vars < 50 (or `n_pcs == 0`), .X is used,
        otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    knn
        If `True`, use a hard threshold to restrict the number of neighbors to
        `n_neighbors`, that is, consider a knn graph. Otherwise, use a Gaussian
        Kernel to assign low weights to neighbors more distant than the
        `n_neighbors` nearest neighbor.
    random_state
        A numpy random seed.
    method : {'umap', 'hnsw', 'sklearn'} (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        The 'hnsw' method is most efficient and requires to `pip install hnswlib`.
        Connectivities are computed with adaptive kernel.
    metric
        A known metric's name or a callable that returns a distance.
    metric_kwds
        Options for the metric.
    num_threads
        Number of threads to be used (for runtime).
    copy
        Return a copy instead of writing to adata.

    Returns
    -------
    connectivities : `.obsp`
        Sparse weighted adjacency matrix of the neighborhood graph of data
        points. Weights should be interpreted as connectivities.
    distances : `.obsp`
        Sparse matrix of distances for each pair of neighbors.
    """
    adata = adata.copy() if copy else adata

    # Resolve which representation the graph is built on.
    if use_rep is None:
        use_rep = "X" if adata.n_vars < 50 or n_pcs == 0 else "X_pca"
        n_pcs = None if use_rep == "X" else n_pcs
    elif use_rep not in adata.obsm.keys() and f"X_{use_rep}" in adata.obsm.keys():
        use_rep = f"X_{use_rep}"

    if use_rep == "X_pca":
        if (
            "X_pca" not in adata.obsm.keys()
            or n_pcs is not None
            and n_pcs > adata.obsm["X_pca"].shape[1]
        ):
            # PCA is missing or has too few components: (re)compute it.
            n_vars = (
                np.sum(adata.var["highly_variable"])
                if use_highly_variable and "highly_variable" in adata.var.keys()
                else adata.n_vars
            )
            n_comps = min(30 if n_pcs is None else n_pcs, n_vars - 1, adata.n_obs - 1)
            use_highly_variable &= "highly_variable" in adata.var.keys()
            pca(
                adata,
                n_comps=n_comps,
                use_highly_variable=use_highly_variable,
                svd_solver="arpack",
            )
        elif n_pcs is None and adata.obsm["X_pca"].shape[1] < 10:
            logg.warn(
                f"Neighbors are computed on {adata.obsm['X_pca'].shape[1]} "
                f"principal components only."
            )

        n_duplicate_cells = len(get_duplicate_cells(adata))
        if n_duplicate_cells > 0:
            logg.warn(
                f"You seem to have {n_duplicate_cells} duplicate cells in your data.",
                "Consider removing these via pp.remove_duplicate_cells.",
            )

    if metric_kwds is None:
        metric_kwds = {}

    logg.info("computing neighbors", r=True)

    if method == "sklearn":
        from sklearn.neighbors import NearestNeighbors

        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = NearestNeighbors(
            n_neighbors=n_neighbors - 1,
            metric=metric,
            metric_params=metric_kwds,
            n_jobs=num_threads,
        )
        neighbors.fit(X if n_pcs is None else X[:, :n_pcs])
        knn_distances, neighbors.knn_indices = neighbors.kneighbors()
        knn_distances, neighbors.knn_indices = set_diagonal(
            knn_distances, neighbors.knn_indices
        )
        neighbors.distances, neighbors.connectivities = compute_connectivities_umap(
            neighbors.knn_indices, knn_distances, X.shape[0], n_neighbors=n_neighbors
        )

    elif method == "hnsw":
        X = adata.X if use_rep == "X" else adata.obsm[use_rep]
        neighbors = FastNeighbors(n_neighbors=n_neighbors, num_threads=num_threads)
        neighbors.fit(
            X if n_pcs is None else X[:, :n_pcs],
            metric=metric,
            random_state=random_state,
            **metric_kwds,
        )

    else:
        logg.switch_verbosity("off", module="scanpy")
        try:
            with warnings.catch_warnings():  # ignore numba warning (umap/issues/252)
                warnings.simplefilter("ignore")
                neighbors = Neighbors(adata)
                neighbors.compute_neighbors(
                    n_neighbors=n_neighbors,
                    knn=knn,
                    n_pcs=n_pcs,
                    method=method,
                    use_rep=use_rep,
                    random_state=random_state,
                    metric=metric,
                    metric_kwds=metric_kwds,
                    write_knn_indices=True,
                )
        finally:
            # Restore scanpy logging even if the computation raised, so a
            # failure does not leave verbosity switched off.
            logg.switch_verbosity("on", module="scanpy")

    adata.uns["neighbors"] = {}
    try:
        adata.obsp["distances"] = neighbors.distances
        adata.obsp["connectivities"] = neighbors.connectivities
        adata.uns["neighbors"]["connectivities_key"] = "connectivities"
        adata.uns["neighbors"]["distances_key"] = "distances"
    except Exception:
        # Fallback: store the graphs in `.uns` when writing to `.obsp` fails
        # (presumably AnnData versions without `.obsp` -- TODO confirm).
        adata.uns["neighbors"]["distances"] = neighbors.distances
        adata.uns["neighbors"]["connectivities"] = neighbors.connectivities

    if hasattr(neighbors, "knn_indices"):
        adata.uns["neighbors"]["indices"] = neighbors.knn_indices
    adata.uns["neighbors"]["params"] = {
        "n_neighbors": n_neighbors,
        "method": method,
        "metric": metric,
        "n_pcs": n_pcs,
        "use_rep": use_rep,
    }

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        " 'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)"
    )

    return adata if copy else None
def moments(
    data,
    n_neighbors=30,
    n_pcs=None,
    mode="connectivities",
    method="umap",
    use_rep=None,
    use_highly_variable=True,
    copy=False,
):
    """Computes moments for velocity estimation.

    First-/second-order moments are computed for each cell across its nearest
    neighbors, where the neighbor graph is obtained from euclidean distances in
    PCA space.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use. The neighbor graph is recomputed only if
        this exceeds the number of neighbors of the existing graph.
    n_pcs: `int` (default: None)
        Number of principal components to use.
        If not specified, the full space is used of a pre-computed PCA,
        or 30 components are used when PCA is computed internally.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Distance metric to use for moment computation.
    method : {'umap', 'hnsw', 'sklearn', `None`} (default: `'umap'`)
        Method to compute neighbors, only differs in runtime.
        Connectivities are computed with adaptive kernel width as proposed in
        Haghverdi et al. 2016 (https://doi.org/10.1038/nmeth.3971).
    use_rep : `None`, `'X'` or any key for `.obsm` (default: None)
        Use the indicated representation. If `None`, the representation is chosen
        automatically: for .n_vars < 50, .X is used, otherwise `'X_pca'` is used.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.
    """
    adata = data.copy() if copy else data

    # Normalize first if any of the un/spliced layers still holds raw counts.
    layers = [layer for layer in {"spliced", "unspliced"} if layer in adata.layers]
    if any([not_yet_normalized(adata.layers[layer]) for layer in layers]):
        normalize_per_cell(adata)

    if n_neighbors is not None and n_neighbors > get_n_neighs(adata):
        neighbors(
            adata,
            n_neighbors=n_neighbors,
            use_rep=use_rep,
            use_highly_variable=use_highly_variable,
            n_pcs=n_pcs,
            method=method,
        )
    verify_neighbors(adata)

    if "spliced" not in adata.layers.keys() or "unspliced" not in adata.layers.keys():
        logg.warn("Skipping moments, because un/spliced counts were not found.")
    else:
        logg.info(f"computing moments based on {mode}", r=True)
        connectivities = get_connectivities(
            adata, mode, n_neighbors=n_neighbors, recurse_neighbors=False
        )

        # `.toarray()` rather than `.A`: the latter is unavailable on SciPy
        # sparse arrays, while `.toarray()` works for matrices and arrays alike.
        adata.layers["Ms"] = (
            csr_matrix.dot(connectivities, csr_matrix(adata.layers["spliced"]))
            .astype(np.float32)
            .toarray()
        )
        adata.layers["Mu"] = (
            csr_matrix.dot(connectivities, csr_matrix(adata.layers["unspliced"]))
            .astype(np.float32)
            .toarray()
        )

        logg.info(
            " finished", time=True, end=" " if settings.verbosity > 2 else "\n"
        )
        logg.hint(
            "added \n"
            " 'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)"
        )

    return adata if copy else None
def velocity(
    data,
    vkey="velocity",
    mode="stochastic",
    fit_offset=False,
    fit_offset2=False,
    filter_genes=False,
    groups=None,
    groupby=None,
    groups_for_fit=None,
    constrain_ratio=None,
    use_raw=False,
    use_latent_time=None,
    perc=[5, 95],
    min_r2=1e-2,
    min_likelihood=1e-3,
    r2_adjusted=None,
    use_highly_variable=True,
    diff_kinetics=None,
    copy=False,
    **kwargs,
):
    """Estimates velocities in a gene-specific manner.

    The steady-state model [Manno18]_ determines velocities by quantifying how
    observations deviate from a presumed steady-state equilibrium ratio of unspliced to
    spliced mRNA levels. This steady-state ratio is obtained by performing a linear
    regression restricting the input data to the extreme quantiles. By including
    second-order moments, the stochastic model [Bergen19]_ exploits not only the balance
    of unspliced to spliced mRNA levels but also their covariation. By contrast, the
    likelihood-based dynamical model [Bergen19]_ solves the full splicing kinetics and
    generalizes RNA velocity estimation to transient systems. It is also capable of
    capturing non-observed steady states.

    .. image:: https://user-images.githubusercontent.com/31883718/69636491-ff057100-1056-11ea-90b7-d04098112ce1.png

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities
        for `velocity_graph` and `velocity_embedding`.
    mode: `'deterministic'`, `'stochastic'` or `'dynamical'` (default: `'stochastic'`)
        Whether to run the estimation using the steady-state/deterministic,
        stochastic or dynamical model of transcriptional dynamics.
        The dynamical model requires to run `tl.recover_dynamics` first;
        otherwise the stochastic model is used as fallback.
    fit_offset: `bool` (default: `False`)
        Whether to fit with offset for first order moment dynamics.
    fit_offset2: `bool`, (default: `False`)
        Whether to fit with offset for second order moment dynamics.
    filter_genes: `bool` (default: `False`)
        Whether to remove genes that are not used for further velocity analysis.
    groups: `str`, `list` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which velocity analysis shall be restricted.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider.
    groups_for_fit: `str`, `list` or `np.ndarray` (default: `None`)
        Subset of groups, e.g. ['g1', 'g2', 'g3'],
        to which steady-state fitting shall be restricted.
    constrain_ratio: `float` or tuple of type `float` or None: (default: `None`)
        Bounds for the steady-state ratio.
    use_raw: `bool` (default: `False`)
        Whether to use raw data for estimation.
    use_latent_time: `bool`or `None` (default: `None`)
        Whether to use latent time as a regularization for velocity estimation.
    perc: `float` or `list` (default: `[5, 95]`)
        Percentile, e.g. 98, for extreme quantile fit.
    min_r2: `float` (default: 0.01)
        Minimum threshold for coefficient of determination
    min_likelihood: `float` (default: 0.001)
        Minimal likelihood for velocity genes to fit the model on.
    r2_adjusted: `bool` (default: `None`)
        Whether to compute coefficient of determination
        on full data fit (adjusted) or extreme quantile fit (None)
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in .var['highly_variable'].
    diff_kinetics: `bool`, `str` or `None` (default: `None`)
        If set, velocities of genes annotated in the given `.var` column
        (`'fit_diff_kinetics'` unless a string is passed) are set to zero for
        the cells belonging to the clusters listed for that gene.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `adata`.
    **kwargs
        Only used by the dynamical modes: merged on top of the parameters
        stored in `adata.uns['recover_dynamics']` and, for `'dynamical'`,
        forwarded to `get_divergence`.

    Returns
    -------
    velocity: `.layers`
        velocity vectors for each individual cell
    velocity_genes, velocity_beta, velocity_gamma, velocity_r2: `.var`
        parameters

    Raises
    ------
    ValueError
        If `mode` is not one of the supported models.
    """  # noqa E501
    adata = data.copy() if copy else data

    # The default for `perc` is a shared mutable list that ends up in
    # `adata.uns`; copy it so stored parameters never alias the default.
    if isinstance(perc, list):
        perc = list(perc)

    if not use_raw and "Ms" not in adata.layers.keys():
        moments(adata)

    logg.info("computing velocities", r=True)

    strings_to_categoricals(adata)

    if mode is None or (mode == "dynamical" and "fit_alpha" not in adata.var.keys()):
        mode = "stochastic"
        logg.warn(
            "Falling back to stochastic model. "
            "For the dynamical model run tl.recover_dynamics first."
        )

    if mode in {"dynamical", "dynamical_residuals"}:
        from .dynamical_model_utils import get_divergence, get_reads, get_vars

        # Only genes with recovered dynamics (non-NaN alpha) are considered.
        gene_subset = ~np.isnan(adata.var["fit_alpha"].values)
        vdata = adata[:, gene_subset]
        alpha, beta, gamma, scaling, t_ = get_vars(vdata)

        connect = not adata.uns["recover_dynamics"]["use_raw"]
        kwargs_ = {
            "kernel_width": None,
            "normalized": True,
            "var_scale": True,
            "reg_par": None,
            "min_confidence": 1e-2,
            "constraint_time_increments": False,
            "fit_steady_states": True,
            "fit_basal_transcription": None,
            "use_connectivities": connect,
            "time_connectivities": connect,
            "use_latent_time": use_latent_time,
        }
        # Precedence: defaults < stored recover_dynamics params < user kwargs.
        kwargs_.update(adata.uns["recover_dynamics"])
        kwargs_.update(**kwargs)

        if "residuals" in mode:
            u, s = get_reads(vdata, use_raw=adata.uns["recover_dynamics"]["use_raw"])
            if kwargs_["fit_basal_transcription"]:
                u, s = u - adata.var["fit_u0"], s - adata.var["fit_s0"]
            o = vdata.layers["fit_t"] < t_
            vt = u * beta - s * gamma  # ds/dt
            wt = (alpha * o - beta * u) * scaling  # du/dt
        else:
            vt, wt = get_divergence(vdata, mode="velocity", **kwargs_)

        vgenes = adata.var.fit_likelihood > min_likelihood
        if min_r2 is not None:
            if "fit_r2" not in adata.var.keys():
                velo = Velocity(
                    adata,
                    groups_for_fit=groups_for_fit,
                    groupby=groupby,
                    constrain_ratio=constrain_ratio,
                    min_r2=min_r2,
                    use_highly_variable=use_highly_variable,
                    use_raw=use_raw,
                )
                velo.compute_deterministic(fit_offset=fit_offset, perc=perc)
                adata.var["fit_r2"] = velo._r2
            vgenes &= adata.var.fit_r2 > min_r2

        # Discard genes whose fitted scaling lies outside the accepted range.
        lb, ub = np.nanpercentile(adata.var.fit_scaling, [10, 90])
        vgenes = (
            vgenes
            & (adata.var.fit_scaling > np.min([lb, 0.03]))
            & (adata.var.fit_scaling < np.max([ub, 3]))
        )

        adata.var[f"{vkey}_genes"] = vgenes

        # Genes without recovered dynamics keep NaN velocities.
        adata.layers[vkey] = np.ones(adata.shape) * np.nan
        adata.layers[vkey][:, gene_subset] = vt

        adata.layers[f"{vkey}_u"] = np.ones(adata.shape) * np.nan
        adata.layers[f"{vkey}_u"][:, gene_subset] = wt

        if filter_genes and len(set(vgenes)) > 1:
            adata._inplace_subset_var(vgenes)

    elif mode in {"steady_state", "deterministic", "stochastic"}:
        # Fit per category only if a grouping is given without any explicit
        # group restriction; otherwise run a single fit.
        categories = (
            adata.obs[groupby].cat.categories
            if groupby is not None and groups is None and groups_for_fit is None
            else [None]
        )

        for cat in categories:
            groups = cat if cat is not None else groups

            cell_subset = groups_to_bool(adata, groups, groupby)
            _adata = adata if groups is None else adata[cell_subset]
            velo = Velocity(
                _adata,
                groups_for_fit=groups_for_fit,
                groupby=groupby,
                constrain_ratio=constrain_ratio,
                min_r2=min_r2,
                r2_adjusted=r2_adjusted,
                use_highly_variable=use_highly_variable,
                use_raw=use_raw,
            )
            velo.compute_deterministic(fit_offset=fit_offset, perc=perc)

            if mode == "stochastic":
                if filter_genes and len(set(velo._velocity_genes)) > 1:
                    # Subsetting changes the gene axis, so the estimator is
                    # rebuilt on the reduced data with matching residuals.
                    adata._inplace_subset_var(velo._velocity_genes)
                    residual = velo._residual[:, velo._velocity_genes]
                    _adata = adata if groups is None else adata[cell_subset]
                    velo = Velocity(
                        _adata,
                        residual=residual,
                        groups_for_fit=groups_for_fit,
                        groupby=groupby,
                        constrain_ratio=constrain_ratio,
                        use_highly_variable=use_highly_variable,
                    )
                velo.compute_stochastic(fit_offset, fit_offset2, mode, perc=perc)

            write_residuals(adata, vkey, velo._residual, cell_subset)
            write_residuals(adata, f"variance_{vkey}", velo._residual2, cell_subset)
            write_pars(adata, vkey, velo.get_pars(), velo.get_pars_names(), add_key=cat)

            if filter_genes and len(set(velo._velocity_genes)) > 1:
                adata._inplace_subset_var(velo._velocity_genes)

    else:
        raise ValueError(
            "Mode can only be one of these: deterministic, stochastic or dynamical."
        )

    if f"{vkey}_genes" in adata.var.keys() and np.sum(adata.var[f"{vkey}_genes"]) < 10:
        logg.warn(
            "Too few genes are selected as velocity genes. "
            "Consider setting a lower threshold for min_r2 or min_likelihood."
        )

    if diff_kinetics:
        if not isinstance(diff_kinetics, str):
            diff_kinetics = "fit_diff_kinetics"
        if diff_kinetics in adata.var.keys():
            if diff_kinetics in adata.uns["recover_dynamics"]:
                groupby = adata.uns["recover_dynamics"]["fit_diff_kinetics"]
            else:
                groupby = "clusters"
            clusters = adata.obs[groupby]
            for i, v in enumerate(np.array(adata.var[diff_kinetics].values, dtype=str)):
                if len(v) > 0 and v != "nan":
                    # 0 for cells in the listed clusters, 1 otherwise
                    idx = 1 - clusters.isin([a.strip() for a in v.split(",")])
                    adata.layers[vkey][:, i] *= idx
                    if mode == "dynamical":
                        adata.layers[f"{vkey}_u"][:, i] *= idx

    adata.uns[f"{vkey}_params"] = {"mode": mode, "fit_offset": fit_offset, "perc": perc}

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        f" '{vkey}', velocity vectors for each individual cell (adata.layers)"
    )

    return adata if copy else None
def paga(
    adata,
    groups=None,
    vkey="velocity",
    use_time_prior=True,
    root_key=None,
    end_key=None,
    threshold_root_end_prior=None,
    minimum_spanning_tree=True,
    copy=False,
):
    """PAGA graph with velocity-directed edges.

    Mapping out the coarse-grained connectivity structures of complex manifolds
    [Wolf19]_. By quantifying the connectivity of partitions (groups, clusters) of
    the single-cell graph, partition-based graph abstraction (PAGA) generates a much
    simpler abstracted graph (*PAGA graph*) of partitions, in which edge weights
    represent confidence in the presence of connections.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        An annotated data matrix. `adata.uns` must contain `'neighbors'`.
    groups : key for categorical in `adata.obs`, optional (default: `None`)
        Categorical annotation of observations to use as partitions. If `None`,
        `'clusters'` is used when present in `adata.obs`, otherwise `'louvain'`
        when present. If `'velocity_clusters'` is requested but missing, it is
        computed first.
    vkey : `str` (default: `'velocity'`)
        Key of the velocity estimates; forwarded to the pseudotime computation
        and to the PAGA tree.
    use_time_prior : `str` or `bool`, optional (default: `True`)
        Obs key for pseudo-time values. Any truthy non-string value selects
        `'velocity_pseudotime'`, which is computed if not yet in `adata.obs`.
    root_key : `str` or `None`, optional (default: `None`)
        Obs key for root states.
    end_key : `str` or `None`, optional (default: `None`)
        Obs key for end states.
    threshold_root_end_prior : `float` or `None` (default: `None`)
        Threshold for root and final states priors, to be in the range of [0,1].
        Values above the threshold will be considered as terminal and included
        as prior.
    minimum_spanning_tree : `bool`, optional (default: `True`)
        Whether to prune the tree such that a path from A-to-B is removed if
        another more confident path exists.
    copy : `bool`, optional (default: `False`)
        Copy `adata` before computation and return a copy.
        Otherwise, perform computation inplace and return `None`.

    Returns
    -------
    connectivities: `.uns['paga']`
        The full adjacency matrix of the abstracted graph, weights correspond to
        confidence in the connectivities of partitions.
    connectivities_tree: `.uns['paga']`
        The adjacency matrix of the tree-like subgraph that best explains the
        topology.
    transitions_confidence: `.uns['paga']`
        The adjacency matrix of the abstracted directed graph, weights correspond
        to confidence in the transitions between partitions.

    Raises
    ------
    ValueError
        If no neighborhood graph is stored in `adata.uns['neighbors']`.
    """
    if "neighbors" not in adata.uns:
        raise ValueError(
            "You need to run `pp.neighbors` first to compute a neighborhood graph."
        )

    if copy:
        adata = adata.copy()
    strings_to_categoricals(adata)

    if groups is None:
        # Fall back to a common clustering key, preferring 'clusters'.
        if "clusters" in adata.obs.keys():
            groups = "clusters"
        elif "louvain" in adata.obs.keys():
            groups = "louvain"
    elif groups == "velocity_clusters" and "velocity_clusters" not in adata.obs.keys():
        velocity_clusters(adata)

    if use_time_prior and not isinstance(use_time_prior, str):
        # A truthy non-string flag means "use the default pseudotime key".
        use_time_prior = "velocity_pseudotime"
        if use_time_prior not in adata.obs.keys():
            velocity_pseudotime(adata, vkey=vkey, root_key=root_key, end_key=end_key)

    # Only priors that actually exist as obs annotations are reported.
    obs_keys = adata.obs.keys()
    priors = [
        candidate
        for candidate in (use_time_prior, root_key, end_key)
        if candidate in obs_keys
    ]
    prior_msg = f"using priors: {priors}" if priors else ""
    logg.info("running PAGA", prior_msg, r=True)

    tree = PAGA_tree(
        adata,
        groups,
        vkey=vkey,
        use_time_prior=use_time_prior,
        root_key=root_key,
        end_key=end_key,
        threshold_root_end_prior=threshold_root_end_prior,
        minimum_spanning_tree=minimum_spanning_tree,
    )

    if "paga" not in adata.uns:
        adata.uns["paga"] = {}

    tree.compute_connectivities()
    adata.uns["paga"]["connectivities"] = tree.connectivities
    adata.uns["paga"]["connectivities_tree"] = tree.connectivities_tree
    adata.uns[f"{groups}_sizes"] = np.array(tree.ns)

    tree.compute_transitions()
    adata.uns["paga"]["transitions_confidence"] = tree.transitions_confidence
    adata.uns["paga"]["threshold"] = tree.threshold
    adata.uns["paga"]["groups"] = groups

    line_end = " " if settings.verbosity > 2 else "\n"
    logg.info(" finished", time=True, end=line_end)
    logg.hint(
        "added\n"
        + " 'paga/connectivities', connectivities adjacency (adata.uns)\n"
        " 'paga/connectivities_tree', connectivities subtree (adata.uns)\n"
        " 'paga/transitions_confidence', velocity transitions (adata.uns)"
    )

    if copy:
        return adata
    return None
def velocity_genes(
    data,
    vkey="velocity",
    min_r2=0.01,
    min_ratio=0.01,
    use_highly_variable=True,
    copy=False,
):
    """Select the genes to be used for velocity analysis.

    Starting from all genes, the selection is narrowed by every criterion whose
    underlying annotation is available: expression in both moment layers,
    coefficient of determination, quantile-regression ratio and high variability.
    If `.var[f'{vkey}_genes']` does not exist yet, velocities are computed first.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name under which to refer to the computed velocities.
    min_r2: `float` or `None` (default: 0.01)
        Minimum threshold for coefficient of determination
        (`.var[f'{vkey}_r2']`). Skipped if `None` or if the column is missing.
    min_ratio: `float` or `None` (default: 0.01)
        Minimum threshold for quantile regression un/spliced ratio
        (`.var[f'{vkey}_qreg_ratio']`). Skipped if `None` or if the column is
        missing.
    use_highly_variable: `bool` (default: True)
        Whether to use highly variable genes only, stored in
        .var['highly_variable'].
    copy: `bool` (default: `False`)
        Return a copy instead of writing to `data`.

    Returns
    -------
    Returns the modified copy if `copy` is set, otherwise `None`; updates
    velocity_genes: `.var`
        boolean mask stored as `.var[f'{vkey}_genes']`, marking the genes to be
        used for further velocity analysis (velocity graph and embedding)
    """
    adata = data.copy() if copy else data
    genes_key = f"{vkey}_genes"

    if genes_key not in adata.var.keys():
        velocity(adata, vkey)

    vgenes = np.ones(adata.n_vars, dtype=bool)

    # Require a positive entry in both moment layers, if both are present.
    moment_layers = ("Ms", "Mu")
    if all(layer in adata.layers.keys() for layer in moment_layers):
        for layer in moment_layers:
            vgenes &= np.max(adata.layers[layer] > 0, 0) > 0

    # Apply every lower-bound threshold whose var column is available.
    lower_bounds = (
        (min_r2, f"{vkey}_r2"),
        (min_ratio, f"{vkey}_qreg_ratio"),
    )
    for threshold, column in lower_bounds:
        if threshold is not None and column in adata.var.keys():
            vgenes &= adata.var[column] > threshold

    if use_highly_variable and "highly_variable" in adata.var.keys():
        vgenes &= adata.var["highly_variable"].values

    if np.sum(vgenes) < 2:
        logg.warn(
            "You seem to have very low signal in splicing dynamics.\n"
            "Consider reducing the thresholds and be cautious with interpretations.\n"
        )

    adata.var[genes_key] = vgenes
    n_selected = np.sum(adata.var[genes_key])
    logg.info("Number of obtained velocity_genes:", n_selected)

    if copy:
        return adata
    return None
def filter_and_normalize(
    data,
    min_counts=None,
    min_counts_u=None,
    min_cells=None,
    min_cells_u=None,
    min_shared_counts=None,
    min_shared_cells=None,
    n_top_genes=None,
    retain_genes=None,
    subset_highly_variable=True,
    flavor="seurat",
    log=True,
    layers_normalize=None,
    copy=False,
    **kwargs,
):
    """Filtering, normalization and log transform

    Expects non-logarithmized data. If using logarithmized data, pass `log=False`.

    Runs the following steps

    .. code:: python

        scv.pp.filter_genes(adata)
        scv.pp.normalize_per_cell(adata)
        if n_top_genes is not None:
            scv.pp.filter_genes_dispersion(adata)
        if log:
            scv.pp.log1p(adata)

    The log transform is only applied when `X` still looks unprocessed, i.e. when
    the sum over its first 10 rows is close to that of the `'spliced'` layer (or
    when there is no `'spliced'` layer to compare against).

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    min_counts: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (spliced).
    min_counts_u: `int` (default: `None`)
        Minimum number of counts required for a gene to pass filtering (unspliced).
    min_cells: `int` (default: `None`)
        Minimum number of cells expressed required to pass filtering (spliced).
    min_cells_u: `int` (default: `None`)
        Minimum number of cells expressed required to pass filtering (unspliced).
    min_shared_counts: `int`, optional (default: `None`)
        Minimum number of counts (both unspliced and spliced) required for a gene.
    min_shared_cells: `int`, optional (default: `None`)
        Minimum number of cells required to be expressed (both unspliced and
        spliced).
    n_top_genes: `int` (default: `None`)
        Number of genes to keep. Dispersion-based filtering is skipped if `None`.
    retain_genes: `list`, optional (default: `None`)
        List of gene names to be retained independent of thresholds.
    subset_highly_variable: `bool` (default: True)
        Whether to subset highly variable genes or to store in
        .var['highly_variable'].
    flavor: {'seurat', 'cell_ranger', 'svr'}, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion.
        If choosing 'seurat', this expects non-logarithmized data.
    log: `bool` (default: `True`)
        Take logarithm.
    layers_normalize: list of `str` (default: None)
        List of layers to be normalized.
        If set to None, the layers {'X', 'spliced', 'unspliced'} are considered for
        normalization upon testing whether they have already been normalized
        (by checking type of entries: int -> unprocessed, float -> processed).
        If given, `enforce=True` is passed to the normalization unless `enforce`
        is supplied explicitly.
    copy: `bool` (default: `False`)
        Return a copy of `adata` instead of updating it.
    **kwargs:
        Keyword arguments passed to pp.normalize_per_cell (e.g. counts_per_cell).

    Returns
    -------
    Returns or updates `adata` depending on `copy`.
    """
    adata = data.copy() if copy else data

    layer_keys = adata.layers.keys()
    if not ("spliced" in layer_keys and "unspliced" in layer_keys):
        logg.warn("Could not find spliced / unspliced counts.")

    gene_filters = {
        "min_counts": min_counts,
        "min_counts_u": min_counts_u,
        "min_cells": min_cells,
        "min_cells_u": min_cells_u,
        "min_shared_counts": min_shared_counts,
        "min_shared_cells": min_shared_cells,
        "retain_genes": retain_genes,
    }
    filter_genes(adata, **gene_filters)

    # Explicitly requested layers are normalized even if they look processed,
    # unless the caller decided on `enforce` themselves.
    if layers_normalize is not None:
        kwargs.setdefault("enforce", True)
    normalize_per_cell(adata, layers=layers_normalize, **kwargs)

    if n_top_genes is not None:
        filter_genes_dispersion(
            adata,
            n_top_genes=n_top_genes,
            retain_genes=retain_genes,
            flavor=flavor,
            subset=subset_highly_variable,
        )

    # X is taken to be non-logarithmized if it still matches the spliced layer.
    if "spliced" in adata.layers.keys():
        x_total = adata.X[:10].sum()
        spliced_total = adata.layers["spliced"][:10].sum()
        log_advised = np.allclose(x_total, spliced_total)
    else:
        log_advised = True

    if log:
        if log_advised:
            log1p(adata)
            logg.info("Logarithmized X.")
        else:
            logg.warn("Did not modify X as it looks preprocessed already.")
    elif log_advised:
        logg.warn(
            "Consider logarithmizing X with `scv.pp.log1p` for better results."
        )

    if copy:
        return adata
    return None
def terminal_states(
    data,
    vkey="velocity",
    modality="Ms",
    groupby=None,
    groups=None,
    self_transitions=False,
    eps=1e-3,
    random_state=0,
    copy=False,
    **kwargs,
):
    """Computes terminal states (root and end points).

    The end points and root cells are obtained as stationary states of the
    velocity-inferred transition matrix and its transposed, respectively,
    which is given by left eigenvectors corresponding to an eigenvalue of 1, i.e.

    .. math::
        μ^{\\textrm{end}}=μ^{\\textrm{end}} \\pi, \\quad
        μ^{\\textrm{root}}=μ^{\\textrm{root}} \\pi^{\\small \\textrm{T}}.

    .. code:: python

        scv.tl.terminal_states(adata)
        scv.pl.scatter(adata, color=['root_cells', 'end_points'])

    .. image::
        https://user-images.githubusercontent.com/31883718/69496183-bcfdf300-0ecf-11ea-9aae-685300a0b1ba.png

    Alternatively, we recommend to use :func:`cellrank.tl.terminal_states`
    providing an improved/generalized approach of identifying terminal states.

    Arguments
    ---------
    data: :class:`~anndata.AnnData`
        Annotated data matrix.
    vkey: `str` (default: `'velocity'`)
        Name of velocity estimates to be used.
    modality: `str` (default: `'Ms'`)
        Layer used to calculate terminal states; forwarded to the root
        verification step.
    groupby: `str`, `list` or `np.ndarray` (default: `None`)
        Key of observations grouping to consider. Only to be set, if each group is
        assumed to have a distinct lineage with an independent root and end point.
        If set while `groups` is `None`, every category of `.obs[groupby]` is
        processed separately.
    groups: `str`, `list` or `np.ndarray` (default: `None`)
        Groups selected to find terminal states on. Must be an element of
        .obs[groupby]. To be specified only for very distinct/disconnected clusters.
    self_transitions: `bool` (default: `False`)
        Allow transitions from one node to itself.
    eps: `float` (default: 1e-3)
        Tolerance for eigenvalue selection.
    random_state: `int` or None (default: 0)
        Seed used by the random number generator.
        If `None`, use the `RandomState` instance by `np.random`.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to data.
    **kwargs:
        Passed to scvelo.tl.transition_matrix(), e.g. basis, weight_diffusion.

    Returns
    -------
    root_cells: `.obs`
        values written under `'root_cells'` for the processed cells
        (presumably a per-cell score in [0, 1] after scaling -- confirm against
        `scale` / `write_to_obs`).
    end_points: `.obs`
        values written under `'end_points'` for the processed cells
        (same caveat as above).
    """  # noqa E501
    adata = data.copy() if copy else data
    verify_neighbors(adata)

    logg.info("computing terminal states", r=True)

    strings_to_categoricals(adata)
    if groupby is not None:
        logg.warn(
            "Only set groupby, when you have evident distinct clusters/lineages,"
            " each with an own root and end point."
        )

    # `self_transitions` travels to transition_matrix via kwargs.
    kwargs.update({"self_transitions": self_transitions})
    # Single pass over the (optionally group-restricted) data by default;
    # one pass per category when only `groupby` is given.
    categories = [None]
    if groupby is not None and groups is None:
        categories = adata.obs[groupby].cat.categories
    for cat in categories:
        # NOTE: rebinds the `groups` parameter to the current category.
        groups = cat if cat is not None else groups
        cell_subset = groups_to_bool(adata, groups=groups, groupby=groupby)
        _adata = adata if groups is None else adata[cell_subset]
        connectivities = get_connectivities(_adata, "distances")

        # Root cells: computed from the backward transition matrix.
        T = transition_matrix(_adata, vkey=vkey, backward=True, **kwargs)
        eigvecs_roots = eigs(T, eps=eps, perc=[2, 98], random_state=random_state)[1]
        roots = csr_matrix.dot(connectivities, eigvecs_roots).sum(1)
        # Clip at the 98th percentile before scaling.
        roots = scale(np.clip(roots, 0, np.percentile(roots, 98)))
        roots = verify_roots(_adata, roots, modality)
        write_to_obs(adata, "root_cells", roots, cell_subset)

        # End points: same procedure on the forward transition matrix,
        # without the verification step.
        T = transition_matrix(_adata, vkey=vkey, backward=False, **kwargs)
        eigvecs_ends = eigs(T, eps=eps, perc=[2, 98], random_state=random_state)[1]
        ends = csr_matrix.dot(connectivities, eigvecs_ends).sum(1)
        ends = scale(np.clip(ends, 0, np.percentile(ends, 98)))
        write_to_obs(adata, "end_points", ends, cell_subset)

        # The number of eigenvectors returned is reported as the number of regions.
        n_roots, n_ends = eigvecs_roots.shape[1], eigvecs_ends.shape[1]
        groups_str = f" ({groups})" if isinstance(groups, str) else ""
        roots_str = f"{n_roots} {'regions' if n_roots > 1 else 'region'}"
        ends_str = f"{n_ends} {'regions' if n_ends > 1 else 'region'}"

        logg.info(
            f" identified {roots_str} of root cells "
            f"and {ends_str} of end points {groups_str}."
        )

    logg.info(" finished", time=True, end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added\n"
        " 'root_cells', root cells of Markov diffusion process (adata.obs)\n"
        " 'end_points', end points of Markov diffusion process (adata.obs)"
    )
    return adata if copy else None