def get_neighbors(
    data: AnnData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------
    data : `AnnData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None`, use data.X.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to all available threads.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible.
        If not full_speed, use only one thread to make sure results are reproducible.

    Returns
    -------
    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(adata)
    """
    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=effective_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances
def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------
    data : `pegasusio.MultimodalData`
        A MultimodalData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None`, use data.X.
    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible.
        If not full_speed, use only one thread to make sure results are reproducible.
    use_cache: `bool`, optional (default: True)
        If use_cache and cached kNN results are found, do not recompute.
    dist: `str`, optional (default: 'l2')
        Distance metric to use. By default, use squared L2 distance. Available options: inner product 'ip' or cosine similarity 'cosine'.

    Returns
    -------
    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """
    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if use_cache and knn_is_cached(data, indices_key, distances_key, K):
        indices = data.obsm[indices_key]
        distances = data.obsm[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep, n_comps),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
            dist=dist,
        )
        data.obsm[indices_key] = indices
        data.register_attr(indices_key, "knn")
        data.obsm[distances_key] = distances
        data.register_attr(distances_key, "knn")

    return indices, distances
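
# A minimal sketch of the cache contract assumed by get_neighbors above
# (hypothetical helper, not the actual pegasus knn_is_cached implementation):
# the stored arrays exclude each point itself, so a cached result covering
# at least K - 1 neighbor columns is reusable for a K-neighbor query. This
# matches the "shape[1] + 1" arithmetic used by umap() further below.
def _knn_results_reusable(data, indices_key, distances_key, K):
    return (
        indices_key in data.obsm
        and distances_key in data.obsm
        and data.obsm[indices_key].shape[1] + 1 >= K
    )
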
def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct the Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using a deep learning model to improve the speed. Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        Number of ForceAtlas2 iterations used to polish the coordinates after running the deep regressor to predict new coordinates.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_diffmap_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.
    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X, K=K, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(
        X, X_fle, X_full[~selected, :], net_alpha, random_state, verbose=True
    )

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )
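
# Hedged sketch of the train-and-predict step shared by the net_* functions:
# fit an MLPRegressor (the model named in the docstrings) on the embedded
# subsample, then predict coordinates for the remaining cells. The function
# name, hidden layer sizes, and other hyperparameters here are illustrative
# assumptions, not the values hard-coded in net_train_and_predict.
from sklearn.neural_network import MLPRegressor

def _sketch_net_train_and_predict(X_train, Y_train, X_pred, alpha, random_state):
    regressor = MLPRegressor(
        hidden_layer_sizes=(100, 70),  # assumed sizes, for illustration only
        alpha=alpha,                   # L2 penalty, i.e. the net_alpha parameter
        random_state=random_state,
    )
    regressor.fit(X_train, Y_train)    # MLPRegressor supports multi-output targets
    return regressor.predict(X_pred)
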
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximated UMAP embedding using a deep learning model to improve the speed. Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_rate: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use this learning rate to polish the coordinates.

    polish_n_epochs: ``int``, optional, default: ``30``
        Number of epochs for polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.
    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(
        data.uns[ds_indices_key][:, 0:n_neighbors - 1], 0, range(X.shape[0]), axis=1
    )
    knn_dists = np.insert(
        data.uns[ds_distances_key][:, 0:n_neighbors - 1], 0, 0.0, axis=1
    )

    X_umap = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(
        X, X_umap, X_full[~selected, :], net_alpha, random_state, verbose=True
    )

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices = np.insert(
        data.uns[indices_key][:, 0:n_neighbors - 1], 0, range(data.shape[0]), axis=1
    )
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X_full,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
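
# Hedged sketch of the density-based down-sampling idea behind select_cells
# (a hypothetical stand-in, not the pegasus implementation): treat each cell's
# distance to its farthest stored neighbor as a local radius, then keep cells
# with probability proportional to radius ** alpha, so sparse regions are
# preferentially retained in the subsample.
import numpy as np

def _sketch_select_cells(knn_dists, frac, alpha=1.0, random_state=0):
    rng = np.random.default_rng(random_state)
    radii = knn_dists[:, -1]                 # distance to farthest stored neighbor
    prob = radii ** alpha
    prob = prob / prob.sum()
    n_select = int(knn_dists.shape[0] * frac)
    chosen = rng.choice(knn_dists.shape[0], size=n_select, replace=False, p=prob)
    selected = np.zeros(knn_dists.shape[0], dtype=bool)
    selected[chosen] = True
    return selected
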
def net_tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_learning_frac: float = 0.33,
    polish_n_iter: int = 150,
    out_basis: str = "net_tsne",
) -> None:
    """Calculate Net-tSNE embedding of cells.

    Net-tSNE is an approximated tSNE embedding using a deep learning model to improve the calculation speed. Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells (``n_obs``) and columns for genes (``n_feature``).

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``0.33``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``150``
        Number of iterations for polishing tSNE run.

    out_basis: ``str``, optional, default: ``"net_tsne"``
        Key name for the approximated tSNE coordinates calculated.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net tSNE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.
    Examples
    --------
    >>> pg.net_tsne(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    X_tsne = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_tsne
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_tsne
    Y_init[~selected, :] = net_train_and_predict(
        X, X_tsne, X_full[~selected, :], net_alpha, random_state, verbose=True
    )

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    polish_learning_rate = polish_learning_frac * data.shape[0]
    data.obsm["X_" + out_basis] = calc_tsne(
        X_full,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        polish_learning_rate,
        random_state,
        init=Y_init,
        n_iter=polish_n_iter,
        n_iter_early_exag=0,
    )
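
# The polish step of net_tsne derives its learning rate from the data size:
# polish_learning_rate = polish_learning_frac * n_obs. A worked example with
# the default polish_learning_frac of 0.33 on 100,000 cells:
#
# >>> round(0.33 * 100_000)
# 33000
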
def umap( data: MultimodalData, rep: str = "pca", n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.5, spread: float = 1.0, random_state: int = 0, out_basis: str = "umap", ) -> None: """Calculate UMAP embedding of cells. This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP. .. _umap-learn: https://github.com/lmcinnes/umap Parameters ---------- data: ``pegasusio.MultimodalData`` Annotated data matrix with rows for cells and columns for genes. rep: ``str``, optional, default: ``"pca"`` Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``. n_components: ``int``, optional, default: ``2`` Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization. n_neighbors: ``int``, optional, default: ``15`` Number of nearest neighbors considered during the computation. min_dist: ``float``, optional, default: ``0.5`` The effective minimum distance between embedded data points. spread: ``float``, optional, default: ``1.0`` The effective scale of embedded data points. random_state: ``int``, optional, default: ``0`` Random seed set for reproducing results. out_basis: ``str``, optional, default: ``"umap"`` Key name for calculated UMAP coordinates to store. Returns ------- ``None`` Update ``data.obsm``: * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data. Examples -------- >>> pg.umap(data) """ start = time.time() rep = update_rep(rep) indices_key = rep + "_knn_indices" distances_key = rep + "_knn_distances" X = X_from_rep(data, rep) if not knn_is_cached(data, indices_key, distances_key, n_neighbors): if indices_key in data.uns and n_neighbors > data.uns[ indices_key].shape[1] + 1: logger.warning( f"Reduce K for neighbors in UMAP from {n_neighbors} to {data.uns[indices_key].shape[1] + 1}" ) n_neighbors = data.uns[indices_key].shape[1] + 1 else: raise ValueError("Please run neighbors first!") knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1], 0, range(data.shape[0]), axis=1) knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1], 0, 0.0, axis=1) data.obsm["X_" + out_basis] = calc_umap( X, n_components, n_neighbors, min_dist, spread, random_state, knn_indices=knn_indices, knn_dists=knn_dists, ) end = time.time() logger.info("UMAP is calculated. Time spent = {:.2f}s.".format(end - start))
def fitsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    out_basis: str = "fitsne",
) -> None:
    """Calculate FIt-SNE embedding of cells.

    This function uses fitsne_ package. See [Linderman19]_ for details on FIt-SNE.

    .. _fitsne: https://github.com/KlugerLab/FIt-SNE

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated FI-tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fitsne"``
        Key name for calculated FI-tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FI-tSNE coordinates of the data.

    Examples
    --------
    >>> pg.fitsne(data)
    """
    rep = update_rep(rep)
    n_jobs = effective_n_jobs(n_jobs)

    data.obsm["X_" + out_basis] = calc_fitsne(
        X_from_rep(data, rep),
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )
def umap( data: MultimodalData, rep: str = "pca", n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.5, spread: float = 1.0, n_jobs: int = -1, full_speed: bool = False, random_state: int = 0, out_basis: str = "umap", ) -> None: """Calculate UMAP embedding of cells. This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP. .. _umap-learn: https://github.com/lmcinnes/umap Parameters ---------- data: ``pegasusio.MultimodalData`` Annotated data matrix with rows for cells and columns for genes. rep: ``str``, optional, default: ``"pca"`` Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``. n_components: ``int``, optional, default: ``2`` Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization. n_neighbors: ``int``, optional, default: ``15`` Number of nearest neighbors considered during the computation. min_dist: ``float``, optional, default: ``0.5`` The effective minimum distance between embedded data points. spread: ``float``, optional, default: ``1.0`` The effective scale of embedded data points. n_jobs: ``int``, optional, default: ``-1`` Number of threads to use for computing kNN graphs. If ``-1``, use all physical CPU cores. full_speed: ``bool``, optional, default: ``False`` * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. random_state: ``int``, optional, default: ``0`` Random seed set for reproducing results. out_basis: ``str``, optional, default: ``"umap"`` Key name for calculated UMAP coordinates to store. Returns ------- ``None`` Update ``data.obsm``: * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data. Examples -------- >>> pg.umap(data) """ rep = update_rep(rep) X = X_from_rep(data, rep) if data.shape[0] < n_neighbors: logger.warning( f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}." ) n_neighbors = data.shape[0] knn_indices, knn_dists = get_neighbors(data, K=n_neighbors, rep=rep, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed) knn_indices = np.insert(knn_indices[:, 0:n_neighbors - 1], 0, range(data.shape[0]), axis=1) knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1) data.obsm["X_" + out_basis] = calc_umap( X, n_components, n_neighbors, min_dist, spread, random_state, knn_indices=knn_indices, knn_dists=knn_dists, )
def tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = "auto",
    initialization: str = "pca",
    random_state: int = 0,
    out_basis: str = "tsne",
) -> None:
    """Calculate t-SNE embedding of cells using the FIt-SNE package.

    This function uses fitsne_ package. See [Linderman19]_ for details on the FIt-SNE algorithm.

    .. _fitsne: https://github.com/KlugerLab/FIt-SNE

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated FI-tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``auto``
        By default, the learning rate is determined automatically as max(data.shape[0] / early_exaggeration, 200). See [Belkina19]_ and [Kobak19]_ for details.

    initialization: ``str``, optional, default: ``pca``
        Initialization can be either ``pca``, ``random``, or an np.ndarray. By default, we use ``pca`` initialization according to [Kobak19]_.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"tsne"``
        Key name for calculated FI-tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FI-tSNE coordinates of the data.

    Examples
    --------
    >>> pg.tsne(data)
    """
    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)
    X = X_from_rep(data, rep).astype(np.float64)

    if learning_rate == "auto":
        learning_rate = max(X.shape[0] / early_exaggeration, 200.0)

    if initialization == "random":
        initialization = None
    elif initialization == "pca":
        if rep == "pca":
            initialization = X[:, 0:n_components].copy()
        else:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=n_components, random_state=random_state)
            with threadpool_limits(limits=n_jobs):
                initialization = np.ascontiguousarray(pca.fit_transform(X))
        initialization = initialization / np.std(initialization[:, 0]) * 0.0001
    else:
        assert (
            isinstance(initialization, np.ndarray)
            and initialization.ndim == 2
            and initialization.shape[0] == X.shape[0]
            and initialization.shape[1] == n_components
        )
        if initialization.dtype != np.float64:
            initialization = initialization.astype(np.float64)

    data.obsm["X_" + out_basis] = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
        initialization,
    )
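
# Worked sketch of the two tsne() heuristics above: the automatic learning
# rate max(n_obs / early_exaggeration, 200), and the rescaling of the PCA
# initialization so that its first column has standard deviation 1e-4. The
# helper name is ours, for illustration only.
def _tsne_init_sketch(X, n_components=2, early_exaggeration=12):
    # e.g. 120,000 cells with the default early_exaggeration of 12 -> 10000.0
    learning_rate = max(X.shape[0] / early_exaggeration, 200.0)
    initialization = X[:, 0:n_components].astype(np.float64).copy()
    initialization = initialization / np.std(initialization[:, 0]) * 0.0001
    return learning_rate, initialization
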
def run_harmony(
    data: MultimodalData,
    rep: str = 'pca',
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of the Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use for the KMeans clustering used in Harmony. ``-1`` refers to using all available threads.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm. This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not is_categorical_dtype(data.obs['Channel']):
        data.obs['Channel'] = pd.Categorical(data.obs['Channel'])
    if data.obs['Channel'].cat.categories.size == 1:
        logger.warning("Warning: data only contains 1 channel. Cannot apply Harmony!")
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        print(f"ERROR: {e}")
        print("ERROR: Need Harmony! Try 'pip install harmony-pytorch'.")
        import sys
        sys.exit(-1)

    logger.info("Start integration using Harmony.")
    out_rep = rep + '_harmony'
    data.obsm['X_' + out_rep] = harmonize(
        X_from_rep(data, rep),
        data.obs,
        'Channel',
        n_clusters=n_clusters,
        n_jobs_kmeans=n_jobs,
        random_state=random_state,
    )
    return out_rep
def tsne(
    data: AnnData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    out_basis: str = "tsne",
) -> None:
    """Calculate tSNE embedding using MulticoreTSNE_ package.

    .. _MulticoreTSNE: https://github.com/DmitryUlyanov/Multicore-TSNE

    Parameters
    ----------
    data: ``anndata.AnnData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"tsne"``
        Key name for calculated tSNE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: tSNE coordinates of the data.

    Examples
    --------
    >>> pg.tsne(adata)
    """
    start = time.time()
    rep = update_rep(rep)
    n_jobs = effective_n_jobs(n_jobs)

    data.obsm["X_" + out_basis] = calc_tsne(
        X_from_rep(data, rep),
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )

    end = time.time()
    logger.info("t-SNE is calculated. Time spent = {:.2f}s.".format(end - start))
def run_harmony(
    data: Union[MultimodalData, UnimodalData],
    batch: str = "Channel",
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    n_clusters: int = None,
    random_state: int = 0,
    use_gpu: bool = False,
    max_iter_harmony: int = 10,
) -> str:
    """Batch correction on PCs using Harmony.

    This is a wrapper of `harmony-pytorch <https://github.com/lilab-bcb/harmony-pytorch>`_ package, which is a Pytorch implementation of the Harmony algorithm [Korsunsky19]_.

    Parameters
    ----------
    data: ``MultimodalData``.
        Annotated data matrix with rows for cells and columns for genes.

    batch: ``str``, optional, default: ``"Channel"``.
        Which attribute in data.obs field represents batches, default is "Channel".

    rep: ``str``, optional, default: ``"pca"``.
        Which representation to use as input of Harmony, default is PCA.

    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.

    n_jobs : ``int``, optional, default: ``-1``.
        Number of threads to use in Harmony. ``-1`` refers to using all physical CPU cores.

    n_clusters: ``int``, optional, default: ``None``.
        Number of Harmony clusters. Default is ``None``, which asks Harmony to estimate this number from the data.

    random_state: ``int``, optional, default: ``0``.
        Seed for random number generator.

    use_gpu: ``bool``, optional, default: ``False``.
        If ``True``, use GPU if available. Otherwise, use CPU only.

    max_iter_harmony: ``int``, optional, default: ``10``.
        Maximum iterations on running Harmony if not converged.

    Returns
    -------
    out_rep: ``str``
        The keyword in ``data.obsm`` referring to the embedding calculated by Harmony algorithm. This keyword is ``rep + '_harmony'``, where ``rep`` is the input parameter above.

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_rep]``: The embedding calculated by Harmony algorithm.

    Examples
    --------
    >>> pg.run_harmony(data, rep = "pca", n_jobs = 10, random_state = 25)
    """
    if not check_batch_key(data, batch, "Cannot apply Harmony!"):
        return rep

    try:
        from harmony import harmonize
    except ImportError as e:
        import sys
        logger.error(f"{e}\nNeed Harmony! Try 'pip install harmony-pytorch'.")
        sys.exit(-1)

    logger.info("Start integration using Harmony.")
    out_rep = rep + "_harmony"
    data.obsm["X_" + out_rep] = harmonize(
        X_from_rep(data, rep, n_comps),
        data.obs,
        batch,
        n_clusters=n_clusters,
        n_jobs=n_jobs,
        random_state=random_state,
        use_gpu=use_gpu,
        max_iter_harmony=max_iter_harmony,
    )
    return out_rep
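
# Since run_harmony stores the corrected embedding under rep + '_harmony'
# rather than overwriting the input representation, downstream steps must be
# pointed at the returned key explicitly. A hedged usage sketch in the style
# of the docstring examples, assuming the usual pegasus workflow where
# neighbors() and umap() accept a rep argument:
#
# >>> out_rep = pg.run_harmony(data, batch="Channel", rep="pca")  # "pca_harmony"
# >>> pg.neighbors(data, rep=out_rep)
# >>> pg.umap(data, rep=out_rep)
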
def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    densmap: bool = False,
    dens_lambda: float = 2.0,
    dens_frac: float = 0.3,
    dens_var_shift: float = 0.1,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximated UMAP embedding using a deep learning model to improve the speed. Specifically, the deep model used is MLPRegressor_, the *scikit-learn* implementation of the Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    densmap: ``bool``, optional, default: ``False``
        Whether the density-augmented objective of densMAP should be used for optimization, which will generate an embedding where local densities are encouraged to be correlated with those in the original space.

    dens_lambda: ``float``, optional, default: ``2.0``
        Controls the regularization weight of the density correlation term in densMAP. Only works when *densmap* is ``True``. Larger values prioritize density preservation over the UMAP objective, while values closer to 0 prioritize the UMAP objective instead. Notice that setting this parameter to ``0`` is equivalent to running the original UMAP algorithm.

    dens_frac: ``float``, optional, default: ``0.3``
        Controls the fraction of epochs (between 0 and 1) where the density-augmented objective is used in densMAP. Only works when *densmap* is ``True``. The first ``(1 - dens_frac)`` fraction of epochs optimize the original UMAP objective before introducing the density correlation term.

    dens_var_shift: ``float``, optional, default: ``0.1``
        A small constant added to the variance of local radii in the embedding when calculating the density correlation objective to prevent numerical instability from dividing by a small number. Only works when *densmap* is ``True``.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.
    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_rate: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use this learning rate to polish the coordinates.

    polish_n_epochs: ``int``, optional, default: ``30``
        Number of epochs for polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """
    rep = update_rep(rep)
    n_jobs = eff_n_jobs(n_jobs)

    knn_indices, knn_dists = get_neighbors(
        data, K=select_K, rep=rep, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
    )

    selected = select_cells(
        knn_dists,
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(
        data.uns[ds_indices_key][:, 0:n_neighbors - 1], 0, range(X.shape[0]), axis=1
    )
    knn_dists = np.insert(
        data.uns[ds_distances_key][:, 0:n_neighbors - 1], 0, 0.0, axis=1
    )

    X_umap = calc_umap(
        X,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(
        X, X_umap, X_full[~selected, :], net_alpha, n_jobs, random_state, verbose=True
    )

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices, knn_dists = get_neighbors(
        data, K=n_neighbors, rep=rep, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
    )
    knn_indices = np.insert(
        knn_indices[:, 0:n_neighbors - 1], 0, range(data.shape[0]), axis=1
    )
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    key = f"X_{out_basis}"
    data.obsm[key] = calc_umap(
        X_full,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
    data.register_attr(key, "basis")
def umap( data: MultimodalData, rep: str = "pca", rep_ncomps: int = None, n_components: int = 2, n_neighbors: int = 15, min_dist: float = 0.5, spread: float = 1.0, densmap: bool = False, dens_lambda: float = 2.0, dens_frac: float = 0.3, dens_var_shift: float = 0.1, n_jobs: int = -1, full_speed: bool = False, random_state: int = 0, out_basis: str = "umap", ) -> None: """Calculate UMAP embedding of cells. This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP. .. _umap-learn: https://github.com/lmcinnes/umap Parameters ---------- data: ``pegasusio.MultimodalData`` Annotated data matrix with rows for cells and columns for genes. rep: ``str``, optional, default: ``"pca"`` Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``. rep_ncomps: `int`, optional (default: None) Number of components to be used in `rep`. If rep_ncomps == None, use all components; otherwise, use the minimum of rep_ncomps and rep's dimensions. n_components: ``int``, optional, default: ``2`` Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization. n_neighbors: ``int``, optional, default: ``15`` Number of nearest neighbors considered during the computation. min_dist: ``float``, optional, default: ``0.5`` The effective minimum distance between embedded data points. spread: ``float``, optional, default: ``1.0`` The effective scale of embedded data points. densmap: ``bool``, optional, default: ``False`` Whether the density-augmented objective of densMAP should be used for optimization, which will generate an embedding where local densities are encouraged to be correlated with those in the original space. dens_lambda: ``float``, optional, default: ``2.0`` Controls the regularization weight of the density correlation term in densMAP. Only works when *densmap* is ``True``. Larger values prioritize density preservation over the UMAP objective, while values closer to 0 for the opposite direction. Notice that setting this parameter to ``0`` is equivalent to running the original UMAP algorithm. dens_frac: ``float``, optional, default: ``0.3`` Controls the fraction of epochs (between 0 and 1) where the density-augmented objective is used in densMAP. Only works when *densmap* is ``True``. The first ``(1 - dens_frac)`` fraction of epochs optimize the original UMAP objective before introducing the density correlation term. dens_var_shift: ``float``, optional, default, ``0.1`` A small constant added to the variance of local radii in the embedding when calculating the density correlation objective to prevent numerical instability from dividing by a small number. Only works when *densmap* is ``True``. n_jobs: ``int``, optional, default: ``-1`` Number of threads to use for computing kNN graphs. If ``-1``, use all physical CPU cores. full_speed: ``bool``, optional, default: ``False`` * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible. * Otherwise, use only one thread to make sure results are reproducible. random_state: ``int``, optional, default: ``0`` Random seed set for reproducing results. out_basis: ``str``, optional, default: ``"umap"`` Key name for calculated UMAP coordinates to store. Returns ------- ``None`` Update ``data.obsm``: * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data. 
    Examples
    --------
    >>> pg.umap(data)
    """
    rep = update_rep(rep)
    X = X_from_rep(data, rep, rep_ncomps)

    if data.shape[0] < n_neighbors:
        logger.warning(
            f"Warning: Number of samples = {data.shape[0]} < K = {n_neighbors}!\n Set K to {data.shape[0]}."
        )
        n_neighbors = data.shape[0]

    knn_indices, knn_dists = get_neighbors(
        data, K=n_neighbors, rep=rep, n_jobs=n_jobs, random_state=random_state, full_speed=full_speed
    )
    knn_indices = np.insert(
        knn_indices[:, 0:n_neighbors - 1], 0, range(data.shape[0]), axis=1
    )
    knn_dists = np.insert(knn_dists[:, 0:n_neighbors - 1], 0, 0.0, axis=1)

    key = f"X_{out_basis}"
    data.obsm[key] = calc_umap(
        X,
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        spread=spread,
        densmap=densmap,
        dens_lambda=dens_lambda,
        dens_frac=dens_frac,
        dens_var_shift=dens_var_shift,
        random_state=random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )
    data.register_attr(key, "basis")
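
# Tiny demonstration of the knn_indices / knn_dists construction used by the
# UMAP functions above: the cached arrays exclude each point itself, so each
# point's own index is re-inserted as column 0 at distance 0.0 before the
# graph is handed to calc_umap. Toy values, for illustration only.
#
# >>> cached_indices = np.array([[1, 2], [0, 2], [0, 1]])   # 3 cells, 2 neighbors each
# >>> np.insert(cached_indices[:, 0:1], 0, range(3), axis=1)
# array([[0, 1],
#        [1, 0],
#        [2, 0]])
# >>> cached_dists = np.array([[0.5, 0.9], [0.5, 0.7], [0.7, 0.9]])
# >>> np.insert(cached_dists[:, 0:1], 0, 0.0, axis=1)
# array([[0. , 0.5],
#        [0. , 0.5],
#        [0. , 0.7]])
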