def random_subsample(adata, frac=.5):
    """Draw a random subset of cells and recompute neighbors and moments on it.

    Each observation is kept independently with probability `frac`, so the
    size of the subset varies around ``frac * adata.n_obs``.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix. The boolean selection mask is written to
        ``adata.obs['subset']``.
    frac: `float` (default: 0.5)
        Probability with which each observation is kept.

    Returns
    -------
    A copy of `adata` restricted to the selected observations, on which
    `neighbors` and `moments` have been called.
    """
    # Keep the per-cell boolean mask itself. Reducing it with `.sum()` would
    # collapse it to the number of selected cells, and indexing `adata` with
    # that integer picks a single observation instead of the random subset.
    subset = np.random.choice(
        [True, False], size=adata.n_obs, p=[frac, 1 - frac])
    adata.obs['subset'] = subset

    adata_subset = adata[subset].copy()
    neighbors(adata_subset)
    moments(adata_subset)
    return adata_subset
def moments(adata, n_neighbors=30, n_pcs=30, mode='connectivities', renormalize=False, copy=False):
    """Computes first order moments for velocity estimation.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    n_neighbors: `int` (default: 30)
        Number of neighbors to use. The neighbor graph is recomputed when
        none is stored or the stored one was built with fewer neighbors.
    n_pcs: `int` (default: 30)
        Number of principal components to use. PCA is recomputed when the
        neighbor graph has to be rebuilt and `X_pca` is missing or has
        fewer components.
    mode: `'connectivities'` or `'distances'` (default: `'connectivities'`)
        Entry of `adata.uns['neighbors']` used for moment computation.
    renormalize: `bool` (default: `False`)
        Renormalize the moments by total counts per cell to its median.
    copy: `bool` (default: `False`)
        Return a copy instead of writing to adata.

    Returns
    -------
    Returns or updates `adata` with the attributes
    Ms: `.layers`
        dense matrix with first order moments of spliced counts.
    Mu: `.layers`
        dense matrix with first order moments of unspliced counts.

    Raises
    ------
    ValueError
        If `mode` is not a key of `adata.uns['neighbors']`.
    """
    # Honour the documented `copy` contract: with copy=True the caller's
    # object must stay untouched and all results go to a fresh copy.
    adata = adata.copy() if copy else adata

    # `.keys()` is used for the membership tests on purpose: it behaves the
    # same for dict-like and array-like `.uns` / `.obsm` containers.
    if 'neighbors' not in adata.uns.keys() \
            or n_neighbors > adata.uns['neighbors']['params']['n_neighbors']:
        from scanpy.api.pp import neighbors, pca
        if 'X_pca' not in adata.obsm.keys() or n_pcs > adata.obsm['X_pca'].shape[1]:
            pca(adata, n_comps=n_pcs, svd_solver='arpack')
        neighbors(adata, n_neighbors=n_neighbors, use_rep='X_pca')

    if mode not in adata.uns['neighbors']:
        raise ValueError('mode can only be \'connectivities\' or \'distances\'')

    logg.info('computing moments', r=True)

    normalize_layers(adata)
    connectivities = get_connectivities(adata, mode)

    # Sparse graph times sparse counts; `.toarray()` is the explicit form of
    # the deprecated `.A` shorthand and yields the same dense ndarray.
    adata.layers['Ms'] = csr_matrix.dot(
        connectivities, csr_matrix(adata.layers['spliced'])).toarray()
    adata.layers['Mu'] = csr_matrix.dot(
        connectivities, csr_matrix(adata.layers['unspliced'])).toarray()
    if renormalize:
        normalize_layers(adata, layers={'Ms', 'Mu'})

    logg.info(' finished', time=True, end=' ' if settings.verbosity > 2 else '\n')
    logg.hint(
        'added to `.layers`\n'
        ' \'Ms\', moments of spliced abundances\n'
        ' \'Mu\', moments of unspliced abundances')

    return adata if copy else None
def pca_neighbors_umap(adata, results_folder, nrpcs=50, nrpcs_neigh=None, nrneigh=10, method='NULL'):
    """Run PCA, nearest-neighbor search and UMAP, then export the metadata.

    The PCA coordinates are multiplied by -1 to match Seurat. A figure with
    the cumulative explained variance and the PCA plot is saved to
    ``<results_folder>/figures/PCA.png``. Afterwards the metadata (3 PCs and
    the UMAP coordinates) is exported via `export_metadata`.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be processed and exported; it is modified
        in place.
    results_folder: `str`
        path to the results folder
    nrpcs: int | nrpcs = 50
        number of principal components to calculate
    nrpcs_neigh: int | nrpcs_neigh = None
        number of principal components to use for nearest neighbor
        calculation. When set to None the number is chosen automatically.
        For .n_vars < 50, .X is used, otherwise `X_pca` is used with
        50 components.
    nrneigh: int | nrneigh = 10
        number of neighbors to use for nearest neighbor calculation
    method: `str`
        Method for nearest neighbor calculation. Can be set to 'NULL' or
        'bbknn'. With 'bbknn' the neighbors are only computed when
        `adata.obs` has a 'batch' column.

    returns
    -------
    The same `adata` object that was passed in.
    """
    start = time()
    random_state = 0
    print('Using random_state = 0 for all the following calculations')

    # PCA
    sc_pca(adata, svd_solver='arpack', random_state=random_state, n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
    print(
        "PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output."
    )

    # generate plot of PCA
    fig, (ax1, ax2) = subplots(ncols=2, nrows=1)
    fig.set_figwidth(12)
    fig.set_figheight(6)
    fig.tight_layout(pad=4.5)

    cumulative_variance = cumsum(adata.uns['pca']['variance_ratio'])
    # Derive the x axis from the components that were actually returned so
    # that x and y always have matching lengths.
    x = list(range(len(cumulative_variance)))
    ax1.scatter(x=x, y=cumulative_variance)
    ax1.set_ylabel('cumulative explained variance')
    ax1.set_xlabel('PCA components')
    ax1.set_title('cumulative explained variance (as ratio)')
    sc_pl_pca(adata, ax=ax2)
    fig.savefig(join(results_folder, 'figures', 'PCA.png'))

    # neighbors
    if method == 'bbknn':
        if 'batch' in adata.obs.columns:
            bbknn.bbknn(adata)
        else:
            # Nothing is computed in this case; say so instead of skipping
            # silently, since the UMAP below depends on a neighbor graph.
            logging.warning(
                "method 'bbknn' was requested but adata.obs has no 'batch' "
                "column; nearest neighbors were not calculated.")
    else:
        neighbors(adata,
                  n_neighbors=nrneigh,
                  random_state=random_state,
                  n_pcs=nrpcs_neigh)
    print('Nearest neighbors calculated with n_neighbors = ' + str(nrneigh))
    if nrpcs_neigh == 0:
        print('Using .X to calculate nearest neighbors instead of PCs.')
        logging.info(
            'Neighborhood analysis performed with .X instead of PCs.')

    # umap
    sc_umap(adata, random_state=random_state)
    print('UMAP coordinates calculated.')
    logging.info('Neighborhood analysis completed, and UMAP generated.')
    logging.info(
        '\t Time for PCA, nearest neighbor calculation and UMAP generation: '
        + str(round(time() - start, 3)) + 's')

    # export metadata
    start = time()
    export_metadata(adata,
                    basepath=results_folder,
                    n_pcs=3,
                    umap=True,
                    tsne=False)
    logging.info(
        'Metadata containing 3 PCAs and UMAP coordinates exported successfully to file.'
    )
    logging.info('Time for export: ' + str(round(time() - start, 3)) + 's')

    return adata
def recluster(adata, celltype, celltype_label='leiden', min_mean=0.0125, max_mean=4, min_disp=0.5, resolution=1.0, regress_out_key=None, random_seed=0, show_plot_filter=False, method='leiden'):
    """ Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset. This datasubset is initialized with the raw data contained
    in adata.raw. New highly variable genes are selected and a new
    clustering is performed. The function returns the adata subset with the
    new clustering annotation.

    This can be performed on leiden clusters by setting
    celltype_label = 'leiden' and passing the clusters that are to be
    selected for reclustering as a string or a tuple/list of strings to the
    parameter celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more
        than one is to be selected please pass them as a tuple (a list is
        accepted as well).
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with
        the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered
        highly variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly
        variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly
        variable
    resolution: `float` | default = 1.0
        resolution passed on to the clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should
        be regressed out before performing clustering. If None then no
        regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA,
        clustering and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results
        for highly variable gene detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset.
        Possible:louvain/leiden

    Returns
    -------
    AnnData object containing the subcluster annotated with PCA, nearest
    neighbors, leiden or louvain clusters, and UMAP coordinates.

    Raises
    ------
    ValueError
        If `method` is neither 'leiden' nor 'louvain'.
    KeyError
        If `celltype_label` is not a column of `adata.obs`.
    SystemExit
        If `celltype` is neither a string nor a tuple/list.

    Examples
    --------
    For a more detailed example of the entire reclustering process please
    refer to the code examples.

    >>> import besca as bc
    >>> import scanpy.api as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])
    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    labels = adata.obs.get(celltype_label)
    if labels is None:
        # `.get` returns None for an unknown column, which would otherwise
        # degrade into a meaningless scalar comparison further down.
        raise KeyError(
            "column '" + str(celltype_label) + "' not found in adata.obs")

    if isinstance(celltype, str):
        cell_mask = labels == celltype
    elif isinstance(celltype, (tuple, list)):
        # `isin` selects exactly the requested labels; no placeholder label
        # is needed to seed the mask, so none can be matched by accident.
        cell_mask = labels.isin(list(celltype))
    else:
        # NOTE: kept as sys.exit so existing callers see the same behaviour.
        sys.exit('specify cluster input as a string or tuple')

    cluster_subset = _subset_adata(adata, cell_mask)
    cluster_subset.raw = cluster_subset

    # identify highly variable genes
    filter_result = filter_genes_dispersion(cluster_subset.X,
                                            min_mean=min_mean,
                                            max_mean=max_mean,
                                            min_disp=min_disp)
    if show_plot_filter:
        plot_filter(filter_result)
    print('In total', str(sum(filter_result.gene_subset)),
          'highly variable genes selected within cluster')

    # apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   filter_result.gene_subset,
                                   axis=1,
                                   raw=False)

    # perform further processing
    log1p(cluster_subset)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset)
    # using `svd_solver='arpack'` ensures that the PCA leads to reproducible
    # results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    # `method` was validated above, so exactly one branch applies.
    if method == 'louvain':
        louvain(cluster_subset,
                resolution=resolution,
                random_state=random_seed)
    else:
        leiden(cluster_subset,
               resolution=resolution,
               random_state=random_seed)

    return cluster_subset