def maxLikGlobalDimEst(adata, k=20, nrpcs=50, rlib_loc=''):
    """Estimate the intrinsic dimensionality of the data.

    A PCA is computed on ``adata`` and the resulting principal components are
    handed to the 'maxLikGlobalDimEst' function of the 'intrinsicDimension'
    R package (called through rpy2).

    Note that ``adata`` is modified in place: the PCA is (re)computed on it
    and ``adata.obsm['X_pca']`` is multiplied by -1.

    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    k: `int` | default = 20
        Number of neighbours to use in the 'maxLikGlobalDimEst'.
        Choosing k between 10 and 20 generally yields the best results.
    nrpcs: `int` | default = 50
        Number of PCs to compute initially before estimating the
        dimensionality. Consider increasing it for very high dimensional data.
    rlib_loc: `str` | default = ''
        R library location that will be added to the default .libPaths()
        to locate the required packages.

    Returns
    -------
    int
        The estimated intrinsic dimensionality of the data (rounded), which
        can be used for graph clustering.

    Raises
    ------
    ImportError
        If rpy2 is not installed.
    """
    if importlib.util.find_spec('rpy2') is None:
        raise ImportError(
            "maxLikGlobalDimEst requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    # NOTE(review): anndata2ri is never referenced by name below. The import is
    # kept in case it is relied upon for the numpy -> R conversion of the PCs
    # (no converter is activated in this function) - confirm.
    import anndata2ri  # noqa: F401

    # Prepend the user supplied library location so that R can find the
    # 'intrinsicDimension' package there.
    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(intrinsicDimension))')

    random_state = 0
    print('Using random_state = 0 for all the following calculations')
    sc_pca(adata, svd_solver='arpack', random_state=random_state,
           n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat

    # Expose the inputs to the R session and run the estimator there.
    ro.globalenv['pcs'] = adata.obsm['X_pca']
    ro.globalenv['k'] = k
    ro.r('n <- maxLikGlobalDimEst(as.matrix(pcs), k=k, unbiased=TRUE)')
    ro.r('message("Estimated dimensionality: ", round(n$dim.est))')
    n_dimest = ro.r('round(n$dim.est)')
    return int(n_dimest[0])
def pca_neighbors_umap(adata, results_folder, nrpcs=50, nrpcs_neigh=None,
                       nrneigh=10, method='NULL'):
    """Run PCA, nearest-neighbor search and UMAP, then export the metadata.

    The PCA is computed with ``svd_solver='arpack'`` and multiplied by -1 to
    match Seurat. A two-panel figure (cumulative explained variance and the
    PCA plot) is saved to ``<results_folder>/figures/PCA.png``. Neighbors are
    computed either directly or with bbknn, UMAP coordinates are calculated
    and the metadata is written out via ``export_metadata``.
    ``adata`` is modified in place and also returned.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be processed and exported
    results_folder: `str`
        path to the results folder
    nrpcs: int | nrpcs = 50
        number of principal components to calculate
    nrpcs_neigh: int | nrpcs_neigh = None
        number of principal components to use for nearest neighbor
        calculation. When set to None the number is chosen automatically.
        For .n_vars < 50, .X is used, otherwise 'X_pca' is used with 50
        components.
    nrneigh: int | nrneigh = 10
        number of neighbors to use for the nearest neighbor calculation
    method: `str`
        Method for nearest neighbor calculation. Can be set to 'NULL' or
        'bbknn'. Any value other than 'bbknn' runs the uncorrected neighbor
        search.

    returns
    -------
    AnnData
        the input ``adata`` object with the computed results attached.

    raises
    ------
    SystemExit
        if ``method == 'bbknn'`` and ``adata.obs`` has no column "batch".
    """
    start = time()
    random_state = 0
    print('Using random_state = 0 for all the following calculations')

    # PCA
    sc_pca(adata, svd_solver='arpack', random_state=random_state,
           n_comps=nrpcs)
    adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
    print(
        "PCA calculated using svd_solver = 'arpack'. PCA multiplied by -1 to match Seurat output."
    )

    # generate plot of PCA
    fig, (ax1, ax2) = subplots(ncols=2, nrows=1)
    fig.set_figwidth(12)
    fig.set_figheight(6)
    fig.tight_layout(pad=4.5)

    cumulative_variance = cumsum(adata.uns['pca']['variance_ratio'])
    # Derive the x axis from the components that are actually present instead
    # of from nrpcs, so that the plot cannot fail on a length mismatch when
    # the PCA holds a different number of components than requested.
    x = list(range(len(cumulative_variance)))

    ax1.scatter(x=x, y=cumulative_variance)
    ax1.set_ylabel('cumulative explained variance')
    ax1.set_xlabel('PCA components')
    ax1.set_title('cumulative explained variance (as ratio)')
    sc_pl_pca(adata, ax=ax2)
    fig.savefig(join(results_folder, 'figures', 'PCA.png'))

    # Inner function for simple pca-umap without any correction
    def run_no_correction():
        neighbors(adata, n_neighbors=nrneigh, random_state=random_state,
                  n_pcs=nrpcs_neigh)
        print('Nearest neighbors calculated with n_neighbors = ' +
              str(nrneigh))
        if nrpcs_neigh == 0:
            print('Using .X to calculate nearest neighbors instead of PCs.')
            logging.info(
                'Neighborhood analysis performed with .X instead of PCs.')

    # neighbors
    if method == 'bbknn':
        if 'batch' not in adata.obs.columns:
            sys.exit(
                'bbknn correction requires a column "batch" in the observations.'
            )
        if len(set(adata.obs.get('batch'))) == 1:
            # A single batch leaves nothing to correct for, so fall back to
            # the plain neighbor search.
            print(
                'column "batch" only contains one value. We cannot correct for those; BBKNN is NOT applied.'
            )
            run_no_correction()
        else:
            bbknn.bbknn(adata)
    else:
        run_no_correction()

    # umap
    sc_umap(adata, random_state=random_state)
    print('UMAP coordinates calculated.')

    logging.info('Neighborhood analysis completed, and UMAP generated.')
    logging.info(
        '\t Time for PCA, nearest neighbor calculation and UMAP generation: ' +
        str(round(time() - start, 3)) + 's')

    # export metadata (timed separately from the computation above)
    start = time()
    export_metadata(adata, basepath=results_folder, n_pcs=3, umap=True,
                    tsne=False)
    logging.info(
        'Metadata containing 3 PCAs and UMAP coordinates exported successfully to file.'
    )
    logging.info('Time for export: ' + str(round(time() - start, 3)) + 's')

    return adata
def recluster(adata, celltype, celltype_label='leiden', min_mean=0.0125,
              max_mean=4, min_disp=0.5, resolution=1.0, regress_out_key=None,
              random_seed=0, show_plot_filter=False, method='leiden',
              batch_key=None, n_shared=2):
    """Perform subclustering on specific celltype to identify subclusters.

    Extract all cells that belong to the pre-labeled celltype into a new
    data subset. This datasubset is initialized with the raw data contained
    in adata.raw. New highly variable genes are selected and a new clustering
    is performed. The function returns the adata subset with the new
    clustering annotation.

    This can be performed on leiden clusters by setting
    celltype_label = 'leiden' and passing the clusters that are to be selected
    for reclustering as strings or tuple of strings to the parameter celltype.

    Parameters
    ----------
    adata:
        the complete AnnData object of the Dataset.
    celltype: `str` or (`str`)
        string identifying the cluster which is to be filtered out, if more
        than one is to be selected please pass them as a tuple not as a list!
    celltype_label: `str` | default = 'leiden'
        string identifying which column in adata.obs will be matching with
        the celltype argument.
    min_mean: `float` | default = 0.0125
        the minimum gene expression a gene must have to be considered highly
        variable
    max_mean: `float` | default = 4
        the maximum gene expression a gene can have to be considered highly
        variable
    min_disp: `float` | default = 0.5
        the minimum dispersion a gene must have to be considered highly
        variable
    resolution: `float` | default = 1.0
        resolution passed on to the chosen clustering method
    regress_out_key: `list of str` | default = None
        A list of string identifiers of the adata.obs columns that should be
        regressed out before performing clustering. If None then no
        regress_out is calculated.
    random_seed: `int` | default = 0
        the random seed that is used to produce reproducible PCA, clustering
        and UMAP results
    show_plot_filter: `bool` | default = False
        boolean value indicating if a plot showing the filtering results for
        highly variable gene detection should be displayed or not
    method: `str` | default = 'leiden'
        clustering method to use for the reclustering of the datasubset.
        Possible: louvain/leiden
    batch_key: `str` | default = None
        Specify a batch key if the HVG calculation should be done per batch
    n_shared: `int` | default = 2
        Divide the nr. of batches by this nr. to get the shared HVGs
        considered (e.g. >=1/2 of samples)

    Returns
    -------
    AnnData object containing the subcluster annotated with PCA, nearest
    neighbors, louvain/leiden cluster, and UMAP coordinates.

    Raises
    ------
    ValueError
        if ``method`` is neither 'leiden' nor 'louvain'.
    KeyError
        if ``celltype_label`` is not a column of ``adata.obs``.
    SystemExit
        if ``celltype`` is neither a string nor a tuple.

    Examples
    --------
    For a more detailed example of the entire reclustering process please
    refer to the code examples.

    >>> import besca as bc
    >>> import scanpy as sc
    >>> adata = bc.datasets.pbmc3k_processed()
    >>> adata_subset = bc.tl.rc.recluster(adata, celltype=('0', '1', '3', '6'), resolution = 1.3)
    >>> sc.pl.umap(adata_subset, color = ['leiden', 'CD3G', 'CD8A', 'CD4', 'IL7R', 'NKG7', 'GNLY'])
    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    # Build the boolean cell selection. For tuples, membership is tested
    # directly so that only the requested labels are selected (no sentinel
    # label is needed to seed the mask).
    if isinstance(celltype, str):
        selection = adata.obs[celltype_label] == celltype
    elif isinstance(celltype, tuple):
        selection = adata.obs[celltype_label].isin(celltype)
    else:
        sys.exit('specify cluster input as a string or tuple')

    cluster_subset = _subset_adata(adata, selection)
    cluster_subset.raw = cluster_subset

    # identify highly variable genes
    sc_highly_variable_genes(cluster_subset,
                             min_mean=min_mean,
                             max_mean=max_mean,
                             min_disp=min_disp,
                             inplace=True,
                             batch_key=batch_key)
    if batch_key is not None:
        # Additionally flag genes that are highly variable in at least
        # (number of batches / n_shared) batches.
        n_batches = len(set(cluster_subset.obs[batch_key]))
        hvg_mask = cluster_subset.var['highly_variable'].copy()
        shared = (cluster_subset.var['highly_variable_nbatches'] >=
                  n_batches / n_shared)
        hvg_mask.loc[shared] = True
        cluster_subset.var['highly_variable'] = hvg_mask.copy()

    if show_plot_filter:
        pl_highly_variable_genes(cluster_subset, show=True)
    print('In total', str(sum(cluster_subset.var.highly_variable)),
          'highly variable genes selected within cluster')

    # apply filter
    cluster_subset = _subset_adata(cluster_subset,
                                   cluster_subset.var.highly_variable,
                                   axis=1,
                                   raw=False)

    # perform further processing (no log1p here: data already logged)
    if regress_out_key is not None:
        regress_out(cluster_subset, keys=regress_out_key)
    sc_scale(cluster_subset, max_value=10)
    # using `svd_solver='arpack'` ensures that the PCA leads to reproducible
    # results
    sc_pca(cluster_subset, random_state=random_seed, svd_solver='arpack')
    neighbors(cluster_subset, n_neighbors=10, random_state=random_seed)
    umap(cluster_subset, random_state=random_seed)

    if method == 'louvain':
        louvain(cluster_subset, resolution=resolution,
                random_state=random_seed)
    elif method == 'leiden':
        leiden(cluster_subset, resolution=resolution,
               random_state=random_seed)

    return cluster_subset