import numpy as np
import rpy2.rinterface_lib.embedded
import rpy2.robjects as ro
import anndata2ri


def kBET_single(matrix, batch, type_=None, k0=10, knn=None, subsample=0.5, heuristic=True, verbose=False):
    """
    params:
        matrix: expression matrix (at the moment: a PCA matrix, so do.pca is set to FALSE)
        batch: series or list of batch assignments
        subsample: fraction to be subsampled. No subsampling if `subsample=None`
    returns:
        kBET p-value
    """
    anndata2ri.activate()
    ro.r("library(kBET)")

    if verbose:
        print("importing expression matrix")
    ro.globalenv['data_mtrx'] = matrix
    ro.globalenv['batch'] = batch
    #print(matrix.shape)
    #print(len(batch))

    if verbose:
        print("kBET estimation")
    #k0 = len(batch) if len(batch) < 50 else 'NULL'

    ro.globalenv['knn_graph'] = knn
    ro.globalenv['k0'] = k0
    batch_estimate = ro.r(
        "batch.estimate <- kBET(data_mtrx, batch, knn=knn_graph, k0=k0, plot=FALSE, "
        "do.pca=FALSE, heuristic=FALSE, adapt=FALSE, "
        f"verbose={str(verbose).upper()})"
    )
    anndata2ri.deactivate()

    try:
        ro.r("batch.estimate$average.pval")[0]
    except rpy2.rinterface_lib.embedded.RRuntimeError:
        return np.nan
    else:
        return ro.r("batch.estimate$average.pval")[0]
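# Hedged usage sketch (not part of the original code): kBET_single expects a
# low-dimensional matrix (e.g. a PCA embedding) plus per-cell batch labels and
# returns the average kBET p-value, or NaN when the R call errors out. The
# helper name `_example_kbet` and the 'batch' obs column are assumptions for
# illustration.
def _example_kbet(adata):
    import scanpy as sc

    sc.pp.pca(adata, n_comps=50)        # the embedding is passed in directly,
    return kBET_single(                  # so do.pca stays FALSE on the R side
        matrix=adata.obsm['X_pca'],
        batch=adata.obs['batch'],        # assumed per-cell batch annotation
        k0=10,                           # neighbourhood size forwarded to kBET
        verbose=True,
    )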
def identify_empty_droplets(data, min_cells=3, **kw):
    """Detect empty droplets using DropletUtils"""
    import rpy2.robjects as robj
    from rpy2.robjects import default_converter
    from rpy2.robjects.packages import importr
    import anndata2ri
    from rpy2.robjects.conversion import localconverter

    importr("DropletUtils")
    adata = data.copy()
    col_sum = adata.X.sum(0)
    if hasattr(col_sum, 'A'):
        col_sum = col_sum.A.squeeze()
    keep = col_sum > min_cells
    adata = adata[:, keep]
    #adata.X = adata.X.tocsc()

    anndata2ri.activate()
    robj.globalenv["X"] = adata
    res = robj.r('res <- emptyDrops(assay(X))')
    anndata2ri.deactivate()

    keep = res.loc[res.FDR < 0.01, :]
    data = data[keep.index, :]
    data.obs['empty_FDR'] = keep['FDR']
    return data
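# Hedged usage sketch (not part of the original code): identify_empty_droplets
# is meant to run on an unfiltered, raw UMI count matrix; it adds an
# 'empty_FDR' column to .obs and returns only the droplets that
# DropletUtils::emptyDrops keeps at FDR < 0.01. The file name
# 'raw_counts.h5ad' is a hypothetical placeholder.
def _example_empty_droplets():
    import scanpy as sc

    raw = sc.read_h5ad('raw_counts.h5ad')        # unfiltered barcode-by-gene matrix
    filtered = identify_empty_droplets(raw, min_cells=3)
    print(filtered.obs['empty_FDR'].describe())  # FDR values for the retained droplets
    return filtered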
def saveSeurat(adata, path, batch, hvgs=None):
    import re
    from scipy import sparse  # needed for the sparse-matrix checks below

    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()

    # Seurat conversion requires sorted indices on sparse matrices
    if sparse.issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()

    for key in adata.layers:
        if sparse.issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()

    ro.globalenv['adata'] = adata
    ro.r('sobj = as.Seurat(adata, counts="counts", data = "X")')
    # Fix error if levels are 0 and 1
    # ro.r(f'sobj$batch <- as.character(sobj${batch})')
    ro.r(f'Idents(sobj) = "{batch}"')
    ro.r(f'saveRDS(sobj, file="{path}")')

    if hvgs is not None:
        hvg_out = re.sub(r'\.RDS$', '', path) + '_hvg.RDS'
        #hvg_out = path+'_hvg.rds'
        ro.globalenv['hvgs'] = hvgs
        ro.r('unlist(hvgs)')
        ro.r(f'saveRDS(hvgs, file="{hvg_out}")')
    anndata2ri.deactivate()
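# Hedged usage sketch (not part of the original code): saveSeurat writes the
# AnnData object out as a Seurat .RDS file, plus an optional *_hvg.RDS list of
# highly variable genes, for loading from R. It assumes adata carries a
# 'counts' layer (mapped to the Seurat counts slot) and a batch column; the
# paths, the 'batch' column name and the HVG selection here are assumptions.
def _example_save_seurat(adata):
    import scanpy as sc

    sc.pp.highly_variable_genes(adata, n_top_genes=2000)   # assumes log-normalised data in X
    hvgs = adata.var_names[adata.var['highly_variable']].tolist()
    saveSeurat(adata, path='adata_seurat.RDS', batch='batch', hvgs=hvgs)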
def test_py2rpy_activate(check, shape, dataset):
    try:
        anndata2ri.activate()
        globalenv["adata"] = dataset()
    finally:
        anndata2ri.deactivate()
    ex = globalenv["adata"]
    assert tuple(baseenv["dim"](ex)[::-1]) == shape
    check(ex)
def test_convert_activate(check, shape, dataset):
    try:
        anndata2ri.activate()
        ad = dataset()
    finally:
        anndata2ri.deactivate()
    assert isinstance(ad, AnnData)
    assert ad.shape == shape
    check(ad)
def save_adata(adata: AnnData, transpose: bool = False):
    anndata2ri.activate()
    if transpose:
        r.saveRDS(adata.X.T, file="adata_t.rds")
    else:
        r.saveRDS(adata.X, file="adata.rds")
    r.saveRDS(adata.obs_names.values, file="obs_names.rds")
    r.saveRDS(adata.var_names.values, file="var_names.rds")
    anndata2ri.deactivate()
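# Hedged usage sketch (not part of the original code): save_adata dumps the
# expression matrix and the obs/var names as three separate .rds files in the
# working directory, e.g. for loading in R with readRDS(). A dense X is
# assumed here; how a sparse X is handled depends on the active converters.
def _example_save_adata():
    from scanpy.datasets import pbmc68k_reduced

    adata = pbmc68k_reduced()
    save_adata(adata, transpose=False)   # writes adata.rds, obs_names.rds, var_names.rds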
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced

    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as logs:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        assert len(logs) == 0, [m.message for m in logs]
    finally:
        anndata2ri.deactivate()
def log_scran_pooling(adata):
    """Normalize data with scran via rpy2."""
    import anndata2ri
    import scIB.preprocessing

    scprep.run.install_bioconductor("scran")
    # Normalize via scran-pooling with own clustering at res=0.5
    scIB.preprocessing.normalize(adata)
    anndata2ri.deactivate()
    # Make lightweight
    del adata.raw
def save_stemnet_cluster_pop(size: int, col: int):
    anndata2ri.activate()
    with open(DATA_DIR / "benchmarking" / "runtime_analysis" / "gpcca.pickle", "rb") as fin:
        data = pickle.load(fin)[size][str(col)]

    # old name: main_states
    cluster_annot = data["terminal_states"]
    clusters = cluster_annot.cat.categories
    df = pd.DataFrame(dict(zip(clusters, [cluster_annot.isin([c]) for c in clusters])))

    r.saveRDS(df, file="cluster_pop.rds")
    anndata2ri.deactivate()
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced

    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as logs:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        assert len(logs) == 1, [m.message for m in logs]
        assert logs[0].category is NotConvertedWarning
        assert "scipy.sparse.csr.csr_matrix" in str(logs[0].message)
    finally:
        anndata2ri.deactivate()
def readSeurat(path):
    anndata2ri.activate()
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    ro.r(f'sobj <- readRDS("{path}")')
    adata = ro.r('as.SingleCellExperiment(sobj)')
    anndata2ri.deactivate()

    # Test for 'X_EMB'
    if 'X_EMB' in adata.obsm:
        if 'X_emb' in adata.obsm:
            print('overwriting existing `adata.obsm["X_emb"]` in the adata object')
        adata.obsm['X_emb'] = adata.obsm['X_EMB']
        del adata.obsm['X_EMB']

    return adata
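# Hedged round-trip sketch (not part of the original code): readSeurat loads a
# Seurat .RDS file back into an AnnData object via as.SingleCellExperiment and
# renames an 'X_EMB' embedding to the 'X_emb' key used elsewhere. The file
# name matches the hypothetical one used in the saveSeurat example above.
def _example_read_seurat():
    adata = readSeurat('adata_seurat.RDS')
    print(adata)
    print('X_emb' in adata.obsm)  # True if the saved object carried an X_EMB embedding
    return adata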
def normalize(adata, min_mean=0.1, log=True, precluster=True, sparsify=True):
    checkAdata(adata)

    # Check for 0 count cells
    if np.any(adata.X.sum(axis=1) == 0):
        raise ValueError('found 0 count cells in the AnnData object.'
                         ' Please filter these from your dataset.')
    # Check for 0 count genes
    if np.any(adata.X.sum(axis=0) == 0):
        raise ValueError('found 0 count genes in the AnnData object.'
                         ' Please filter these from your dataset.')

    if sparsify:
        # massive speedup when working with sparse matrix
        if not sparse.issparse(adata.X):  # quick fix: HVG doesn't work on dense matrix
            adata.X = sparse.csr_matrix(adata.X)

    anndata2ri.activate()
    ro.r('library("scran")')

    # keep raw counts
    adata.layers["counts"] = adata.X.copy()

    is_sparse = False
    X = adata.X.T
    # convert to CSC if possible. See https://github.com/MarioniLab/scran/issues/70
    if sparse.issparse(X):
        is_sparse = True
        if X.nnz > 2**31 - 1:
            X = X.tocoo()
        else:
            X = X.tocsc()
    ro.globalenv['data_mat'] = X

    if precluster:
        # Preliminary clustering for differentiated normalisation
        adata_pp = adata.copy()
        sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
        sc.pp.log1p(adata_pp)
        sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
        sc.pp.neighbors(adata_pp)
        sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

        ro.globalenv['input_groups'] = adata_pp.obs['groups']
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            'list(counts=data_mat)), clusters = input_groups,'
            f' min.mean = {min_mean}))')
        del adata_pp
    else:
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            f'list(counts=data_mat)), min.mean = {min_mean}))')

    # modify adata
    adata.obs['size_factors'] = size_factors
    adata.X /= adata.obs['size_factors'].values[:, None]
    if log:
        print("Note! Performing log1p-transformation after normalization.")
        sc.pp.log1p(adata)
    else:
        print("No log-transformation performed after normalization.")
    if is_sparse:
        # convert to sparse, bc operation always converts to dense
        adata.X = sparse.csr_matrix(adata.X)
    adata.raw = adata  # Store the full data set in 'raw' as log-normalised data for statistical testing

    # Free memory in R
    ro.r('rm(list=ls())')
    ro.r('lapply(names(sessionInfo()$loadedOnly), require, character.only = TRUE)')
    ro.r('invisible(lapply(paste0("package:", names(sessionInfo()$otherPkgs)), '
         'detach, character.only=TRUE, unload=TRUE))')
    ro.r('gc()')
    anndata2ri.deactivate()
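# Hedged usage sketch (not part of the original code): normalize() expects raw
# counts in adata.X, computes scran size factors (optionally guided by a
# preliminary Louvain clustering), stores them in adata.obs['size_factors'],
# keeps the raw counts in adata.layers['counts'], and log1p-transforms by
# default. The file name below is a hypothetical placeholder, and checkAdata
# is assumed to come from the surrounding scIB package.
def _example_normalize():
    import scanpy as sc

    adata = sc.read_h5ad('filtered_counts.h5ad')   # raw counts, no zero-count cells/genes
    normalize(adata, min_mean=0.1, log=True, precluster=True)
    print(adata.obs['size_factors'].head())
    return adata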