def saveSeurat(adata, path, batch, hvgs=None):
    """Convert an AnnData object to a Seurat object and save it as an RDS file.

    Parameters
    ----------
    adata:
        AnnData object; must carry a "counts" layer (used as Seurat counts)
        and `.X` (used as Seurat data).
    path:
        Output path for the Seurat RDS file.
    batch:
        Name of the obs column to set as the Seurat `Idents`.
    hvgs:
        Optional list of highly-variable genes; if given, saved alongside
        the object as `<path without .RDS>_hvg.RDS`.
    """
    import re
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()
    # anndata2ri conversion requires sorted indices on sparse matrices.
    if sparse.issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()
    for key in adata.layers:
        if sparse.issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()
    ro.globalenv['adata'] = adata
    ro.r('sobj = as.Seurat(adata, counts="counts", data = "X")')
    # Fix error if levels are 0 and 1
    # ro.r(f'sobj$batch <- as.character(sobj${batch})')
    ro.r(f'Idents(sobj) = "{batch}"')
    ro.r(f'saveRDS(sobj, file="{path}")')
    if hvgs is not None:
        # Raw string: '\.' in a plain literal is an invalid escape sequence
        # (SyntaxWarning on modern Python). Pattern strips a trailing ".RDS".
        hvg_out = re.sub(r'\.RDS$', '', path) + '_hvg.RDS'
        ro.globalenv['hvgs'] = hvgs
        ro.r('unlist(hvgs)')
        ro.r(f'saveRDS(hvgs, file="{hvg_out}")')
    anndata2ri.deactivate()
def run(self):
    """Normalise ``self.data`` with Seurat's SCTransform and dump to h5ad.

    The SCTransform-normalised matrix is stored in
    ``self.data.layers['normalized']``.
    """
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()
    dataset = self.data
    sc.pp.filter_genes(dataset, min_cells=5)
    # anndata2ri needs sorted indices on every sparse matrix it converts.
    matrices = [dataset.X] + [dataset.layers[name] for name in dataset.layers]
    for mat in matrices:
        if issparse(mat) and not mat.has_sorted_indices:
            mat.sort_indices()
    ro.globalenv['adata'] = dataset
    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    ro.r('res <- SCTransform(object=seurat_obj)')
    dataset.layers['normalized'] = ro.r('res@assays$SCT@data').T
    self.dump_to_h5ad("scTransform")
def kBET_single(matrix, batch, type_ = None, k0 = 10, knn=None, subsample=0.5, heuristic=True, verbose=False):
    """Run a single kBET batch-effect estimation in R and return the p-value.

    params:
        matrix: expression matrix (at the moment: a PCA matrix, so do.pca
            is set to FALSE)
        batch: series or list of batch assignments
        k0: neighbourhood size forwarded to kBET
        knn: precomputed kNN graph forwarded to kBET
        type_, subsample, heuristic: accepted for interface compatibility
            but currently unused — the R call hard-codes heuristic=FALSE
            and no subsampling is performed
    returns:
        kBET average p-value, or np.nan if the result cannot be read
    """
    anndata2ri.activate()
    ro.r("library(kBET)")
    if verbose:
        print("importing expression matrix")
    ro.globalenv['data_mtrx'] = matrix
    ro.globalenv['batch'] = batch
    if verbose:
        print("kBET estimation")
    ro.globalenv['knn_graph'] = knn
    ro.globalenv['k0'] = k0
    batch_estimate = ro.r(f"batch.estimate <- kBET(data_mtrx, batch, knn=knn_graph, k0=k0, plot=FALSE, do.pca=FALSE, heuristic=FALSE, adapt=FALSE, verbose={str(verbose).upper()})")
    anndata2ri.deactivate()
    # Evaluate the R expression once instead of twice (the original re-ran
    # `batch.estimate$average.pval` in the else branch after a successful try).
    try:
        return ro.r("batch.estimate$average.pval")[0]
    except rpy2.rinterface_lib.embedded.RRuntimeError:
        # kBET returns NA/errors when the test could not be performed.
        return np.nan
def identify_empty_droplets(data, min_cells=3, **kw):
    """Detect empty droplets with the DropletUtils::emptyDrops R routine.

    Genes observed in fewer than `min_cells` counts are dropped before the
    test; barcodes with emptyDrops FDR < 0.01 are kept and their FDR stored
    in ``obs['empty_FDR']``.
    """
    import rpy2.robjects as robj
    from rpy2.robjects import default_converter
    from rpy2.robjects.packages import importr
    import anndata2ri
    from rpy2.robjects.conversion import localconverter
    importr("DropletUtils")
    working = data.copy()
    gene_counts = working.X.sum(0)
    # np.matrix sums expose .A; squeeze down to a flat vector.
    if hasattr(gene_counts, 'A'):
        gene_counts = gene_counts.A.squeeze()
    working = working[:, gene_counts > min_cells]
    #adata.X = adata.X.tocsc()
    anndata2ri.activate()
    robj.globalenv["X"] = working
    res = robj.r('res <- emptyDrops(assay(X))')
    anndata2ri.deactivate()
    retained = res.loc[res.FDR < 0.01, :]
    filtered = data[retained.index, :]
    filtered.obs['empty_FDR'] = retained['FDR']
    return filtered
def pyScTransform(adata, output_file=None):
    """Apply Seurat's SCTransform to `adata` in place.

    Stores the normalised matrix in ``adata.layers['normalized']`` and,
    if `output_file` is given, writes the AnnData object to disk.
    """
    import rpy2.robjects as ro
    import anndata2ri
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    anndata2ri.activate()
    sc.pp.filter_genes(adata, min_cells=5)
    # Sorted sparse indices are a prerequisite for the anndata2ri conversion.
    all_mats = [adata.X] + [adata.layers[name] for name in adata.layers]
    for mat in all_mats:
        if issparse(mat) and not mat.has_sorted_indices:
            mat.sort_indices()
    ro.globalenv['adata'] = adata
    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    ro.r('res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)')
    # NOTE(review): the accessor string below looks mangled (possibly an
    # email-obfuscation artifact of 'res@assays$SCT@...'); kept byte-identical
    # to the original — verify against the working version of this file.
    adata.layers['normalized'] = ro.r('res@[email protected]').T
    if output_file:
        adata.write(output_file)
def test_py2rpy_activate(check, shape, dataset):
    """Round-trip a dataset through the R global env with activate/deactivate."""
    try:
        anndata2ri.activate()
        globalenv["adata"] = dataset()
    finally:
        anndata2ri.deactivate()
    converted = globalenv["adata"]
    # R dim() is (genes, cells); reverse to compare against the Python shape.
    assert tuple(reversed(baseenv["dim"](converted))) == shape
    check(converted)
def test_convert_activate(check, shape, dataset):
    """Conversion under activate/deactivate yields an AnnData of the right shape."""
    try:
        anndata2ri.activate()
        converted = dataset()
    finally:
        anndata2ri.deactivate()
    assert isinstance(converted, AnnData)
    assert converted.shape == shape
    check(converted)
def deviance(adata, n_genes=4000, rlib_loc=''):
    """ Wrapper of the 'deviance' method of highly-variable gene selection, included in the 'scry' R package.

    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    n_genes: `int`
        Number of highly-variable genes to return. A selection of 4000-5000 generally yields the best results.
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages.

    Returns
    -------
    returns an AnnData object reduced to the highly-variable genes.
    """
    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "deviance requires rpy2. Install with pip install rpy2")
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    import anndata2ri
    from scipy.sparse import issparse
    anndata2ri.activate()
    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(scry))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')
    # anndata2ri conversion requires sorted indices on sparse matrices.
    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()
    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()
    ro.globalenv['adata'] = adata
    ro.globalenv['n'] = n_genes
    print('Reducing the data to', n_genes, 'variable genes.')
    ro.globalenv['rownam'] = adata.var.index
    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    ro.r('adata <- t(as.matrix(seurat_obj@assays$RNA@counts))')
    ro.r('out <- devianceFeatureSelection(adata)')
    ro.r(
        'out <- sort(devianceFeatureSelection(adata),decreasing = TRUE)[1:n] ')
    hvgs_r = ro.r('rownam[order(out, decreasing = TRUE)][1:n]')
    # .copy(): subsetting returns a view; assigning var columns on a view
    # triggers AnnData's implicit-modification machinery. Materialise first.
    adata = adata[:, list(hvgs_r)].copy()
    adata.var['highly_variable'] = True
    return adata
def save_adata(adata: AnnData, transpose: bool = False):
    """Persist the AnnData matrix and its axis labels as RDS files.

    Writes the (optionally transposed) matrix plus obs/var names so they
    can be reassembled on the R side.
    """
    anndata2ri.activate()
    matrix = adata.X.T if transpose else adata.X
    matrix_file = "adata_t.rds" if transpose else "adata.rds"
    r.saveRDS(matrix, file=matrix_file)
    r.saveRDS(adata.obs_names.values, file="obs_names.rds")
    r.saveRDS(adata.var_names.values, file="var_names.rds")
    anndata2ri.deactivate()
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced
    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as caught:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        # The conversion must complete without emitting any warning.
        assert len(caught) == 0, [w.message for w in caught]
    finally:
        anndata2ri.deactivate()
def test_py2rpy2_numpy_pbmc68k():
    """This has some weird metadata"""
    from scanpy.datasets import pbmc68k_reduced
    try:
        anndata2ri.activate()
        with catch_warnings(record=True) as caught:  # type: List[WarningMessage]
            simplefilter("ignore", DeprecationWarning)
            globalenv["adata"] = pbmc68k_reduced()
        # Exactly one NotConvertedWarning about the raw sparse matrix is expected.
        assert len(caught) == 1, [w.message for w in caught]
        first = caught[0]
        assert first.category is NotConvertedWarning
        assert "scipy.sparse.csr.csr_matrix" in str(first.message)
    finally:
        anndata2ri.deactivate()
def save_stemnet_cluster_pop(size: int, col: int):
    """Export per-cluster membership masks from a pickled GPCCA result to RDS."""
    anndata2ri.activate()
    pickle_path = DATA_DIR / "benchmarking" / "runtime_analysis" / "gpcca.pickle"
    with open(pickle_path, "rb") as fin:
        entry = pickle.load(fin)[size][str(col)]
    # old name: main_states
    annotation = entry["terminal_states"]
    membership = pd.DataFrame(
        {cluster: annotation.isin([cluster]) for cluster in annotation.cat.categories}
    )
    r.saveRDS(membership, file="cluster_pop.rds")
    anndata2ri.deactivate()
def pca_outliers(adata, min_genes_per_cell=5, verbose=True):
    """ Function to filter outliers using scater PCA on quality measures

    Runs scater's calculateQCMetrics + runPCA(detect_outliers=TRUE) on the
    raw count matrix, removes the cells flagged as outliers, then filters
    genes expressed in fewer than `min_genes_per_cell` cells.

    Requires `adata.obs['total_counts']` to be present.
    Returns a new, filtered AnnData object (input is not modified in place).
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        # Silence R console chatter routed through rpy2.
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    ro.r('library(scater)')
    pandas2ri.activate()
    anndata2ri.activate()
    print("Loading objects into R")
    # Densify so R receives a plain matrix (genes x cells after transpose).
    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T
    ro.globalenv['variables'] = adata.var_names.copy()
    ro.globalenv['observations'] = adata.obs[['total_counts']]
    print('Calculate PCA outliers')
    ro.r('')
    ro.r('pd <- DataFrame(data = observations)')
    ro.r('colnames(rawMatrix) <- rownames(pd)')
    ro.r('rownames(rawMatrix) <- variables')
    ro.r(
        'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)'
    )
    ro.r('sce <- calculateQCMetrics(sce)')
    # PCA on QC metrics with scater's built-in outlier detection.
    ro.r('sce <- runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE)')
    ro.r('cat("Nr of outliers detected:", sum(sce$outlier), sep=" ")')
    ro.r('outlier2 = sce@colData@rownames[sce$outlier]')
    ro.r(
        'plotReducedDim(sce, use_dimred="PCA", shape_by = "outlier", size_by = "total_counts", colour_by = "total_features_by_counts")'
    )
    outlier2 = ro.r('outlier2')
    # Keep only cells NOT flagged as outliers.
    adata = adata[np.invert(np.in1d(adata.obs_names, outlier2))].copy()
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)
    return adata
def readSeurat(path):
    """Load a Seurat RDS file and convert it to an AnnData object.

    If the converted object carries an 'X_EMB' embedding, it is renamed to
    the conventional 'X_emb' key (overwriting any existing one).
    """
    anndata2ri.activate()
    ro.r('library(Seurat)')
    ro.r('library(scater)')
    ro.r(f'sobj <- readRDS("{path}")')
    result = ro.r('as.SingleCellExperiment(sobj)')
    anndata2ri.deactivate()
    # Normalise the embedding key: 'X_EMB' -> 'X_emb'.
    if 'X_EMB' in result.obsm:
        if 'X_emb' in result.obsm:
            print(
                'overwriting existing `adata.obsm["X_emb"] in the adata object'
            )
        result.obsm['X_emb'] = result.obsm['X_EMB']
        del result.obsm['X_EMB']
    return result
def normalize(adata, min_mean=0.1): checkAdata(adata) # massive speedup when working with sparse matrix if not sparse.issparse( adata.X): # quick fix: HVG doesn't work on dense matrix adata.X = sparse.csr_matrix(adata.X) anndata2ri.activate() ro.r('library("scran")') # keep raw counts adata.layers["counts"] = adata.X.copy() # Preliminary clustering for differentiated normalisation adata_pp = adata.copy() sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6) sc.pp.log1p(adata_pp) sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack') sc.pp.neighbors(adata_pp) sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5) ro.globalenv['data_mat'] = adata.X.T ro.globalenv['input_groups'] = adata_pp.obs['groups'] size_factors = ro.r( f'computeSumFactors(data_mat, clusters = input_groups, min.mean = {min_mean})' ) del adata_pp # modify adata adata.obs['size_factors'] = size_factors adata.X /= adata.obs['size_factors'].values[:, None] sc.pp.log1p(adata) # convert to sparse, bc operation always converts to dense adata.X = sparse.csr_matrix(adata.X) adata.raw = adata # Store the full data set in 'raw' as log-normalised data for statistical testing
def normalize(adata, min_mean=0.1, log=True, precluster=True, sparsify=True):
    """Normalise counts with scran size factors.

    Extended variant: validates zero-count cells/genes, optionally sparsifies,
    optionally pre-clusters (Louvain) for differentiated normalisation, and
    optionally log1p-transforms. Raw counts go to ``adata.layers['counts']``;
    the (log-)normalised data ends up in `adata.X` and ``adata.raw``.
    Frees the R session memory afterwards.
    """
    checkAdata(adata)
    # Check for 0 count cells
    if np.any(adata.X.sum(axis=1) == 0):
        raise ValueError('found 0 count cells in the AnnData object.'
                         ' Please filter these from your dataset.')
    # Check for 0 count genes
    if np.any(adata.X.sum(axis=0) == 0):
        raise ValueError('found 0 count genes in the AnnData object.'
                         ' Please filter these from your dataset.')
    if sparsify:
        # massive speedup when working with sparse matrix
        if not sparse.issparse(
                adata.X):  # quick fix: HVG doesn't work on dense matrix
            adata.X = sparse.csr_matrix(adata.X)
    anndata2ri.activate()
    ro.r('library("scran")')
    # keep raw counts
    adata.layers["counts"] = adata.X.copy()
    is_sparse = False
    X = adata.X.T
    # convert to CSC if possible. See https://github.com/MarioniLab/scran/issues/70
    if sparse.issparse(X):
        is_sparse = True
        if X.nnz > 2**31 - 1:
            # CSC indptr would overflow 32-bit ints; fall back to COO.
            X = X.tocoo()
        else:
            X = X.tocsc()
    ro.globalenv['data_mat'] = X
    if precluster:
        # Preliminary clustering for differentiated normalisation
        adata_pp = adata.copy()
        sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
        sc.pp.log1p(adata_pp)
        sc.pp.pca(adata_pp, n_comps=15, svd_solver='arpack')
        sc.pp.neighbors(adata_pp)
        sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)
        ro.globalenv['input_groups'] = adata_pp.obs['groups']
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            'list(counts=data_mat)), clusters = input_groups,'
            f' min.mean = {min_mean}))')
        del adata_pp
    else:
        size_factors = ro.r(
            'sizeFactors(computeSumFactors(SingleCellExperiment('
            f'list(counts=data_mat)), min.mean = {min_mean}))')
    # modify adata
    adata.obs['size_factors'] = size_factors
    adata.X /= adata.obs['size_factors'].values[:, None]
    if log:
        # NOTE(review): this message was split across lines by extraction;
        # reconstructed as a single literal — confirm against the original file.
        print("Note! Performing log1p-transformation after normalization.")
        sc.pp.log1p(adata)
    else:
        print("No log-transformation performed after normalization.")
    if is_sparse:
        # convert to sparse, bc operation always converts to dense
        adata.X = sparse.csr_matrix(adata.X)
    adata.raw = adata  # Store the full data set in 'raw' as log-normalised data for statistical testing
    # Free memory in R
    ro.r('rm(list=ls())')
    ro.r(
        'lapply(names(sessionInfo()$loadedOnly), require, character.only = TRUE)'
    )
    ro.r(
        'invisible(lapply(paste0("package:", names(sessionInfo()$otherPkgs)), '
        'detach, character.only=TRUE, unload=TRUE))')
    ro.r('gc()')
def valOutlier(adata, nmads=3, rlib_loc=''):
    """ Estimates and returns the thresholds to use for gene/cell filtering based on outliers calculated from the deviation to the median QCs.
    Wrapper function based on 'isOutlier' function of the 'scater' R package.

    Parameters
    ----------
    adata: `AnnData`
        Unfiltered AnnData object of RNA counts.
    nmads: `int`
        Number of median absolute deviation to use as threshold for outlier detection. Lenient NMADS (3 to 5) generally yield the best results.
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages.

    Returns
    -------
    The estimated parameters to set in the besca workflow considering the QC distribution.
    """
    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "deviance requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    import anndata2ri
    from scipy.sparse import issparse
    anndata2ri.activate()
    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(scater))')
    ro.r('suppressPackageStartupMessages(library(Matrix))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')
    # anndata2ri conversion requires sorted indices on sparse matrices.
    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()
    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()
    ro.globalenv['dat'] = adata
    # Requires a 'SYMBOL' column in adata.var for the R-side row names.
    ro.globalenv['sym'] = adata.var['SYMBOL']
    ro.r('seurat_obj = as.Seurat(dat, counts="X", data = NULL)')
    ro.r(
        'dat = SingleCellExperiment(assays = list(counts=seurat_obj@assays$RNA@counts) )'
    )
    ro.r('rownames(dat) <- sym')
    # R helper: compute per-cell / per-gene QC metrics and derive lower/upper
    # thresholds via isOutlier at the given number of MADs. Messages advise
    # the besca parameter values; the numeric vector is returned to Python.
    ro.r('''
    valOutlier <- function(dat, nmads = 3){
        mito <- grep('MT-', rownames(dat))
        if (length(mito) == 0){
            mito <- NULL
        } else {
            mito <- list(Mito = mito)
        }
        stats_cells <- perCellQCMetrics(dat, subsets = mito )
        stats_genes <- perCellQCMetrics(t(counts(dat)))
        lower_detected <- as.numeric(attr(isOutlier(stats_cells$detected,
            nmads = nmads, type = 'lower'), 'thresholds')['lower'])
        if(lower_detected < 0) lower_detected <- 0
        rm_detected <- sum(isOutlier(stats_cells$detected, nmads = nmads, type = 'lower'))
        lower_expressed <- as.numeric(attr(isOutlier(stats_genes$detected,
            nmads = nmads, type = 'lower'), 'thresholds')['lower'])
        if(lower_expressed < 0) lower_expressed <- 0
        rm_expressed <- sum(isOutlier(stats_genes$detected, nmads = nmads, type = 'lower'))
        lower_sum <- as.numeric(attr(isOutlier(stats_cells$sum,
            nmads = nmads, type = 'lower'), 'thresholds')['lower'])
        if(lower_sum < 0) lower_sum <- 0
        rm_sum <- sum(isOutlier(stats_cells$sum, nmads = nmads, type = 'lower'))
        higher_detected <- as.numeric(attr(isOutlier(stats_cells$detected,
            nmads = nmads, type = 'higher'), 'thresholds')['higher'])
        rm_high_detected <- sum(isOutlier(stats_cells$detected, nmads = nmads, type = 'higher'))
        if(!length(mito) == 0){
            max_mito <- as.numeric(attr(isOutlier(stats_cells$subsets_Mito_percent ,
                nmads = nmads, type = 'higher'), 'thresholds')['higher'])/100
            if(max_mito>1) max_mito <- 1
            rm_mito <- sum(isOutlier(stats_cells$subsets_Mito_percent, nmads = nmads, type = 'higher'))
        }
        higher_sum <- as.numeric(attr(isOutlier(stats_cells$sum,
            nmads = nmads, type = 'higher'), 'thresholds')['higher'])
        if(is.na(higher_sum)) higher_sum <- as.numeric(attr(isOutlier(stats_cells$sum,
            nmads = nmads, type = 'higher'), 'thresholds')['higher', 1])
        rm_high_sum <- sum(isOutlier(stats_cells$sum, nmads = nmads, type = 'higher'))
        message('Advised parameters based on outliers with ',nmads, ' NMADS:')
        message('standard_min_genes: ', round(lower_detected,2), ', removing ', rm_detected, ' cells')
        message('standard_min_cells: ', round(lower_expressed, 2), ', removing ', rm_expressed, ' genes')
        message('standard_min_counts: ', round(lower_sum,2), ', removing ', rm_sum, ' cells')
        message('standard_n_genes: ', round(higher_detected, 2), ', removing ', rm_high_detected, ' cells')
        if(!length(mito) == 0) {
            message('standard_percent_mito: ',
                round(max_mito, 2), ', removing ', rm_mito, ' cells')
        } else {
            message('No mitochondrial gene detected.')
            max_mito <- 1
        }
        message('standard_max_counts: ', round(higher_sum, 2), ', removing ', rm_high_sum, ' cells')
        return(c(round(lower_detected,2), round(lower_expressed, 2),
                 round(lower_sum,2), round(higher_detected, 2),
                 round(max_mito, 2), round(higher_sum, 2)))
    }
    ''')
    ro.globalenv['nmads'] = nmads
    return ro.r('valOutlier(dat, nmads = nmads)')
def scTransform(adata, hvg=False, n_genes=4000, rlib_loc=''):
    """ Function to call scTransform normalization or HVG selection from Python. Modified from https://github.com/normjam/benchmark/blob/master/normbench/methods/ad2seurat.py.

    Parameters
    ----------
    adata: `AnnData`
        AnnData object of RNA counts.
    hvg: `boolean`
        Should the hvg method be used (returning a reduced adata object) or the normalization method (returning a normalized adata).
    n_genes: `int`
        Number of hvgs to return if the hvg method is selected. A selection of 4000-5000 generally yields the best results.
    rlib_loc: `str`
        R library location that will be added to the default .libPaths() to locate the required packages.

    Returns
    -------
    returns an AnnData object reduced to the highly-variable genes.
    """
    rpy2_import = importlib.util.find_spec('rpy2')
    if rpy2_import is None:
        raise ImportError(
            "deviance requires rpy2. Install with pip install rpy2")
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri
    import anndata2ri
    from scipy.sparse import issparse
    ro.globalenv['rlib_loc'] = rlib_loc
    ro.r('.libPaths(c(rlib_loc, .libPaths()))')
    ro.r('suppressPackageStartupMessages(library(Seurat))')
    ro.r('suppressPackageStartupMessages(library(scater))')
    anndata2ri.activate()
    sc.pp.filter_genes(adata, min_cells=5)
    # anndata2ri conversion requires sorted indices on sparse matrices.
    if issparse(adata.X):
        if not adata.X.has_sorted_indices:
            adata.X.sort_indices()
    for key in adata.layers:
        if issparse(adata.layers[key]):
            if not adata.layers[key].has_sorted_indices:
                adata.layers[key].sort_indices()
    ro.globalenv['adata'] = adata
    ro.r('seurat_obj = as.Seurat(adata, counts="X", data = NULL)')
    if hvg:
        numpy2ri.activate()
        ro.globalenv['n_genes'] = n_genes
        print('Reducing the data to', n_genes, 'variable genes.')
        ro.r(
            'res <- SCTransform(object=seurat_obj, return.only.var.genes = TRUE, do.correct.umi = FALSE, variable.features.n = n_genes)'
        )
        # NOTE(review): accessor string below looks mangled (email-obfuscation
        # artifact?); kept byte-identical — verify against the original file.
        hvgs_r = ro.r('res@[email protected]')
        adata = adata[:, list(hvgs_r)]
        adata.var['highly_variable'] = True
        return adata
    else:
        ro.r(
            'res <- SCTransform(object=seurat_obj, return.only.var.genes = FALSE, do.correct.umi = FALSE)'
        )
        # NOTE(review): same mangled-looking accessor as above — verify.
        norm_x = ro.r('res@[email protected]').T
        # NOTE(review): normalised values are stored under layers['counts'];
        # the key name looks suspicious but is preserved as-is.
        adata.layers['counts'] = norm_x
        adata.raw = adata
        return adata
def sctransform(adata, genes=2000, min_genes_per_cell=5, method='poisson', latent=None, batch=None, cores=1, memory=10, verbose=True):
    """ Function to use scTransform. It needs at least the adata.obj['total_counts'] number of UMIs calculated in the data.

    Builds and runs an R `sctransform::vst` call, optionally with latent
    variables and/or a batch variable taken from `adata.obs`, using the
    `future` package for multithreading. Returns a new AnnData restricted
    to the genes kept by vst, with:
      - var['highly_variable']: genes in vst's model_pars
      - layers['norm_sct']: Pearson residuals (vst_out$y)
      - layers['umi_corr']: corrected UMI matrix
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    ro.r('library(scater)')
    ro.r('library(sctransform)')
    ro.r('library(future)')
    pandas2ri.activate()
    anndata2ri.activate()
    print('Filtering genes')
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)
    # Densify; R receives genes x cells after the transpose.
    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T
    latent_var = []
    if latent is None:
        # No latent variables: build a minimal cell_attr from column sums.
        ro.r('cells_info = as.data.frame( colSums(rawMatrix) )')
        ro.globalenv['cellnames'] = np.asarray(adata.obs_names)
        ro.r('rownames(cells_info) = cellnames')
    else:
        latent_var = latent
        ro.globalenv['cells_info'] = adata.obs[latent_var]
        # DataFrame() on the R side prefixes columns with "data."
        latent_var = ['"data.' + i + '"' for i in latent_var]
    ro.globalenv['genes_name'] = adata.var_names
    ro.r('cell_df <- DataFrame(data = cells_info)')
    #ro.r('print(head(cell_df))')
    #ro.r('print(rownames(cell_df)[1:10])')
    #ro.r('rawMatrix=as.data.frame(rawMatrix)')
    ro.r('colnames(rawMatrix) <- rownames(cell_df)')
    ro.r('rownames(rawMatrix) <- genes_name')
    print('Configure future multithreading')
    ro.globalenv['cores'] = cores
    ro.globalenv['memory'] = memory
    ro.r('future::plan(strategy = \'multicore\', workers = cores)')
    ro.r('options(future.globals.maxSize = memory * 1024 ^ 3)')
    print('Run scTransform')
    ro.globalenv['genes'] = int(genes)
    ro.globalenv['min_genes_per_cell'] = int(min_genes_per_cell)
    ro.globalenv['method'] = method
    # The vst call is assembled as a string so optional arguments can be appended.
    stringCommand = 'vst_out=vst( as.matrix(rawMatrix), cell_attr=cell_df, n_genes=genes, method=method, show_progress=TRUE, min_cells=min_genes_per_cell, return_corrected_umi=TRUE'
    #latent_var = ['"data.'+i+'"' for i in latent_var]
    if batch is not None:
        batch = '"data.' + batch + '"'
        stringCommand = stringCommand + ', batch_var=' + batch
        # The batch variable must not also appear among the latent variables.
        # NOTE(review): nesting reconstructed from flattened source — confirm.
        if latent is not None:
            latent_var.remove(batch)
    if ((len(latent_var) > 1) and (batch is not None)) | ((len(latent_var) >= 1) and (batch is None)):
        #print(latent_var)
        stringCommand = stringCommand + ', latent_var=c(' + ','.join(
            latent_var) + ')'
    stringCommand += ')'
    print("Running the command:", stringCommand)
    ro.r(stringCommand)
    print('Extract results')
    new_matrix = ro.r('vst_out$y')
    sct_genes = ro.r('rownames(vst_out$model_pars)')
    all_genes = ro.r('rownames(vst_out$y)')
    umi_corrected = ro.r('vst_out$umi_corrected')
    adata = adata[:, all_genes].copy()
    adata.var['highly_variable'] = [i in sct_genes for i in adata.var_names]
    adata.layers['norm_sct'] = np.transpose(new_matrix)
    adata.layers['umi_corr'] = umi_corrected.T.copy()
    return adata
# Notebook-export setup: imports, anndata2ri activation, and dataset folder
# selection for the simulation benchmark.
import scanpy as sc
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import anndata2ri
import numpy as np
import pandas as pd
import os,sys
try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle
#from rpy2.robjects import r
# Activate the anndata2ri conversion between SingleCellExperiment and AnnData
anndata2ri.activate()
#Loading the rpy2 extension enables cell magic to be used
#This runs R code in jupyter notebook cells
# NOTE(review): '%load_ext' is IPython magic — this file only runs inside
# IPython/Jupyter, not as a plain Python script.
%load_ext rpy2.ipython
# NOTE(review): 'folder' is assigned three times; only the final scalar
# assignment ('de10;1', single file mode) takes effect.
folder = 'de10;1'
folder = ['de10;1','de10;2','de10;3', 'de10;4', 'de10;5', 'db10_1', 'db10;2','db10;3', 'db10;4', 'db10;5', 'dm10;1', 'dm10;2', 'dm10;3','dm10;4', 'dm10;5', 'dp10;1', 'dp10;2', 'dp10;3', 'dp10;4','dp10;5', 'db10_1','db10;2', 'db10;3', 'db10;4','db10;5']
folder = 'de10;1'
# folder = ['de10;1','de10;2','de10;3', 'de10;4', 'de10;5']
## single file
def pca_covariates(adata, covariates=['total_counts'], verbose=False):
    """ Function to output R^2 of covariates against PCA projection

    Uses scater's getVarianceExplained on the log1p counts to compute, per
    gene, the variance explained by each covariate in `adata.obs`; plots the
    per-covariate density (sorted by median R^2) and stores the table in
    ``adata.uns['pca_covariates']``.

    NOTE(review): `covariates` has a mutable default; it is only read here,
    never mutated, so behavior is safe — but callers should not rely on it.
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        # Silence R console chatter routed through rpy2.
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
    import seaborn as sns
    import matplotlib.pyplot as plt
    ro.r('library(scater)')
    pandas2ri.activate()
    anndata2ri.activate()
    print("Loading objects into R")
    # log1p-transformed dense matrix, genes x cells after transpose.
    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = np.log1p(adata.X.T.todense())
    else:
        ro.globalenv['rawMatrix'] = np.log1p(adata.X.T)
    ro.globalenv['observations'] = adata.obs[covariates]
    print('Calculate PCA covariates')
    ro.r('pd <- DataFrame(data = observations)')
    #ro.r('print(pd[1:5,])')
    ro.r('colnames(rawMatrix) <- rownames(pd)')
    ro.r(
        'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)'
    )
    # DataFrame() prefixes covariate columns with "data." on the R side.
    commandString = 'getVarianceExplained(sce, exprs_values = "counts", variables = c('
    variables = ['"data.' + i + '"' for i in covariates]
    commandString = commandString + ','.join(variables) + ') )'
    print("using the R command")
    print(commandString)
    vals = ro.r(commandString)
    # Order covariates by descending median variance explained.
    medians = np.argsort(-np.median(vals, 0))
    medianVals = -np.sort(-np.median(vals, 0))
    vals = pd.DataFrame(vals[:, medians])
    #print(covariates)
    #print(medians)
    vals.columns = np.asarray(covariates)[medians]
    plt.rcParams['figure.figsize'] = (8, 8)
    f, ax = plt.subplots(1)
    for nn, mm in zip(vals.columns, medianVals):
        sns.kdeplot(vals[nn], ax=ax, label=nn, clip=(mm, 97), gridsize=100)
    ax.set_xscale("symlog")
    #plt.xlim(0,100)
    ax.legend(title="Covariates", loc='best')
    adata.uns['pca_covariates'] = vals
    return adata