def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """
    Invoke the R transferFunctionMetric and convert the R dataframe result
    into a pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: a pandas dataframe of evalresp results for r_stream1
    :param evalresp2: a pandas dataframe of evalresp results for r_stream2
    :return: a pandas dataframe of metric values
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE: Conversion of dataframes only works if you activate, but we don't want
    # NOTE: conversion to always be automatic so we deactivate() after we're done converting.
    pandas2ri.activate()
    r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
    r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    pandas2ri.deactivate()

    # TODO: Can we just activate/deactivate before/after R_function() without converting
    # TODO: r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    pandas2ri.deactivate()

    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
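# The TODO above asks whether the explicit pre-conversion can be avoided.
# In rpy2 3.x the usual answer is a scoped converter instead of global
# activate()/deactivate(). A minimal sketch, assuming rpy2 >= 3.0 where
# localconverter() and the py2rpy/rpy2py entry points exist (not the
# author's code):
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

def pandas_roundtrip(df: pd.DataFrame) -> pd.DataFrame:
    # Conversion is active only inside this block, so there is no global
    # converter state to restore afterwards.
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(df)    # pandas -> R data.frame
        return ro.conversion.rpy2py(r_df)  # R data.frame -> pandas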
def run_simple(A, B):
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r
    pandas2ri.activate()
    limma = importr('limma')
    edgeR = importr('edgeR')
    counts = pd.concat([A, B], axis=1)
    groups = r.factor(r.c(*([0] * A.shape[1] + [1] * B.shape[1])))
    ro.globalenv['exp'] = groups
    design = r('model.matrix(~exp)')
    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)
    tt = r.topTable(fit, coef=r.ncol(design), number=1e12)
    ttidx = r['row.names'](tt)
    tt = pandas2ri.ri2py(tt)
    cols = tt.columns.to_series()
    cols[0] = 'lfc'
    cols[3] = 'pval'
    cols[4] = 'padj'
    tt.columns = cols
    tt['slp'] = np.log10(tt['pval'])
    tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
    tt.index = ttidx
    return tt
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    n_cells = len(counts.columns)
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
              sce = SingleCellExperiment(assays=list(counts=r_counts))
              return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5, 4, 3, 2, 1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1, 0.2, 0.3, 0.4, 0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
              colData(dds)$conditions = as.factor(conditions)
              design(dds) = formula(~ conditions)
              return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
def computeSumFactors(counts, scran_clusters=True):
    """
    Compute normalization factors using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    if scran_clusters and n_cells >= 50:
        r_clusters = scran.quickCluster(as_matrix(r_counts), min(n_cells/10, 10),
                                        method="igraph")
        min_cluster_size = min(Counter(r_clusters).values())
        sizes = list(range(min(int(min_cluster_size/4), 10),
                           min(int(min_cluster_size/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts),
                                      clusters=r_clusters, sizes=sizes)
    else:
        sizes = list(range(min(int(n_cells/4), 10), min(int(n_cells/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
def read_rdata(rdata_fullpath, table_name):
    """
    Returns the pandas DataFrame
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print("Loading %s" % rdata_fullpath_forR)

    # read in the data from the R session with python
    r['load'](rdata_fullpath_forR)
    # check that it's there
    table_df = pandas2ri.ri2py(r['model_summary'])

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print("  Found %5d NA values in column %s" % (nullcount, col))
    table_df = table_df.fillna(0)
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0:
            print("  -> Found %5d NA values in column %s" % (nullcount, col))

    print("Read %d lines from %s" % (len(table_df), rdata_fullpath))
    return table_df
def logCountsWithFactors(counts, size_factors):
    """
    Uses the R package scater to log a matrix of counts (genes as rows)
    and a vector of size factors using the method normalize().
    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    # the docstring and the R call below (normalize/logcounts) rely on scater;
    # the original loaded "scran" here, which appears to be a slip
    scater = RimportLibrary("scater")
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    r_func = r(r_call)
    r_norm_counts = r_func(r_counts, size_factors)
    pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
    pandas_norm_counts.index = indexes
    pandas_norm_counts.columns = columns
    pandas2ri.deactivate()
    return pandas_norm_counts
def testActivate(self):
    robjects.conversion.py2ri = robjects.default_py2ri
    self.assertNotEqual(rpyp.pandas2ri, robjects.conversion.py2ri)
    rpyp.activate()
    self.assertEqual(rpyp.pandas2ri, robjects.conversion.py2ri)
    rpyp.deactivate()
    self.assertEqual(robjects.default_py2ri, robjects.conversion.py2ri)
def testSeries(self):
    Series = pandas.core.series.Series
    s = Series(numpy.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
    rpyp.activate()
    rp_s = robjects.conversion.py2ri(s)
    rpyp.deactivate()
    self.assertEqual(rinterface.FloatSexpVector, type(rp_s))
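# A minimal round-trip sketch of what the test above exercises, assuming the
# alias used throughout these tests (rpyp = rpy2.robjects.pandas2ri) and
# rpy2 2.x, where conversion.py2ri is the entry point (renamed py2rpy in 3.x).
import numpy
import pandas
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri as rpyp

s = pandas.Series(numpy.arange(3.0), index=['a', 'b', 'c'])
rpyp.activate()
rp_s = robjects.conversion.py2ri(s)   # an R named numeric vector
rpyp.deactivate()
print(type(rp_s))                     # rinterface.FloatSexpVector, per the assertion above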
def dtwWrapper(data, rows, columns, k):
    '''
    wrapper function for dynamic time warping.
    includes use of exponential adaptive tuning function
    with temporal correlation if k > 0
    '''
    # not explicitly called, but needs to be in the R environment
    DTW = importr("dtw")

    # create a data frame of zeros of size number of ids x number of ids
    # fill it with the calculated distance metric for each pairwise comparison
    df_ = pd.DataFrame(index=rows, columns=columns)
    df_ = df_.fillna(0.0).astype(np.float64)

    # fill the array with dtw-distance values
    pandas2ri.activate()
    for i in rows:
        E.info("DTW %s" % i)
        for j in columns:
            series1 = data.loc[i].values.tolist()
            series2 = data.loc[j].values.tolist()
            DTW_value = (R.dtw(series1, series2)).rx('distance')[0][0]
            cort_value = temporalCorrelate(series1, series2)
            tuned_value = adaptiveTune(cort_value, k)
            time_dist = DTW_value * tuned_value
            # assign symmetrically with .loc to avoid chained-assignment pitfalls
            df_.loc[i, j] = float(time_dist)
            df_.loc[j, i] = float(time_dist)

    return df_
def variogram(self, i=0, plot_v=True, **kwargs):
    """
    Generate a variogram

    Parameters
    ----------
    self : Event object with at least one data column
    i : int
        data column index number (defaults to 0)
    plot_v : bool
        generate a plot of the variogram
    **kwargs (target_np, alpha, tol_hor, max_bnd, last_max)

    Returns
    -------
    v : Dataframe containing output from r-variogram function
    """
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    rfuncs = import_r_tools()

    if 'X' not in self.ll_cols:
        self.set_ll()
    df = self.df
    cols = self.data_cols
    r_df = df.loc[:, ['X', 'Y', cols[i]]].dropna(how='any')
    v = pandas2ri.ri2py(rfuncs.get_iSVG(r_df, 3, **kwargs))
    if plot_v:
        v.plot(x='dist', y='gamma', marker='o', figsize=(8, 4))
    return v
def computeMnnBatchCorrection(counts):
    """Computes batch correction to a list of batches (data frames)
    where each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    :param counts: a list of matrices of counts
    :return returns a list of batch corrected matrices of counts
    """
    pandas2ri.activate()
    as_matrix = r["as.matrix"]
    meta = [(x.index, x.columns) for x in counts]
    r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
    RimportLibrary("scran")
    r_call = """
        function(counts) {
          norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
          return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    r_func = r(r_call)
    norm_counts = list()
    for i, x in enumerate(r_func(r_counts)):
        norm_c = pandas2ri.ri2py(x)
        norm_c.index = meta[i][0]
        norm_c.columns = meta[i][1]
        norm_counts.append(norm_c)
    pandas2ri.deactivate()
    return norm_counts
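# A hypothetical smoke test for the helper above. The gene/cell counts are
# made up; scran must be installed in R, and this targets the older scran
# releases where mnnCorrect() still lives in scran (it later moved to
# batchelor). mnnCorrect's default k=20 neighbours needs batches with more
# than 20 cells each.
import numpy as np
import pandas as pd

genes = ['g%d' % i for i in range(60)]
batch1 = pd.DataFrame(np.random.poisson(5, (60, 30)), index=genes)
batch2 = pd.DataFrame(np.random.poisson(5, (60, 25)), index=genes)
corrected = computeMnnBatchCorrection([batch1, batch2])
# one corrected frame per input batch, same shapes and labels
assert [c.shape for c in corrected] == [(60, 30), (60, 25)]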
def krige(self, i=0, v=None, step=1, res=True, plot_v=False, plot_k=True,
          animated=False, **plot_kwargs):
    """
    Krige the dataframe with a single data column or a column index number

    Parameters
    ----------
    self : Event object with at least one data column

    kwargs
    ------
    i : int
        data column index number (defaults to 0)
    v : variogram to use in determining sill and range
    step : grid interval to krige on (in km)
    res : bool
        detrend points before computing kriged values - default True
    plot_v : bool
        plot variogram - default False
    plot_k : bool
        plot kriged values - default True
    animated : bool
        return axis for animation - default False
    **plot_kwargs (cmap, s, latlon, basemap, shpfile, POT, locs, colors)

    Returns
    -------
    k : Dataframe containing output from r-krige function
    """
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    rfuncs = import_r_tools()

    if 'X' not in self.ll_cols:
        self.set_ll()
    if res:
        if not hasattr(self, 'res'):
            self.detrend()
        df = self.res
    else:
        df = self.df
    cols = self.data_cols
    r_df = df.loc[:, ['X', 'Y', cols[i]]].dropna(how='any')
    if not v:
        v = pandas2ri.ri2py(rfuncs.get_variogram(r_df))
    model = 'Sph'
    psill = r_df.var()[cols[i]]
    for j in range(len(v)):
        if v.gamma[j] > psill:
            rng = v.dist[j]
            break
    k = pandas2ri.ri2py(rfuncs.get_krige(r_df, psill, model, rng, step=step))
    k['lat'] = k.y / 110.574
    k['lon'] = k.x / (111.320 * (k['lat'] * pi / 180).apply(cos))
    self.k = k
    if plot_k and animated:
        return self.plot_krige(i, k, rng, step=step, res=res,
                               animated=animated, **plot_kwargs)
    elif plot_k and not animated:
        self.plot_krige(i, k, rng, step=step, res=res,
                        animated=animated, **plot_kwargs)
    else:
        return k
def get_features(self, d={}, thresh=.01, sigma=3, min_size=4, const=5,
                 return_dict=False, buffer=False):
    '''
    Use the R package SpatialVx to identify features.

    Parameters
    ----------
    thresh: .01
    sigma: 3
    min_size: 4
    const: 5
    buffer: False

    Return
    ------
    p: pd.Panel containing parameters characterizing the features found
    '''
    from rpy2 import robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    SpatialVx = importr('SpatialVx')
    rsummary = robjects.r.summary
    r_tools = import_r_tools()

    ll = np.array([self.lon.flatten('F'), self.lat.flatten('F')]).T
    for i in range(self.box.shape[0] - 1):
        hold = SpatialVx.make_SpatialVx(self.box[i, :, :], self.box[i + 1, :, :], loc=ll)
        look = r_tools.FeatureFinder_gaussian(hold, nx=self.box.shape[2],
                                              ny=self.box.shape[1],
                                              thresh=thresh, smoothpar=sigma,
                                              **(dotvars(min_size=min_size)))
        try:
            x = rsummary(look, silent=True)[0]
        except:
            continue
        px = pandas2ri.ri2py(x)
        df0 = pd.DataFrame(px, columns=['centroidX', 'centroidY', 'area',
                                        'OrientationAngle', 'AspectRatio',
                                        'Intensity0.25', 'Intensity0.9'])
        df0['Observed'] = list(df0.index + 1)
        m = SpatialVx.centmatch(look, criteria=3, const=const)
        p = pandas2ri.ri2py(m[12])
        df1 = pd.DataFrame(p, columns=['Forecast', 'Observed'])
        l = SpatialVx.FeatureMatchAnalyzer(m)
        try:
            p = pandas2ri.ri2py(rsummary(l, silent=True))
        except:
            continue
        df2 = pd.DataFrame(p, columns=['Partial Hausdorff Distance', 'Mean Error Distance',
                                       'Mean Square Error Distance', 'Pratts Figure of Merit',
                                       'Minimum Separation Distance', 'Centroid Distance',
                                       'Angle Difference', 'Area Ratio', 'Intersection Area',
                                       'Bearing', 'Baddeleys Delta Metric', 'Hausdorff Distance'])
        df3 = df1.join(df2)
        d.update({self.time[i]: pd.merge(df0, df3, how='outer')})
    if return_dict:
        return(d)
    p = pd.Panel(d)
    if buffer:
        return(self.add_buffer(p))
    return(p)
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''
    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns
    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)
    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame
    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula. Use effect
    # size moderation
    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir, header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')
    R('''png("%s/%s-MAplot.png")''' % (res_dir, header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')
    df = pandas2ri.ri2py(R['res.df'])
    return df
def import_r_tools(filename='r-tools.R'):
    from rpy2.robjects import pandas2ri, r, globalenv
    from rpy2.robjects.packages import STAP
    pandas2ri.activate()
    path = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(path, filename), 'r') as f:
        string = f.read()
    rfuncs = STAP(string, "rfuncs")
    return rfuncs
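# import_r_tools() relies on rpy2's STAP to expose functions from an R source
# file as Python attributes. A self-contained sketch of the same mechanism
# with an inline R string (add_one is made up for illustration):
from rpy2.robjects.packages import STAP

toy = STAP('add_one <- function(x) x + 1', 'toy')  # parse a one-off R "package"
print(toy.add_one(41)[0])  # -> 42.0 (R returns a length-1 numeric vector)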
def treeCutting(infile, expression_file, cluster_file, cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())

    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
def testActivate(self):
    # FIXME: is the following still making sense ?
    self.assertNotEqual(rpyp.py2ri, robjects.conversion.py2ri)
    l = len(robjects.conversion.py2ri.registry)
    k = set(robjects.conversion.py2ri.registry.keys())
    rpyp.activate()
    self.assertTrue(len(conversion.py2ri.registry) > l)
    rpyp.deactivate()
    self.assertEqual(l, len(conversion.py2ri.registry))
    self.assertEqual(k, set(conversion.py2ri.registry.keys()))
def testRi2pandas(self):
    rdataf = robjects.r('data.frame(a=1:2, b=I(c("a", "b")), c=c("a", "b"))')
    rpyp.activate()
    pandas_df = robjects.conversion.ri2py(rdataf)
    rpyp.deactivate()

    self.assertIsInstance(pandas_df, pandas.DataFrame)
    self.assertEquals(('a', 'b', 'c'), tuple(pandas_df.keys()))
    self.assertEquals(pandas_df['a'].dtype, numpy.dtype('int32'))
    self.assertEquals(pandas_df['b'].dtype, numpy.dtype('O'))
    self.assertEquals(pandas_df['c'].dtype, numpy.dtype('O'))
def testSeries_issue264(self):
    Series = pandas.core.series.Series
    s = Series(('a', 'b', 'c', 'd', 'e'),
               index=pandas.Int64Index([0, 1, 2, 3, 4]))
    rpyp.activate()
    rp_s = robjects.conversion.py2ri(s)
    rpyp.deactivate()
    # segfault before the fix
    str(rp_s)
    self.assertEqual(rinterface.ListSexpVector, type(rp_s))
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size, method="igraph")
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
def Kriging_Interpolation_Array(input_array, x_vector, y_vector):
    """
    Interpolate data in an array using Ordinary Kriging
    Reference: https://cran.r-project.org/web/packages/automap/automap.pdf
    """
    # Total values in array
    n_values = np.isfinite(input_array).sum()
    # Load function
    pandas2ri.activate()
    robjects.r('''
        library(gstat)
        library(sp)
        library(automap)
        kriging_interpolation <- function(x_vec, y_vec, values_arr, n_values){
          # Parameters
          shape <- dim(values_arr)
          counter <- 1
          df <- data.frame(X=numeric(n_values), Y=numeric(n_values),
                           INFZ=numeric(n_values))
          # Save values into a data frame
          for (i in seq(shape[2])) {
            for (j in seq(shape[1])) {
              if (is.finite(values_arr[j, i])) {
                df[counter,] <- c(x_vec[i], y_vec[j], values_arr[j, i])
                counter <- counter + 1
              }
            }
          }
          # Grid
          coordinates(df) = ~X+Y
          int_grid <- expand.grid(x_vec, y_vec)
          names(int_grid) <- c("X", "Y")
          coordinates(int_grid) = ~X+Y
          gridded(int_grid) = TRUE
          # Kriging
          krig_output <- autoKrige(INFZ~1, df, int_grid)
          # Array
          values_out <- matrix(krig_output$krige_output$var1.pred,
                               nrow=length(y_vec),
                               ncol=length(x_vec),
                               byrow = TRUE)
          return(values_out)
        }
    ''')
    kriging_interpolation = robjects.r['kriging_interpolation']
    # Execute kriging function and get array
    r_array = kriging_interpolation(x_vector, y_vector, input_array, n_values)
    array_out = np.array(r_array)
    # Return
    return array_out
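# A hypothetical call to the function above. The grid and support points are
# made up; NaN cells are the ones autoKrige fills in. gstat, sp and automap
# must be installed in R, and a real variogram fit wants far more than five
# observations.
import numpy as np

x_vec = np.linspace(0.0, 4.0, 5)
y_vec = np.linspace(0.0, 4.0, 5)
grid = np.full((5, 5), np.nan)
grid[0, 0], grid[0, 4], grid[2, 2], grid[4, 0], grid[4, 4] = 1.0, 2.0, 3.0, 4.0, 5.0
filled = Kriging_Interpolation_Array(grid, x_vec, y_vec)
print(filled.shape)  # (5, 5), with every cell now finite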
# def _zeros_from_weather  (the body of this function is missing in the source)

if __name__ == '__main__':
    pandas2ri.activate()
    # connect the databases:
    con_in = sqlite3.connect('clear_takeoff_test.db')
    con_out = sqlite3.connect('Data.db')
    con_test = sqlite3.connect('test.db')
    # prelude2_sql(con_test)
    # prelude1_Rdata(con_test)
    # prelude3_sql(con_test)
    prelude4_sql(con_test)
def pandas_load(name):
    '''
    loads an .rdata file (R dataframe file) and returns it as a Pandas dataframe.
    :param name: .rdata filename (eg: 'subset.Rdata')
    :return: pandas dataframe object
    '''
    pandas2ri.activate()
    r.load(name)
    # name = 'subset.fcuk.Rdata'
    # name_without_ext = r['.'.join(name.split('.')[-2::-1][::-1])]
    # print(r.ls())  # ls() - list of active objects in R env
    df = pandas2ri.ri2py(r[r.ls()[0]])
    return df
def testRi2pandas_issue207(self):
    d = robjects.DataFrame({'x': 1})
    rpyp.activate()
    try:
        ok = True
        robjects.globalenv['d'] = d
    except ValueError:
        ok = False
    finally:
        rpyp.deactivate()
        if 'd' in robjects.globalenv:
            del(robjects.globalenv['d'])
    self.assertTrue(ok)
def load_ipython_extension(ip):
    """Load the extension in IPython."""
    if pandas2ri:
        pandas2ri.activate()
    else:
        numpy2ri.activate()
    ip.register_magics(RMagics)

    # Initialising rpy2 interferes with readline. Since, at this point, we've
    # probably just loaded rpy2, we reset the delimiters. See issue gh-2759.
    if ip.has_readline:
        ip.readline.set_completer_delims(ip.readline_delims)
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size)
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
def load_ipython_extension(ip):
    """Load the extension in IPython."""
    if hasattr(baseconversion, 'activate'):
        # This is pandas2ri if pandas is installed,
        # or numpy2ri otherwise
        baseconversion.activate()
    ip.register_magics(RMagics)

    # Initialising rpy2 interferes with readline. Since, at this point, we've
    # probably just loaded rpy2, we reset the delimiters. See issue gh-2759.
    if ip.has_readline:
        ip.readline.set_completer_delims(ip.readline_delims)
def run_eblink(tmp, tmp_dir, column_types, a, b, iterations, filenum, numrecords):
    '''
    Provides an interface with R to run ebLink in the background through R.
    '''
    pandas2ri.activate()
    # Get base packages
    # Import data to link
    data = ro.r('read.csv(file = "{}", header = T)'.format(tmp))
    # Set necessary variables
    ## X.c contains the categorical variables
    ## X.s contains the string variables
    ## p.c is the number of categorical variables
    ## p.s contains the number of string variables
    matrix = ro.r['as.matrix']
    c_cols = [x for x in column_types if column_types[x].upper() == 'C']
    xc = matrix(data[c_cols])
    s_cols = [x for x in column_types if column_types[x].upper() == 'S']
    xs = matrix(data[s_cols])
    # list comprehensions rather than len(filter(...)), which is Python-2-only
    pc = ro.IntVector([len([x for x in column_types.values() if x == 'C'])])
    ps = ro.IntVector([len([x for x in column_types.values() if x == 'S'])])
    # Number of iterations
    g = ro.IntVector([iterations])
    # Number of entries in file
    m = ro.IntVector([numrecords])
    # File number identifier
    fn = ro.IntVector(filenum)
    # Subjective choices for distortion probability prior
    a = ro.IntVector([a])
    b = ro.IntVector([b])
    # Steepness parameter; pre-set to recommended value
    c = ro.IntVector([STEEPNESS])
    # Edit distance function; can be swapped for others if desired
    ro.r("d <- function(string1,string2){adist(string1,string2)}")
    d = ro.r['d']
    # Loads in Gibbs sampler and plyr packages
    eb_pack = ro.r("source('{}', chdir = TRUE)".format(find('ebGibbsSampler.R', '../../')))
    plyr = importr("plyr")
    # Move to tmp directory to save results file
    os.chdir(tmp_dir)
    print('Running the gibbs sampler...')
    # Runs the gibbs sampler
    gibbs = ro.r['rl.gibbs']
    lam = gibbs(file_num=fn, X_s=xs, X_c=xc, num_gs=g, a=a, b=b, c=c, d=d, M=m)
    os.chdir('..')
    # Calculate estimated population sizes by finding number of uniques
    appl = ro.r['apply']
    ro.r("len_uniq <- function(x){length(unique(x))}")
    len_uniq = ro.r['len_uniq']
    estPopSize = appl(lam, 1, len_uniq)
    return np.array(lam), np.array(estPopSize)
def run2(counts, formula, normcounts=None):
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r
    pandas2ri.activate()
    limma = importr('limma')
    edgeR = importr('edgeR')
    design_matrix = counts.T.reset_index()[counts.columns.names]
    ro.globalenv['design.matrix'] = design_matrix
    design = r('as.data.frame(model.matrix(' + formula + ', data=design.matrix))')
    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    ro.globalenv['v'] = v
    if normcounts is not None:
        r('write.table(v, "' + normcounts + '",sep="\t",quote = F,col.names = NA)')
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)
    rv = []
    print(r.ncol(design)[0])
    for i in range(1, r.ncol(design)[0]):
        colname = r.colnames(design)[i]
        tt = r.topTable(fit, coef=i, number=1e12)
        ttidx = r['row.names'](tt)
        tt = pandas2ri.ri2py(tt)
        cols = tt.columns.to_series()
        cols[0] = 'lfc'
        cols[3] = 'pval'
        cols[4] = 'padj'
        tt.columns = cols
        tt['slp'] = np.log10(tt['pval'])
        tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
        if r.ncol(design)[0] > 2:
            # prepend colname to columns - only if there are more factors
            cols = tt.columns.to_series().apply(lambda x: '{}_{}'.format(colname, x))
            tt.columns = cols
        tt.index = ttidx
        rv.append(tt)
    return pd.concat(rv, axis=1)
def computeRLEFactors(counts):
    """
    Compute normalization size factors using the RLE method
    described in edgeR and returns them as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    edger = RimportLibrary("edgeR")
    as_matrix = r["as.matrix"]
    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
    pandas_sf = pandas2ri.ri2py(dds)
    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    pandas2ri.deactivate()
    return pandas_sf * pandas_cm
def r(code=None, path=None, rel=True, conda=True, convert=True,
      repo='https://cran.microsoft.com/', **kwargs):
    '''
    Runs the R script and returns the result.

    :arg str code: R code to execute.
    :arg str path: R script path. Cannot be used if code is specified
    :arg bool rel: True treats path as relative to the caller function's file
    :arg bool conda: True overrides R_HOME to use the Conda R
    :arg bool convert: True converts R objects to Pandas and vice versa
    :arg str repo: CRAN repo URL

    All other keyword arguments are passed on as parameters
    '''
    # Use Conda R if possible
    if conda:
        r_home = _conda_r_home()
        if r_home:
            os.environ['R_HOME'] = r_home

    # Import the global R session
    try:
        from rpy2.robjects import r, pandas2ri, globalenv
    except ImportError:
        app_log.error('rpy2 not installed. Run "conda install rpy2"')
        raise
    except RuntimeError:
        app_log.error('Cannot find R. Set R_HOME env variable')
        raise

    # Set a repo so that install.packages() need not ask for one
    r('local({r <- getOption("repos"); r["CRAN"] <- "%s"; options(repos = r)})' % repo)

    # Activate or de-activate automatic conversion
    # https://pandas.pydata.org/pandas-docs/version/0.22.0/r_interface.html
    if convert:
        pandas2ri.activate()
    else:
        pandas2ri.deactivate()

    # Pass all other kwargs as global environment variables
    for key, val in kwargs.items():
        globalenv[key] = val

    if code and path:
        raise RuntimeError('Use r(code=) or r(path=...), not both')
    if path:
        # if rel=True, load path relative to parent directory
        if rel:
            stack = inspect.getouterframes(inspect.currentframe(), 2)
            folder = os.path.dirname(os.path.abspath(stack[1][1]))
            path = os.path.join(folder, path)
        result = r.source(path, chdir=True)
        # source() returns a withVisible: $value and $visible. Use only the first
        result = result[0]
    else:
        result = r(code)

    return result
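# Hypothetical usage of the wrapper above: keyword arguments land in R's
# global environment before the code runs, so scalars pass straight through
# (the variable name x is illustrative, not part of the wrapper's API).
result = r(code='x * 2', x=21)
print(result[0])  # 42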
def R_var_importance(nsamples=40000, data_store=None):
    base = importr('base')

    ###################################################
    # load dataframe
    store = pd.HDFStore(data_store)
    print(store)
    # pandas2ri.activate()
    Xtrain = store['Xtrain']
    ytrain = store['ytrain']
    # Xtest, Xtrain, ytrain, Xval, yval, test_idx, val_idx = prepareAllFeatures()

    # sample
    if nsamples != -1:
        if isinstance(nsamples, str) and 'shuffle' in nsamples:
            print("Shuffle train data...")
            rows = np.random.choice(len(Xtrain.index), size=len(Xtrain.index),
                                    replace=False)
        else:
            rows = np.random.choice(len(Xtrain.index), size=nsamples,
                                    replace=False)
        print("unique rows: %6.2f" % (float(np.unique(rows).shape[0]) / float(rows.shape[0])))
        Xtrain = Xtrain.iloc[rows, :]
        ytrain = ytrain.iloc[rows]

    store.close()
    pandas2ri.activate()
    print(Xtrain.info())
    print(Xtrain.describe(include='all'))
    Xtrain_R = pandas2ri.py2ri_pandasdataframe(Xtrain)
    ytrain_R = pandas2ri.py2ri_pandasseries(ytrain)
    # print(Xtrain_R)

    ###################################################
    # R-code
    # http://stackoverflow.com/questions/27801409/get-field-values-from-rpy2-random-forest-object
    r = robjects.r
    r['options'](warn=-1)
    r.library('randomForest')
    rf = r.randomForest(Xtrain_R, ytrain_R, ntree=250, importance=True, do_trace=1)
    df_imp_R = rf.rx("importance")
    df_imp_R = base.as_data_frame(df_imp_R)
    df_imp = pandas2ri.ri2py(df_imp_R)
    # sort_values() replaces the long-removed DataFrame.sort()
    df_imp = df_imp.sort_values(by=['importance.IncNodePurity'], ascending=False)
    print(df_imp)
    with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
        print(list(df_imp.index))
    # print(r.dimnames(rf[8]))
    r.varImpPlot(rf, sort=True, n_var=30)
def load_rds(filename, types=None):
    import os
    import pandas as pd, numpy as np
    import rpy2.robjects as RO
    import rpy2.robjects.vectors as RV
    import rpy2.rinterface as RI
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    def load(data, types, rpy2_version=3):
        if types is not None and not isinstance(data, types):
            return np.array([])
        # FIXME: I'm not sure if I should keep two versions here
        # rpy2_version 2.9.X is more tedious but it handles BoolVector better
        # rpy2 version 3.0.1 converts bool to integer directly without dealing with
        # NA properly. It gives something like (0,1,-234235).
        # Possibly the best thing to do is to open an issue for it to the developers.
        if rpy2_version == 2:
            # below works for rpy2 version 2.9.X
            if isinstance(data, RI.RNULLType):
                res = None
            elif isinstance(data, RV.BoolVector):
                data = RO.r['as.integer'](data)
                res = np.array(data, dtype=int)
                # Handle c(NA, NA) situation
                if np.sum(np.logical_and(res != 0, res != 1)):
                    res = res.astype(float)
                    res[res < 0] = np.nan
                    res[res > 1] = np.nan
            elif isinstance(data, RV.FactorVector):
                data = RO.r['as.character'](data)
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.IntVector):
                res = np.array(data, dtype=int)
            elif isinstance(data, RV.FloatVector):
                res = np.array(data, dtype=float)
            elif isinstance(data, RV.StrVector):
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.DataFrame):
                res = pd.DataFrame(data)
            elif isinstance(data, RV.Matrix):
                res = np.matrix(data)
            elif isinstance(data, RV.Array):
                res = np.array(data)
            else:
                # I do not know what to do for this
                # But I do not want to throw an error either
                res = str(data)
        else:
            if isinstance(data, RI.NULLType):
                res = None
            else:
                res = data
        if isinstance(res, np.ndarray) and res.shape == (1, ):
            res = res[0]
        return res

    def load_dict(res, data, types):
        '''load data to res'''
        names = data.names if not isinstance(data.names, RI.NULLType) else [
            i + 1 for i in range(len(data))
        ]
        for name, value in zip(names, list(data)):
            if isinstance(value, RV.ListVector):
                res[name] = {}
                res[name] = load_dict(res[name], value, types)
            else:
                res[name] = load(value, types)
        return res

    #
    if not os.path.isfile(filename):
        raise IOError('Cannot find file ``{}``!'.format(filename))
    rds = RO.r['readRDS'](filename)
    if isinstance(rds, RV.ListVector):
        res = load_dict({}, rds, types)
    else:
        res = load(rds, types)
    return res
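# A hypothetical round trip for load_rds(): write an .rds from R via rpy2,
# then read it back. 'demo.rds' and its contents are made up for
# illustration.
import rpy2.robjects as RO

RO.r('saveRDS(list(a=1:3, b="hello"), "demo.rds")')
obj = load_rds('demo.rds')
print(obj)  # a nested dict keyed by the R names, e.g. {'a': ..., 'b': 'hello'}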
def analyze(
    self,
    ground_truth: np.array = None,
    r_home: str = "",
    r_path: str = r"",
    alpha: float = 0.05,
) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
    """
    Analyzes results from the R script for SCDC from the scdney package.
    It is assumed that the effect on the first cell type is significant,
    all others are not.

    Parameters
    ----------
    ground_truth
        binary array for comparison to ground truth
    r_home
        path to R installation on your machine, e.g. "C:/Program Files/R/R-4.0.3"
    r_path
        path to R executable on your machine, e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
    alpha
        p-value cutoff

    Returns
    -------
    summary and classification results

    Tuple
        Tuple(raw summary from R, True positive...)
    """
    os.environ["R_HOME"] = r_home
    os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

    if ground_truth is None:
        ground_truth = np.zeros(self.k)

    import rpy2.robjects as rp
    from rpy2.robjects import numpy2ri, pandas2ri
    numpy2ri.activate()
    pandas2ri.activate()

    r_summary = rp.r(f"""
        library(scdney)
        library(tidyverse)
        library(broom.mixed)
        clust = scDC_noClustering({rp.vectors.StrVector(self.scdc_celltypes).r_repr()},
                                  {rp.vectors.StrVector(self.scdc_subject).r_repr()},
                                  calCI=TRUE, calCI_method=c("BCa"), nboot=100)
        glm = fitGLM(clust, {rp.vectors.StrVector(self.scdc_sample_cond).r_repr()},
                     pairwise=FALSE, subject_effect=FALSE)
        sum = summary(glm$pool_res_fixed)
        sum
    """)
    r_summary = pd.DataFrame(r_summary)

    p_values = r_summary.loc[r_summary["term"].str.contains("condCond_1"), "p.value"].values

    true_indices = np.where(ground_truth == True)[0]
    false_indices = np.where(ground_truth == False)[0]

    pval = np.nan_to_num(np.array(p_values), nan=1)
    tp = sum(pval[true_indices] < alpha)
    fn = sum(pval[true_indices] >= alpha)
    tn = sum(pval[false_indices] >= alpha)
    fp = sum(pval[false_indices] < alpha)

    return r_summary, (tp, tn, fp, fn)
def fit_model(
    self,
    method: str = "we.eBH",
    r_home: str = "",
    r_path: str = r"",
    *args,
    **kwargs
):
    """
    Fits ALDEx2 model.

    Parameters
    ----------
    method
        method that is used to calculate p-values (column name in ALDEx2's output)
    r_home
        path to R installation on your machine, e.g. "C:/Program Files/R/R-4.0.3"
    r_path
        path to R executable on your machine, e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
    args
        passed to `ALDEx2.clr`
    kwargs
        passed to `ALDEx2.clr`

    Returns
    -------
    """
    os.environ["R_HOME"] = r_home
    os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

    K = self.y.shape[1]
    if self.y.shape[0] == 2:
        p_val = [0 for _ in range(K)]
        self.result = None
    else:
        import rpy2.robjects as rp
        from rpy2.robjects import numpy2ri, pandas2ri
        numpy2ri.activate()
        pandas2ri.activate()
        import rpy2.robjects.packages as rpackages
        aldex2 = rpackages.importr("ALDEx2")

        x_fact = pd.factorize(self.x)[0]
        cond = rp.vectors.FloatVector(x_fact.astype("str").flatten().tolist())
        X_t = self.y.T
        nr, nc = X_t.shape
        X_r = rp.r.matrix(X_t, nrow=nr, ncol=nc)

        if "denom" in kwargs.keys():
            kwargs["denom"] = rp.vectors.FloatVector(kwargs["denom"])

        aldex_out = aldex2.aldex_clr(X_r, cond, *args, **kwargs)
        aldex_out = aldex2.aldex_ttest(aldex_out)
        aldex_out = pd.DataFrame(aldex_out)

        p_val = aldex_out.loc[:, method]
        self.result = aldex_out

    self.p_val = p_val
def _quantile_normalize(job_context: Dict, ks_check=True, ks_stat=0.001) -> Dict:
    """
    Apply quantile normalization.
    """
    # Prepare our QN target file
    organism = job_context['organism']
    qn_target = utils.get_most_recent_qn_target_for_organism(organism)

    if not qn_target:
        logger.error(
            "Could not find QN target for Organism!",
            organism=organism,
            dataset_id=job_context['dataset'].id,
            dataset_data=job_context['dataset'].data,
            processor_job_id=job_context["job"].id,
        )
        job_context['dataset'].success = False
        job_context['job'].failure_reason = "Could not find QN target for Organism: " + str(organism)
        job_context['dataset'].failure_reason = "Could not find QN target for Organism: " + str(organism)
        job_context['dataset'].save()
        job_context['job'].success = False
        job_context['failure_reason'] = "Could not find QN target for Organism: " + str(organism)
        return job_context
    else:
        qn_target_path = qn_target.sync_from_s3()
        qn_target_frame = pd.read_csv(qn_target_path, sep='\t', header=None,
                                      index_col=None, error_bad_lines=False)

        # Prepare our RPy2 bridge
        pandas2ri.activate()
        preprocessCore = importr('preprocessCore')
        as_numeric = rlang("as.numeric")
        data_matrix = rlang('data.matrix')

        # Convert the smashed frames to an R numeric Matrix
        # and the target Dataframe into an R numeric Vector
        target_vector = as_numeric(qn_target_frame[0])
        merged_matrix = data_matrix(job_context['merged_no_qn'])

        # Perform the Actual QN
        reso = preprocessCore.normalize_quantiles_use_target(
            x=merged_matrix, target=target_vector, copy=True)

        # Verify this QN, related:
        # https://github.com/AlexsLemonade/refinebio/issues/599#issuecomment-422132009
        set_seed = rlang("set.seed")
        combn = rlang("combn")
        ncol = rlang("ncol")
        ks_test = rlang("ks.test")
        which = rlang("which")

        set_seed(123)

        n = ncol(reso)[0]
        m = 2
        if n >= m:
            combos = combn(ncol(reso), 2)

            # Convert to NP, Shuffle, Return to R
            ar = np.array(combos)
            np.random.shuffle(np.transpose(ar))
            nr, nc = ar.shape
            combos = ro.r.matrix(ar, nrow=nr, ncol=nc)

            # adapted from
            # https://stackoverflow.com/questions/9661469/r-t-test-over-all-columns
            # apply KS test to randomly selected pairs of columns (samples)
            for i in range(1, min(ncol(combos)[0], 100)):
                value1 = combos.rx(1, i)[0]
                value2 = combos.rx(2, i)[0]

                test_a = reso.rx(True, value1)
                test_b = reso.rx(True, value2)

                # RNA-seq has a lot of zeroes in it, which
                # breaks the ks_test. Therefore we want to
                # filter them out. To do this we drop the
                # lowest half of the values. If there's
                # still zeroes in there, then that's
                # probably too many zeroes so it's okay to
                # fail.
                median_a = np.median(test_a)
                median_b = np.median(test_b)

                # `which` returns indices which are
                # 1-indexed. Python accesses lists with
                # zero-indexes, even if that list is
                # actually an R vector. Therefore subtract
                # 1 to account for the difference.
                test_a = [test_a[i - 1] for i in which(test_a > median_a)]
                test_b = [test_b[i - 1] for i in which(test_b > median_b)]

                # The python list comprehension gives us a
                # python list, but ks_test wants an R
                # vector so let's go back.
                test_a = as_numeric(test_a)
                test_b = as_numeric(test_b)

                ks_res = ks_test(test_a, test_b)
                statistic = ks_res.rx('statistic')[0][0]
                pvalue = ks_res.rx('p.value')[0][0]

                job_context['ks_statistic'] = statistic
                job_context['ks_pvalue'] = pvalue

                # We're unsure of how stringent to be about
                # the pvalue just yet, so we're extra lax
                # rather than failing tons of tests. This may need tuning.
                if ks_check:
                    if statistic > ks_stat or pvalue < 0.8:
                        job_context['ks_warning'] = (
                            "Failed Kolmogorov Smirnov test! Stat: " +
                            str(statistic) + ", PVal: " + str(pvalue))
        else:
            logger.warning(
                "Not enough columns to perform KS test - either bad smash or single sample smash.",
                dset=job_context['dataset'].id)

        # And finally convert back to Pandas
        ar = np.array(reso)
        new_merged = pd.DataFrame(ar,
                                  columns=job_context['merged_no_qn'].columns,
                                  index=job_context['merged_no_qn'].index)
        job_context['merged_qn'] = new_merged
        merged = new_merged
    return job_context
""" This module is used to analyze the result of the executeR module """ import numpy as np import pandas as pd import pickle import rpy2 from rpy2.robjects import pandas2ri pandas2ri.activate() from executeR import DataframeStore PICKLE_PATH = '../files/' if __name__ == '__main__': rsnips = pickle.load(open(PICKLE_PATH + "r_dfs.pkl", "rb")).pairs # print(type(rdict)) print(len(rsnips)) # TODO check if all ndarrays were actually supposed to be Series; there might # have been a lost in translation for vectors (it might need to be explicit) uniques = set() count_results = 0 count_errors = 0 types = [] for k in rsnips: expr = k['expr'] out = k['test_results'] # if type(v) == np.ndarray: if expr in uniques: continue errors = 0
def pd_active(self):
    pandas2ri.activate()
def pythonWrapper4Pet(dataframe, snps, covars, trait1, trait2,
                      model1, scriptsdir, model2, resamples=999):
    '''
    This is just a Python wrapper around the R code
    for the PET calculations
    '''
    py2ri.activate()
    E.info("Checking regression models")
    if model1 == "logistic":
        R('''trait1.mod <- binomial''')
        R('''trait1.link <- "logit" ''')
    elif model1 == "linear":
        R('''trait1.mod <- gaussian''')
        R('''trait1.link <- "identity" ''')

    if model2 == "logistic":
        R('''trait2.mod <- binomial''')
        R('''trait2.link <- "logit" ''')
    elif model2 == "linear":
        R('''trait2.mod <- gaussian''')
        R('''trait2.link <- "identity" ''')

    E.info("Running {} regression for trait 1: {}".format(model1, trait1))
    E.info("Running {} regression for trait 2: {}".format(model2, trait2))

    R('''source("%(scriptsdir)s/PET_functions.R")''' % locals())
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    r_df = py2ri.py2ri_pandasdataframe(dataframe)
    R.assign("data.df", r_df)
    r_snps = ro.StrVector([sp for sp in snps])
    R.assign("snp.list", r_snps)
    E.info("Parsing covariates")
    covars = covars.split(",")
    r_covar = ro.StrVector([cv for cv in covars])
    R.assign("covar.list", r_covar)
    E.info("{} covariates found to adjust "
           "in regression models".format(len(covars)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNPs, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % locals())

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    py_out = py2ri.ri2py_dataframe(R["out.res"])

    return py_out
def getdpsis(psifile):
    # Given a table of psis, calculate LME-based p values
    psidf = pd.read_table(psifile, sep='\t', header=0, index_col=False)
    # allow conversion between pandas dataframes and r dataframes
    pandas2ri.activate()

    # define R packages
    nlme = importr('nlme')
    base = importr('base')
    stats = importr('stats')
    qv = importr('qvalue')

    # define formulae
    fmla = Formula('value ~ 1 + conda + polyA')
    rndm = Formula('~ 1 | samples')
    nullfmla = Formula('value ~ 1 + polyA')
    nullrndm = Formula('~1 | samples')

    # Remove any gene that has a psi of NA in any sample
    psidf = psidf.dropna(axis=0)

    # Store relationships of conditions and the samples in that condition
    # It's important that this dictionary be ordered because we are going
    # to be iterating through it
    fracs = ['cytosol', 'membrane', 'insoluble', 'total']
    for combination in combinations(fracs, 2):
        fraca = combination[0]
        fracb = combination[1]
        pvalues = []
        samp_conds = OrderedDict({
            fraca: {
                'polyA': [fraca + '_polyA_Rep1', fraca + '_polyA_Rep2'],
                'ribodep': [fraca + '_ribodep_Rep1', fraca + '_ribodep_Rep2']
            },
            fracb: {
                'polyA': [fracb + '_polyA_Rep1', fracb + '_polyA_Rep2'],
                'ribodep': [fracb + '_ribodep_Rep1', fracb + '_ribodep_Rep2']
            }
        })

        # Get a list of all samples
        samps = []
        for cond in samp_conds:
            for libprep in samp_conds[cond]:
                samps += samp_conds[cond][libprep]

        # Iterate through rows, making a dictionary from every row, turning
        # it into a dataframe, then calculating the p value
        genecounter = 0
        for index, row in psidf.iterrows():
            genecounter += 1
            if genecounter % 1000 == 0:
                print('Gene {0}...'.format(genecounter))
            d = getrowdict(index, row, samp_conds, fraca, fracb)
            # Turn this dictionary into a dataframe
            rowdf = pd.DataFrame.from_dict(d)

            # Get lme p value
            try:
                lm_alt = nlme.lme(fmla, random=rndm, data=rowdf, method='ML')  # test
                lm_null = nlme.lme(nullfmla, random=nullrndm, data=rowdf, method='ML')  # control
                logratio = (stats.logLik(lm_alt)[0] - stats.logLik(lm_null)[0]) * 2
                pvalue = stats.pchisq(logratio, df=1, lower_tail=False)[0]
                # format decimal
                pvalue = float('{:.2e}'.format(pvalue))
            except RRuntimeError:
                print('RRuntime error for {0}!'.format(row['Gene']))
                pvalue = 1.0
            pvalues.append(pvalue)

        # Turn list of pvalues into qvalues
        pvec = FloatVector(pvalues)
        # Get qvalues object
        qobj = qv.qvalue(p=pvec)
        # qvalues are index 2 of qvalue object
        qvalues = list(qobj[2])
        # format decimal
        qvalues = [float('{:.2e}'.format(qvalue)) for qvalue in qvalues]

        # Add pvalues and qvalues to df
        pvalcolname = fraca + '_vs_' + fracb + '_pval'
        qvalcolname = fraca + '_vs_' + fracb + '_qval'
        psidf[pvalcolname] = pvalues
        psidf[qvalcolname] = qvalues

    psidf.to_csv('Drosophilapsi.pval.txt', sep='\t', header=True, index=False)
def r_cal_b(df):
    robjects.r('''
        # create a function `f`
        f <- function(df, verbose=FALSE) {
            if (verbose) {
                cat("I am calling f().\n")
            }
            xMin<-min(df$x)
            xMax<-max(df$x)
            yMin<-min(df$y)
            yMax<-max(df$y)
            xy_PPP <- with(df, ppp(x, y, c(xMin,xMin+50), c(yMin,yMin+50)))
            #xy_PPP <- with(df, ppp(x, y, c(xMin,xMax), c(yMin,yMax)))
            #xy_PPP <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
            #plot(xy_PPP)
            xy=df
            summary(xy)
            xy <- unique(xy)
            xy<-data.matrix(xy)

            # mean center
            mc <- apply(xy, 2, mean)
            # standard distance
            sd <- sqrt(sum((xy[,1] - mc[1])^2 + (xy[,2] - mc[2])^2) / nrow(xy))

            # study area
            buffer_area=50*50
            # Density
            dens <- nrow(xy) / buffer_area

            library(spatstat)
            win<-owin(c(-25,25), c(-25,25))

            # dropped: rspatial cannot be installed from Python, though it works in an R environment
            #library(devtools)
            #if (!require("rspatial")) devtools::install_github('rspatial/rspatial')
            #remotes::install_github("rspatial/rspatial")
            #devtools::install_github("rspatial/rspatial")
            #devtools::install_github("rstudio/sparkapi")
            #library(rspatial)
            #r <- raster(win)

            # quadrat counts
            quadrat_C<-quadratcount(xy_PPP,nx=5,ny=5)
            #plot(quadrat_C)

            # number of quadrats
            quadrats <- sum(quadrat_C)
            f<-table(quadrat_C)
            f<-data.frame(f)
            # number of cases
            cases <- sum(as.integer(f$quadrat_C) * f$Freq)
            mu <- cases / quadrats
            ff <- data.frame(as.integer(f$quadrat_C),f$Freq)
            colnames(ff) <- c('K', 'X')
            ff$Kmu <- ff$K - mu
            ff$Kmu2 <- ff$Kmu^2
            ff$XKmu2 <- ff$Kmu2 * ff$X

            # The observed variance s2 is
            s2 <- sum(ff$XKmu2) / (sum(ff$X)-1)
            # the VMR is
            VMR <- s2 / mu

            # Estimators of the empty-space function F(r)
            Fs<-Fest(xy_PPP)
            #plot(Fs)
            F_km<-mean(Fs$km)
            # nearest-neighbour function G(r)
            Gs<-Gest(xy_PPP)
            G_km<--mean(Gs$km)

            newlist<-list(VMR,F_km,G_km)
            return(newlist)
            #return(VMR)
        }
    ''')
    r_f = robjects.r['f']
    pandas2ri.activate()
    # convert the Python DataFrame to an R data.frame and pass it to the
    # R function defined above via robjects.r()
    r_DF = pandas2ri.py2ri(df[["x", "y"]])
    res = r_f(r_DF)  # the result computed by R
    # print("+"*50)
    # print(res)
    return res
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Function to fit the final model and extract modelling statistics
    Input: model variables as a list, dataframe holding all the data,
           dependent variable, confidence level for reporting statistics
           i.e. 0.95 for 95%
    Output: dataframe with model coefficients and statistics
    """
    #----------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats

    #----------------------------------------------------------------------
    # Fit R model
    # Set R function as string to fit model and return results
    string_ord_mdl = """
        mdl_func <- function(formula,df) {
            library(VGAM)
            mdl1=vglm(formula,family=propodds, data=df)
            ll=logLik(mdl1)
            coefficients_df=coef(summary(mdl1))
            coefficient_cols=colnames(coefficients_df)
            coefficient_rows=rownames(coefficients_df)
            output<-list(ll,coefficients_df,coefficient_cols,coefficient_rows)
            return(output)
        }
    """
    # Transform pandas dataframe to R format
    rdf = pandas2ri.py2ri(df)
    # Set R formula as string using the model parameters and dependent variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)
    # Define R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")
    # Fit model
    output_R = ord_ll.mdl_func(formula, rdf)
    # Extract data and place them in a Pandas dataframe
    coeff_df_temp = output_R[1]
    coeff_df = pandas2ri.ri2py_dataframe(coeff_df_temp)
    cols_df = list(output_R[2])
    rows_df = list(output_R[3])
    coeff_df.columns = cols_df
    coeff_df.index = rows_df

    #----------------------------------------------------------------------
    # Calculate statistics
    # Number of parameters
    n_vars = len(coeff_df)
    # Degrees of freedom for t-distribution
    deg_free = len(df) - n_vars
    # Calculate alpha value from confidence interval
    alpha_ = 1.0 - ci_level
    # array to hold the low % confidence intervals
    low_arr = np.zeros(len(coeff_df))
    # array to hold the high % confidence intervals
    high_arr = np.zeros(len(coeff_df))
    # array to hold the Wald test p-values
    p_val_arr = np.zeros(len(coeff_df))
    # array to hold the t statistic
    t_value_arr = np.zeros(len(coeff_df))
    # loop counter variable
    index_arr = 0

    for index, row in coeff_df.iterrows():
        # Get standard error for variable coefficient from R model fit data
        std_error = row['Std. Error']
        # Get variable coefficient value from R model fit data
        coeff_value = row['Estimate']
        # Calculate t_critical statistic for desired confidence interval
        t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)
        # Calculate low - high confidence interval limits
        low_arr[index_arr] = coeff_value - (t_critical * std_error)
        high_arr[index_arr] = coeff_value + (t_critical * std_error)
        # t statistic calculation to get p-value
        t_value = coeff_value / std_error
        t_value_arr[index_arr] = t_value
        # Calculate p-value
        p_val_arr[index_arr] = 2.0 * (1.0 - stats.t.cdf(np.abs(t_value), deg_free))
        index_arr += 1

    # Set arrays to dataframe columns
    coeff_df['Low ' + str((1.0 - alpha_) * 100) + '%'] = low_arr
    coeff_df['High ' + str((1.0 - alpha_) * 100) + '%'] = high_arr
    coeff_df['P Value'] = p_val_arr
    coeff_df['t Value'] = t_value_arr

    # Delete statistics of R model fit referring to normal distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)

    # Return dataframe with model fit coefficients and statistics
    return coeff_df
def generateLouvainCluster(edgeList):
    # no weights
    # G = nx.Graph(edgeList)
    # weighted edges: networkx, does not work
    # https://github.com/vtraag/louvain-igraph
    # https://python-louvain.readthedocs.io/en/latest/api.html
    # G = nx.Graph()
    # G.add_weighted_edges_from(edgeList)
    # partition = community.best_partition(G, weight='weight')
    # valueResults = []
    # for key in partition.keys():
    #     valueResults.append(partition[key])
    # df = pd.DataFrame()
    # df['Cluster'] = valueResults

    # R:
    # https://github.com/dgrun/RaceID3_StemID2_package/blob/master/R/VarID_functions.R
    fromVec = []
    toVec = []
    weightVec = []
    for edge in edgeList:
        fromVec.append(edge[0])
        toVec.append(edge[1])
        weightVec.append(edge[2])

    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import r, pandas2ri
    pandas2ri.activate()
    igraph = importr('igraph')
    base = importr('base')
    fromV = ro.FloatVector(fromVec)
    toV = ro.FloatVector(toVec)
    # weightV = ro.FloatVector([0.1, 1.0, 1.0, 0.1])
    weightV = ro.FloatVector(weightVec)
    links = ro.DataFrame({'from': fromV, 'to': toV, 'weight': weightV})
    g = igraph.graph_from_data_frame(links, directed=False)
    cl = igraph.cluster_louvain(g)

    def as_dict(vector):
        """Convert an RPy2 ListVector to a Python dict"""
        result = {}
        for i, name in enumerate(vector.names):
            if isinstance(vector[i], ro.ListVector):
                result[name] = as_dict(vector[i])
            elif len(vector[i]) == 1:
                result[name] = vector[i][0]
            else:
                result[name] = vector[i]
        return result

    cl_dict = as_dict(cl)
    df = pd.DataFrame()
    # df['Cluster'] = cl_dict['membership']
    size = float(len(set(cl_dict['membership'])))
    listResult = []
    count = 0
    for i in range(len(cl_dict['membership'])):
        listResult.append(int(cl_dict['membership'][i]) - 1)
        count += 1
    return listResult, size
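# A hypothetical smoke test for the helper above on a toy weighted edge list
# (two obvious triangles joined by one weak edge; requires the igraph R
# package).
edges = [(1, 2, 1.0), (2, 3, 1.0), (1, 3, 1.0),
         (4, 5, 1.0), (5, 6, 1.0), (4, 6, 1.0),
         (3, 4, 0.1)]
labels, n_clusters = generateLouvainCluster(edges)
print(labels, n_clusters)  # e.g. [0, 0, 0, 1, 1, 1] 2.0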
def sctransform(adata, genes=2000, min_genes_per_cell=5, method='poisson',
                latent=None, batch=None, cores=1, memory=10, verbose=True):
    """
    Function to use scTransform. It needs at least the
    adata.obs['total_counts'] number of UMIs calculated in the data.
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging

    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    ro.r('library(scater)')
    ro.r('library(sctransform)')
    ro.r('library(future)')
    pandas2ri.activate()
    anndata2ri.activate()

    print('Filtering genes')
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)

    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T

    latent_var = []
    if latent is None:
        ro.r('cells_info = as.data.frame( colSums(rawMatrix) )')
        ro.globalenv['cellnames'] = np.asarray(adata.obs_names)
        ro.r('rownames(cells_info) = cellnames')
    else:
        latent_var = latent
        ro.globalenv['cells_info'] = adata.obs[latent_var]
        latent_var = ['"data.' + i + '"' for i in latent_var]

    ro.globalenv['genes_name'] = adata.var_names
    ro.r('cell_df <- DataFrame(data = cells_info)')
    # ro.r('print(head(cell_df))')
    # ro.r('print(rownames(cell_df)[1:10])')
    # ro.r('rawMatrix=as.data.frame(rawMatrix)')
    ro.r('colnames(rawMatrix) <- rownames(cell_df)')
    ro.r('rownames(rawMatrix) <- genes_name')

    print('Configure future multithreading')
    ro.globalenv['cores'] = cores
    ro.globalenv['memory'] = memory
    ro.r('future::plan(strategy = \'multicore\', workers = cores)')
    ro.r('options(future.globals.maxSize = memory * 1024 ^ 3)')

    print('Run scTransform')
    ro.globalenv['genes'] = int(genes)
    ro.globalenv['min_genes_per_cell'] = int(min_genes_per_cell)
    ro.globalenv['method'] = method

    stringCommand = ('vst_out=vst( as.matrix(rawMatrix), cell_attr=cell_df, '
                     'n_genes=genes, method=method, show_progress=TRUE, '
                     'min_cells=min_genes_per_cell, return_corrected_umi=TRUE')
    # latent_var = ['"data.'+i+'"' for i in latent_var]
    if batch is not None:
        batch = '"data.' + batch + '"'
        stringCommand = stringCommand + ', batch_var=' + batch
        if latent is not None:
            latent_var.remove(batch)
    if ((len(latent_var) > 1) and (batch is not None)) | ((len(latent_var) >= 1) and (batch is None)):
        # print(latent_var)
        stringCommand = stringCommand + ', latent_var=c(' + ','.join(latent_var) + ')'
    stringCommand += ')'
    print("Running the command:", stringCommand)
    ro.r(stringCommand)

    print('Extract results')
    new_matrix = ro.r('vst_out$y')
    sct_genes = ro.r('rownames(vst_out$model_pars)')
    all_genes = ro.r('rownames(vst_out$y)')
    umi_corrected = ro.r('vst_out$umi_corrected')

    adata = adata[:, all_genes].copy()
    adata.var['highly_variable'] = [i in sct_genes for i in adata.var_names]
    adata.layers['norm_sct'] = np.transpose(new_matrix)
    adata.layers['umi_corr'] = umi_corrected.T.copy()
    return adata
def runme(count_matrix_path, design_matrix_path, gene_column):
    import sys
    import pandas as pd
    import rpy2
    from rpy2.robjects import pandas2ri, Formula, r
    # the original asserts ("assert x, msg") were always true; these spell
    # out the intended version checks
    assert rpy2.__version__.startswith('2.9'), 'Please install rpy2 2.9.1 to run this script'
    assert pd.__version__.startswith('0.19'), 'Please install pandas 0.19 to run this script'
    pandas2ri.activate()
    from rpy2.robjects.packages import importr
    try:
        deseq = importr('DESeq2')
    except Exception:
        raise EnvironmentError('Please install DESeq2 in your R environment')

    # Necessary to translate R dataframe back to Pandas
    to_dataframe = r('function(x) data.frame(x)')

    print('Loading data with pandas')
    count_matrix_df = pd.read_csv(count_matrix_path, sep=',')
    design_matrix_df = pd.read_csv(design_matrix_path, sep=',', index_col=0)

    class py_DESeq2:
        def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'):
            try:
                assert gene_column in count_matrix.columns, 'Wrong gene id column name'
                gene_id = count_matrix[gene_column]
            except AttributeError:
                sys.exit('Wrong Pandas dataframe?')

            self.dds = None
            self.deseq_result = None
            self.resLFC = None
            self.comparison = None
            self.normalized_count_matrix = None
            self.gene_column = gene_column
            self.gene_id = count_matrix[self.gene_column]
            count_matrix = count_matrix.drop(gene_column, axis=1)
            print(f'Number of columns in counts data {count_matrix.shape[1]} | '
                  f'Number of rows in design matrix {design_matrix.shape[0]}')

            # Load dataframe into R environment
            # Important: Change to r.data() if you use numpy's and rpy2's latest versions
            count_matrix = pandas2ri.py2ri(count_matrix)
            # Assign columns to NULL
            count_matrix.names = rpy2.rinterface.NULL
            self.count_matrix = count_matrix
            self.design_matrix = pandas2ri.py2ri(design_matrix)
            self.design_formula = Formula(design_formula)

        def run_deseq(self, **kwargs):
            self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
                                                    colData=self.design_matrix,
                                                    design=self.design_formula)
            self.dds = deseq.DESeq(self.dds, **kwargs)
            # Previous script had "deseq.counts" instead
            self.normalized_count_matrix = deseq.counts_DESeqDataSet(self.dds, normalized=True)

        def get_deseq_result(self, **kwargs):
            self.comparison = deseq.resultsNames(self.dds)
            self.deseq_result = deseq.results(self.dds, **kwargs)
            self.deseq_result = to_dataframe(self.deseq_result)
            self.deseq_result = pandas2ri.ri2py(self.deseq_result)  # back to a pandas dataframe
            self.deseq_result[self.gene_column] = self.gene_id.values
            return self.deseq_result

    print('Creating R objects')
    deseq2_exp = py_DESeq2(count_matrix=count_matrix_df,
                           design_matrix=design_matrix_df,
                           design_formula='~ class_label',
                           gene_column=gene_column)

    print('Running DESeq2 scripts...please be patient')
    deseq2_exp.run_deseq()

    print('Almost done...getting the results ready')
    results = deseq2_exp.get_deseq_result()
    results.to_csv('results.csv')
    print('Done!')
def calculate_measures(x):
    # Save a ts version of our data for some of the functions below
    #rbase.set_seed(123)  # reproducibility seed
    #x_ts_contiguous = r.ts(FloatVector(na_contiguous(x)))
    #print(x_ts_contiguous)

    # Now "activate" pandas2ri and numpy2ri
    pandas2ri.activate()
    numpy2ri.activate()

    N = len(x)
    freq = find_freq_r(x)
    fx = (math.exp((freq - 1) / 50) - 1) / (1 + math.exp((freq - 1) / 50))

    # Decomposition
    decomp_x = decompose(x)

    # Adjust data
    # Unfortunately, frequency is calculated a different way in the decompose
    # function, so there may be data for which this branch would be evaluated
    # even though 'seasonality' is null. Add an extra check so we skip it
    # when all the seasonality values are null.
    if freq > 1 and (not decomp_x['seasonality'].isnull().all()):
        fit = decomp_x['trend'] + decomp_x['seasonality']
    else:
        # Nonseasonal data
        fit = decomp_x['trend']
    adj_x = decomp_x['x'] - fit + np.mean(decomp_x['trend'].dropna())

    # Backtransformation of adjusted data
    if decomp_x['transform']:
        # inv_boxcox(adj_x.values, decomp_x['lambda']) does not work here,
        # so apply the inverse transform explicitly.
        '''
        The Box-Cox transform is given by:
        y = (x**lmbda - 1) / lmbda,  for lmbda > 0
            log(x),                  for lmbda = 0
        '''
        if decomp_x['lambda'] == 0:
            # The forward transform uses the natural log, so invert with exp
            t_adj_x = np.exp(adj_x)
        else:
            # x = ((y * lambda) + 1) ^ (1/lambda)
            t_adj_x = ((adj_x * decomp_x['lambda']) +
                       1)**(1 / decomp_x['lambda'])
    else:
        t_adj_x = adj_x

    # Trend and seasonal measures
    v_adj = np.var(adj_x.dropna())
    threshold = 0.00000000001
    if (freq > 1):
        detrend = decomp_x['x'] - decomp_x['trend']
        deseason = decomp_x['x'] - decomp_x['seasonality']
        if np.var(deseason.dropna()) < threshold:
            trend = 0
        else:
            trend = max(0, min(1, 1 - (v_adj / np.var(deseason.dropna()))))
        if np.var(detrend.dropna()) < threshold:
            seasonality = 0
        else:
            seasonality = max(
                0, min(1, 1 - (v_adj / np.var(detrend.dropna()))))
    else:
        # Nonseasonal data
        if np.var(decomp_x['x'].dropna()) < threshold:
            trend = 0
        else:
            trend = max(
                0, min(1, 1 - (v_adj / np.var(decomp_x['x'].dropna()))))
        seasonality = 0

    measures = [fx, trend, seasonality]

    # Measures on the original data
    xbar = np.mean(x.dropna())
    std = np.std(x.dropna())

    # Serial correlation (make sure the Box-Pierce statistic is returned too)
    #bp = boxpierce(x, lags=max_lag)
    # Had to fix the stattest module in the pypr package via:
    # https://gist.github.com/betterxys/1def38e1fcbb7f3b2dab2393bcea52f0
    max_lag = 10
    lbvalue, pvalue, bpvalue, bppvalue = acorr_ljungbox(x,
                                                        lags=max_lag,
                                                        boxpierce=True)
    # The above returns values for each lag, so just grab the final value
    Q = bpvalue[-1] / (N * max_lag)
    fQ = f2_transformation(Q, 7.53, 0.103)

    # Nonlinearity (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    '''
    non_linear_test = rtseries.terasvirta_test_ts(x_ts_contiguous,type = "Chisq")
    #non_linear_test = rtseries.terasvirta_test_default(y=x_contiguous,x=x_contiguous.index.dayofyear,type = "Chisq")
    p = non_linear_test[np.where(non_linear_test.names == 'statistic')[0].item()][0]
    fp = f1_transformation(p,0.069,2.304)
    '''
    fp = None

    # Skewness
    skew = abs(np.mean((x.dropna() - xbar)**3) / std**3)
    fs = f1_transformation(skew, 1.510, 5.993)

    # Kurtosis
    kurtosis = np.mean((x.dropna() - xbar)**4) / std**4
    fk = f1_transformation(kurtosis, 2.273, 11567)

    # Hurst = d + 0.5, where d is the fractional difference
    hurst = rfracdiff.fracdiff(na_contiguous(x), 0, 0)
    H = hurst[np.where(hurst.names == 'd')[0].item()].item() + 0.5

    # Lyapunov exponent
    if freq > (N - 10):
        # There is insufficient data, so declare this measure as None
        fLyap = None
    else:
        Ly = np.zeros(N - freq)
        for i in range(0, (N - freq)):
            diffs = abs(x.iloc[i] - x)
            date_idx = diffs.sort_values().index
            int_idx = pd.Index(
                [diffs.index.get_loc(date) for date in date_idx])
            idx = int_idx[int_idx < (N - freq)]
            j = idx[1]
            try:
                Ly[i] = math.log(
                    abs((x.iloc[i + freq] - x.iloc[j + freq]) /
                        (x.iloc[i] - x.iloc[j]))) / freq
            except (ValueError, ZeroDivisionError):
                # log(0) was taken or the denominator was zero
                Ly[i] = 0
            if (np.isnan(Ly[i]) or (Ly[i] == np.inf) or (Ly[i] == -np.inf)):
                Ly[i] = np.nan
        Lyap = np.mean(Ly[~np.isnan(Ly)])
        fLyap = math.exp(Lyap) / (1 + math.exp(Lyap))

    measures = measures + [fQ, fp, fs, fk, H, fLyap]

    # Measures on the adjusted data
    xbar = np.mean(t_adj_x.dropna())
    std = np.std(t_adj_x.dropna())

    # Serial correlation (make sure the Box-Pierce statistic is returned too)
    #bp = boxpierce(adj_x, lags=max_lag)
    max_lag = 10
    lbvalue, pvalue, bpvalue, bppvalue = acorr_ljungbox(na_contiguous(adj_x),
                                                        lags=max_lag,
                                                        boxpierce=True)
    # The above returns values for each lag, so just grab the final value
    Q = bpvalue[-1] / (N * max_lag)
    fQ = f2_transformation(Q, 7.53, 0.103)

    # Nonlinearity, wrapped in try/except to capture data where this fails
    # (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    try:
        adj_x_contiguous = na_contiguous(adj_x)
        non_linear_test = rtseries.terasvirta_test_ts(adj_x_contiguous,
                                                      type="Chisq")
        #non_linear_test = rtseries.terasvirta_test_default(y=adj_x_contiguous,x=adj_x_contiguous.index.dayofyear,type = "Chisq")
        p = non_linear_test[np.where(
            non_linear_test.names == 'statistic')[0].item()][0]
        fp = f1_transformation(p, 0.069, 2.304)
    except ValueError:
        print('This block did not work for the following data:\n', adj_x)

    # Skewness
    skew = abs(np.mean((t_adj_x.dropna() - xbar)**3) / (std**3))
    fs = f1_transformation(skew, 1.510, 5.993)

    # Kurtosis
    kurtosis = np.mean((t_adj_x.dropna() - xbar)**4) / (std**4)
    fk = f1_transformation(kurtosis, 2.273, 11567)

    measures_list = measures + [fQ, fp, fs, fk]

    # Assemble a single-row dataframe of all thirteen measures
    measures_df = pd.DataFrame([measures_list],
                               columns=[
                                   "frequency", "trend", "seasonal",
                                   "autocorrelation", "non-linear",
                                   "skewness", "kurtosis", "Hurst",
                                   "Lyapunov", "dc autocorrelation",
                                   "dc non-linear", "dc skewness",
                                   "dc kurtosis"
                               ])

    return measures_df
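# The helpers used above (na_contiguous, find_freq_r, decompose,
# f1_transformation, f2_transformation) are defined elsewhere in this module.
# For reference, a minimal sketch of na_contiguous(), assuming it mirrors
# R's stats::na.contiguous, i.e. it returns the longest run of consecutive
# non-NA values:
import numpy as np
import pandas as pd


def na_contiguous_sketch(x):
    """Longest stretch of consecutive non-NA values in a pandas Series."""
    notna = np.append(x.notna().to_numpy(), False)  # sentinel closes last run
    best_start, best_len, run_start = 0, 0, None
    for i, ok in enumerate(notna):
        if ok and run_start is None:
            run_start = i
        elif not ok and run_start is not None:
            if i - run_start > best_len:
                best_start, best_len = run_start, i - run_start
            run_start = None
    return x.iloc[best_start:best_start + best_len]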
try:
    import rpy2.robjects.pandas2ri as rpandas
except ModuleNotFoundError as e:
    if "tzlocal" in e.msg:
        raise ModuleNotFoundError(
            e.msg + "\n Install tzlocal with `pip install tzlocal`.")
    else:
        raise

import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr

__all__ = ["import_adehabitat", "import_trajr", "plot_ltraj", "to_trajr",
           "to_ltraj"]

rpandas.activate()

ADEHABITAT_INSTALLED = False
TRAJR_INSTALLED = False


def import_adehabitat(suppress_messages=True):
    global ADEHABITAT_INSTALLED
    if not ADEHABITAT_INSTALLED:
        utils = rpackages.importr("utils",
                                  suppress_messages=suppress_messages)
        print("Importing adehabitat")
        utils.chooseCRANmirror(ind=1)
        utils.install_packages("adehabitatLT")
        ADEHABITAT_INSTALLED = True
    adehabitat = importr("adehabitatLT",
                         suppress_messages=suppress_messages)
    return adehabitat
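# Usage sketch (illustrative): the first call installs adehabitatLT from the
# selected CRAN mirror; later calls skip the install and only import it.
# rpy2 exposes adehabitatLT's as.ltraj() as adehabitat.as_ltraj, since dots
# in R names are mangled to underscores.
adehabitat = import_adehabitat(suppress_messages=True)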
def consensusClustering(infile,
                        cutHeight,
                        cluster_algorithm,
                        min_size=30,
                        deepsplit=False):
    '''
    Hierarchical clustering based on gene-cluster correlation across
    resampled datasets. The tree is cut either at a fixed height or with
    dynamic tree cut.
    TODO: change this to cutHeight? i.e. 0.2 = 80% clustering agreement
    OR use dynamic tree cut without deepsplit.
    '''
    condition = infile.split("/")[1].split("-")[0]
    wgcna_out = "tmp.dir/consensus-WGCNA.out"

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressMessages(library("WGCNA"))''')
    R('''suppressMessages(library("flashClust"))''')

    E.info("loading distance matrix")
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    labels = df.index.tolist()
    labels_r = ro.StrVector([l for l in labels])

    # py2ri requires activation
    pandas2ri.activate()
    df_r = pandas2ri.py2ri(df)
    R.assign("distance.frame", df_r)
    R.assign("labels", labels_r)

    # large matrices/distance objects may need more memory - allow up to
    # ~10GB (memory.limit takes MB and is a no-op outside Windows)
    R('''memory.limit(10000)''')
    R('''rownames(distance.frame) <- labels''')
    R('''distance_data <- data.matrix(distance.frame)''')

    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''clustering <- flashClust(as.dist(1-distance_data),'''
      '''method='%(cluster_algorithm)s')''' % locals())

    if cutHeight > float(0.01):
        R('''cluster_cut <- cutreeStatic(dendro=clustering, '''
          '''minSize=%(min_size)i, cutHeight=%(cutHeight)s)''' % locals())
    elif deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=T, minClusterSize=%(min_size)i)''' % locals())
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=F, minClusterSize=%(min_size)i)''' % locals())

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')

    # plot and save dendrogram of clustering
    # AH: disabled, requires plots.dir to exist which might not be the case
    # AH: and thus causes this method to fail. Path names need to be
    # AH: parameterizable.
    # R('''png("plots.dir/%(condition)s-dendrogram-consensus_clustering.png")'''
    #   % locals())
    # R('''plotDendroAndColors(dendro=clustering, colors=color_cut,'''
    #   '''groupLabels="Dynamic tree cut",'''
    #   '''dendroLabels=F, addGuide=T, guideHang=0.05, '''
    #   '''hang=0.03, main="%(condition)s")''' % locals())
    # R('''dev.off()''')
    # R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    return cluster_frame
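# Usage sketch (illustrative). The input path is hypothetical; the file must
# be a tab-separated gene-by-gene agreement/correlation matrix with gene ids
# as both index and header, as read by pd.read_table() above. Note the
# "dir/condition-..." path layout assumed by the condition parsing.
clusters = consensusClustering('input.dir/cond1-agreement.tsv',
                               cutHeight=0.2,
                               cluster_algorithm='average',
                               min_size=30)
# Returns a two-column dataframe mapping gene_id to its cluster colour label.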
def fit_model(
        self,
        r_home: str = "",
        r_path: str = "",
        *args,
        **kwargs
):
    """
    Fits the Beta-Binomial model via corncob.

    Parameters
    ----------
    r_home
        path to the R installation on your machine,
        e.g. "C:/Program Files/R/R-4.0.3"
    r_path
        path to the R executable on your machine,
        e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
    args
        passed to `corncob`
    kwargs
        passed to `corncob`

    Returns
    -------
    Nothing; the per-cell-type p-values are stored in ``self.p_val``.
    """
    os.environ["R_HOME"] = r_home
    os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

    K = self.y.shape[1]
    if self.y.shape[0] == 2:
        p_val = [0 for _ in range(K)]
        self.result = None
    else:
        import rpy2.robjects as rp
        from rpy2.robjects import numpy2ri, pandas2ri
        numpy2ri.activate()
        pandas2ri.activate()

        if self.y.shape[0] == 4:
            phi = 1
        else:
            phi = self.covariate_column

        p_val = rp.r(f"""
        library(corncob)
        library(phyloseq)

        # prepare the phyloseq data format
        counts = {pandas2ri.py2rpy_pandasdataframe(pd.DataFrame(self.y, columns=self.var.index)).r_repr()}
        sample = {pandas2ri.py2rpy_pandasdataframe(pd.DataFrame(self.x, columns=[self.covariate_column])).r_repr()}
        cell_types = colnames(counts)
        OTU = otu_table(counts, taxa_are_rows = FALSE)

        # create the phyloseq data object
        data = phyloseq(OTU, sample_data(sample))

        corncob_out = differentialTest(formula = ~ {self.covariate_column},
                                       phi.formula = ~ {phi},
                                       formula_null = ~ 1,
                                       phi.formula_null = ~ {phi},
                                       test = "LRT",
                                       boot = FALSE,
                                       data = data,
                                       fdr_cutoff = 0.05
                                       )

        # Test functions on a single cell type
        # corncob = bbdml(formula = cell_type ~ 1,
        #                 phi.formula = ~ 1,
        #                 data = data)
        # corncob_DA = bbdml(formula = cell_type ~ {self.covariate_column},
        #                    phi.formula = ~ {self.covariate_column},
        #                    data = data)
        # p_vals[cell_type] = lrtest(mod_null = corncob, mod = corncob_DA)

        p_vals = corncob_out$p_fdr
        p_vals
        """)
    self.p_val = p_val
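# The embedding pattern used above, shown in isolation (a minimal sketch):
# a pandas dataframe can be inlined into an R code string by converting it
# with py2rpy_pandasdataframe() and splicing in its r_repr() text. This
# avoids assigning to ro.globalenv, at the cost of a large R source string
# for big tables. The dataframe here is a toy example.
import pandas as pd
import rpy2.robjects as rp
from rpy2.robjects import pandas2ri

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
r_df = pandas2ri.py2rpy_pandasdataframe(df)
col_sums = rp.r(f"colSums({r_df.r_repr()})")  # runs entirely inside R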
def main(arg_dir='output', respMatrix=None, arg_url=None):
    #mongoClient = arguments.url
    # Import R's utils package, used to install and import R packages
    utils = rpackages.importr('utils')
    utils.chooseCRANmirror(ind=1)

    # R packages to install: ltm is used to compute the IRT parameters
    packnames = ('ltm',)

    # Install the package only if it is not already installed
    names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
    if len(names_to_install) > 0:
        print('Installing the R package ltm\n')
        utils.install_packages(StrVector(names_to_install))

    # Import the R package ltm
    ltm = rpackages.importr('ltm')
    pandas2ri.activate()

    if arg_dir != '':
        if not os.path.exists(arg_dir):
            os.makedirs(arg_dir)
        out = '/' + arg_dir
    else:
        out = ''

    # Collect all files containing the IRT response matrices
    list_data_irt = []
    if respMatrix is None:
        # List all dataset directories inside the output folder
        list_dir = os.listdir(os.getcwd() + out)
        for path in list_dir:
            # if os.path.exists(os.getcwd()+out+'/'+path+'/'+path+'_irt.csv'):
            try:
                read = csv.reader(
                    open(
                        os.getcwd() + out + '/' + path + '/' + path +
                        '_irt.csv', "r"))
                list_data_irt.append(path + '_irt.csv')
            except IOError:
                print('IRT input file not found for dataset ', path)
    else:
        list_data_irt.append(respMatrix)

    #file = ('heart-statlog_irt.csv')
    #data = robjects.r('PL3.rasch<-tpm(read.csv(file="heart-statlog_irt.csv"))')
    #print('\nStarting the IRT parameter computation for the datasets: ',list_dir)

    # Compute the IRT parameters for every dataset
    for f in range(len(list_data_irt)):
        print("Computing the IRT parameters for dataset: ", list_data_irt[f])

        # Compute the IRT parameters with R's ltm package
        if respMatrix is None:
            file = os.getcwd(
            ) + '/' + out + '/' + list_dir[f] + '/' + list_data_irt[f]
        else:
            file = formatMatrix(list_data_irt[f])
            file = file.replace('\\', '/')
        #try:
        data = robjects.r('tpm(read.csv(file="' + file +
                          '"),IRT.param = TRUE)')
        #except:
        #    data = robjects.r('tpm(read.csv(file="'+file+'"),control = list(optimizer = "nlminb"))')

        # Parse the parameter output (items are reported as V1, V2, ...)
        par = (str(data).split('\n'))

        # Store the parameters in a dictionary. The labels Discriminacao/
        # Dificuldade/Adivinhacao (discrimination/difficulty/guessing) are
        # kept as-is because they name columns in the output CSV.
        parameter_dict = {}
        parameters = ['Discriminacao', 'Dificuldade', 'Adivinhacao']
        for i in range(len(par)):
            try:
                if par[i][0] == 'V':
                    pass
                else:
                    continue
            except IndexError:
                continue
            item = par[i].split()[0]
            tmp_dict = {}
            for p in range(3):
                tmp_dict[parameters[p]] = float(par[i].split()[3 - p])
            parameter_dict[item] = tmp_dict

        list_dis = []
        list_dif = []
        list_adv = []
        for i in parameter_dict:
            list_dis.append(parameter_dict[i]['Discriminacao'])
            list_dif.append(parameter_dict[i]['Dificuldade'])
            list_adv.append(parameter_dict[i]['Adivinhacao'])

        # normalized_dis = normalize(list_dis,-4,4)
        # normalized_dif = normalize(list_dif,-4,4)
        # c = 0
        # for i in parameter_dict:
        #     parameter_dict[i]['Discriminacao'] = normalized_dis[c]
        #     parameter_dict[i]['Dificuldade'] = normalized_dif[c]
        #     c += 1

        dataframe = pd.DataFrame.from_dict(parameter_dict)
        dataframe = dataframe.reindex(index=parameters)
        #break

        # Save the IRT parameters in each dataset's folder
        if respMatrix is None:
            dataframe.transpose().to_csv(r'' + os.getcwd() + out + '/' +
                                         list_dir[f] + '/irt_item_param.csv')
        else:
            os.remove(file)
            dataframe.transpose().to_csv(r'' + os.getcwd() + out +
                                         '/irt_item_param.csv')

        # Insert the IRT data into MongoDB
        if arg_url is not None:
            try:
                insertMongo(parameter_dict, arg_url, list_dir[f])
                print('==> Data saved successfully :)\n')
            except Exception:
                print("Could not insert the data into MongoDB :/ \n"
                      "Check that the database url, user name and password "
                      "are correct\n")
def save_rds(data, filename):
    import collections.abc
    import re
    import pandas as pd
    import numpy as np
    import rpy2.robjects as RO
    import rpy2.rinterface as RI
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    # Supported data types:
    # int, float, str, tuple, list, numpy array
    # numpy matrix and pandas dataframe
    int_type = (int, np.int8, np.int16, np.int32, np.int64)
    float_type = (float, np.floating)

    def assign(name, value):
        name = re.sub(r'[^\w' + '_.' + ']', '_', name)
        if isinstance(value, (tuple, list)):
            if all(isinstance(item, int_type) for item in value):
                value = np.asarray(value, dtype=int)
            elif all(isinstance(item, float_type) for item in value):
                value = np.asarray(value, dtype=float)
            else:
                value = np.asarray(value)
        if isinstance(value, np.matrix):
            value = np.asarray(value)
        if isinstance(value, (str, np.ndarray) + float_type + int_type):
            if isinstance(value, np.ndarray) and value.dtype.kind == "u":
                value = value.astype(int)
            RO.r.assign(name, value)
        elif isinstance(value, pd.DataFrame):
            # FIXME: does not always work well for pd.DataFrame
            RO.r.assign(name, value)
        elif value is None:
            RO.r.assign(name, RI.NULL)
        else:
            raise ValueError(
                "Saving ``{}`` to RDS file is not supported!".format(
                    str(type(value))))

    #
    def assign_dict(name, value):
        RO.r('%s <- list()' % name)
        for k, v in value.items():
            k = re.sub(r'[^\w' + '_.' + ']', '_', str(k))
            if isinstance(v, collections.abc.Mapping):
                assign_dict('%s$%s' % (name, k), v)
            else:
                assign('item', v)
                RO.r('%s$%s <- item' % (name, k))

    #
    if isinstance(data, collections.abc.Mapping):
        assign_dict('res', data)
    else:
        assign('res', data)
    RO.r("saveRDS(res, '%s')" % filename)
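# Usage sketch for save_rds() (illustrative): nested dicts become named R
# lists, so the structure written below can be read back in R with
# readRDS('demo.rds') and accessed as res$meta$name.
import numpy as np

save_rds({'counts': np.arange(6).reshape(2, 3),
          'meta': {'name': 'demo', 'ok': 1}},
         'demo.rds')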
class SampleHeatmap(RnaseqqcTracker):
    table = "sailfish_transcripts"
    py2ri.activate()

    def getTracks(self, subset=None):
        return ("all")

    def getCorrelations(self, dataframe):
        '''
        Compute pairwise sample correlations from a dataframe of
        expression values.

        Arguments
        ---------
        dataframe: pandas.DataFrame
            a dataframe containing gene IDs, sample IDs and gene
            expression values

        Returns
        -------
        corr_frame: pandas.DataFrame
            a pairwise Pearson correlation matrix across samples
        '''
        # set sample_id to index
        pivot = dataframe.pivot(index="sample_name",
                                columns="transcript_id",
                                values="TPM")
        transpose = pivot.T
        # why do I have to resort to R????
        r_df = py2ri.py2ri_pandasdataframe(transpose)
        R.assign("p.df", r_df)
        R('''p.mat <- apply(p.df, 2, as.numeric)''')
        R('''cor.df <- cor(p.mat)''')
        r_cor = R["cor.df"]
        py_cor = py2ri.ri2py_dataframe(r_cor)
        corr_frame = py_cor

        return corr_frame

    def getFactors(self, dataframe):
        '''Get factor/experimental design levels from table'''
        statement = ("SELECT factor_value, sample_name, factor "
                     "FROM factors AS f "
                     "JOIN samples AS s "
                     "ON f.sample_id = s.id "
                     "WHERE factor != 'genome'")
        factor_df = self.getDataFrame(statement)
        merged = pd.merge(dataframe,
                          factor_df,
                          left_index=True,
                          right_on="sample_name",
                          how='outer')
        return merged

    def __call__(self, track, slice=None):
        statement = ("SELECT s.sample_name, t.transcript_id, t.TPM "
                     "FROM %(table)s AS t, samples AS s "
                     "WHERE transcript_id != 'Transcript' "
                     "AND t.sample_id = s.id")
        df = self.getDataFrame(statement)
        mdf = self.getCorrelations(df)
        # pivot sorts sample names, so label the correlation matrix with a
        # sorted list rather than an unordered set
        sample_names = sorted(set(df["sample_name"]))
        mdf.columns = sample_names
        mdf.index = sample_names

        all_df = self.getFactors(mdf)
        return all_df.set_index("factor")
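# Note on getCorrelations() above: the round trip through R only computes a
# Pearson correlation matrix, which pandas can do directly. A minimal
# pure-pandas equivalent sketch (DataFrame.corr() defaults to Pearson,
# matching R's cor()):
def get_correlations_pandas(dataframe):
    """Pairwise sample Pearson correlations, without leaving Python."""
    pivot = dataframe.pivot(index="sample_name",
                            columns="transcript_id",
                            values="TPM")
    # columns of pivot.T are samples, so corr() returns a sample x sample
    # matrix, labelled and ordered like the pivot index
    return pivot.T.astype(float).corr()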
def nnd_hotdeck_using_rpy2(receiver=None,
                           donor=None,
                           matching_variables=None,
                           z_variables=None,
                           donor_classes=None):
    import pandas as pd
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    assert receiver is not None and donor is not None
    assert matching_variables is not None

    pandas2ri.activate()
    StatMatch = importr("StatMatch")

    if isinstance(donor_classes, str):
        assert donor_classes in receiver, 'Donor class not present in receiver'
        assert donor_classes in donor, 'Donor class not present in donor'

    try:
        if donor_classes:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
                don_class=pd.Series(donor_classes))
        else:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
                # don_class = pd.Series(donor_classes)
            )
    except Exception as e:
        print('NND_hotdeck failed for the following inputs:')
        print('receiver:')
        print(receiver)
        print('donor:')
        print(donor)
        print('matching variables:')
        print(pd.Series(matching_variables))
        print(e)
        raise

    # create synthetic data set, without the
    # duplication of the matching variables
    fused_0 = pandas2ri.ri2py(
        StatMatch.create_fused(data_rec=receiver,
                               data_don=donor,
                               mtc_ids=out_NND[0],
                               z_vars=pd.Series(z_variables)))

    # create synthetic data set, with the "duplication"
    # of the matching variables
    fused_1 = pandas2ri.ri2py(
        StatMatch.create_fused(data_rec=receiver,
                               data_don=donor,
                               mtc_ids=out_NND[0],
                               z_vars=pd.Series(z_variables),
                               dup_x=True,
                               match_vars=pd.Series(matching_variables)))

    return fused_0, fused_1
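# Usage sketch (illustrative). receiver and donor are hypothetical toy
# dataframes that share the matching variables; z_variables exist only in
# the donor and are imputed onto the receiver by nearest-neighbour distance
# hot deck.
import pandas as pd

receiver = pd.DataFrame({'age': [25, 40, 61], 'sex': [0, 1, 1]})
donor = pd.DataFrame({'age': [23, 45, 60], 'sex': [0, 1, 1],
                      'income': [21000, 38000, 52000]})
fused_no_dup, fused_dup = nnd_hotdeck_using_rpy2(
    receiver=receiver,
    donor=donor,
    matching_variables=['age', 'sex'],
    z_variables=['income'])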
def granger_causality(data,
                      cols,
                      y_var,
                      lags,
                      our_type,
                      list_subcausalities=False):
    y_subset = data[y_var]
    pandas2ri.activate()
    data = pandas2ri.py2ri(data)

    ### Define the R helper functions
    robjects.r('''
    is.installed <- function(mypkg){
        is.element(mypkg, installed.packages()[,1])
    }
    # check if package "gtools" is installed
    if (!is.installed("gtools")){
        install.packages("gtools", INSTALL_opts = '--no-lock',
                         repos='https://cloud.r-project.org')
    }
    if (!is.installed("vars")){
        install.packages("vars", INSTALL_opts = '--no-lock',
                         repos='https://cloud.r-project.org')
    }
    library("gtools")
    library("vars")
    for (k in .libPaths()){
        k <- paste0(k,"/00LOCK")
        unlink(k, recursive = TRUE)
    }

    get_p_value <- function(data,lags,y_values,causes,our_type){
        data <- as.data.frame(data)
        mycols <- c(as.character(unlist(causes)))
        mydata <- data[c(as.character(unlist(causes)))]
        mydata <- as.data.frame(mydata)
        mydata <- cbind(Temperatures = y_values,mydata)
        var.2c <- VAR(mydata, p = lags, type = our_type)
        ### In this case, we are using trended Granger causality
        my_vcov <- vcovHC(var.2c)
        mycause <- causality(var.2c, cause = mycols)
        return(c(mycause$Granger$p.value))
    }

    permuts <- function(data,order,y,columns,our_type){
        list_perms <- do.call("c", lapply(seq_along(columns),
                              function(i) combn(columns, i, FUN = list)))
        d <- data.frame(x = NA, y = 1:length(list_perms))
        i <- 1
        columns <- unlist(columns)
        while (i<=length(list_perms)){
            myp <- get_p_value(data,order,y,list_perms[i][[1]],
                               our_type = our_type)
            d[i,] <- c(toString(unlist(list_perms[i][[1]])),as.numeric(myp))
            i <- i + 1
        }
        colnames(d) <- c("Sets of variables","p-value")
        d$`p-value` <- as.numeric(d$`p-value`)
        return(d)
        #return(.libPaths())
        #return(unlist(list_perms[i-1][[1]]))
    }
    ''')

    r_f = robjects.globalenv['get_p_value']
    permuts = robjects.globalenv['permuts']
    robjects.r.library("vars")
    our_causes = robjects.r('as.data.frame')(cols)

    if list_subcausalities:
        mydf = permuts(data, lags, robjects.Vector(y_subset), our_causes,
                       our_type)
        return mydf
    return r_f(data, lags, robjects.Vector(y_subset), our_causes, our_type)
def deseqNormalize(infile, time_points, reps, conditions=None):
    '''
    Library size normalisation and variance stabilizing transformation
    of timeseries RNA-seq data

    :param infile: count table from NGS-seq experiment
    :type infile: str
    :param time_points: time point labels
    :type time_points: str list
    :param reps: replicates labels
    :type reps: str list
    :param conditions: if multiple experimental conditions
    are to be normalised at the same time
    :type conditions: str list
    '''
    # MM: NB - this should be split into separate library size
    # normalisation and VST transformations
    # maybe add in different transformation options.

    # load library
    R('''suppressMessages(library("DESeq"))''')

    # generate lists for the design data frame of the proper length.
    # These need to be rpy2 objects to be parsed properly in the
    # string formatting
    E.info("converting to pandas dataframe object")

    if infile.split(".")[-1] == "gz":
        comp = "gzip"
    else:
        comp = None

    data_frame = pd.read_table(infile,
                               index_col=0,
                               header=0,
                               sep="\t",
                               compression=comp)
    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(data_frame)

    if not conditions:
        time_rep_comb = [x for x in itertools.product(time_points, reps)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])

        R.assign('countsTable', rdf)
        R('''design <- data.frame(row.names=colnames(countsTable),'''
          '''times=%s, replicates=%s)''' % (time_cond.r_repr(),
                                            rep_cond.r_repr()))
    elif conditions:
        design_dict = {}
        for x in data_frame.columns.values:
            sample_dict = {}
            sample_dict['condition'] = str(x).split(".")[0]
            sample_dict['times'] = int(str(x).split(".")[1])
            sample_dict['replicates'] = str(x).split(".")[2]
            design_dict[x] = sample_dict
        design_frame = pd.DataFrame(design_dict)
        design_frame = design_frame.T

        des_cond = design_frame['condition'].values.tolist()
        des_time = design_frame['times'].values.tolist()
        des_reps = design_frame['replicates'].values.tolist()

        cond_cond = ro.StrVector([x for x in des_cond])
        time_cond = ro.StrVector([x for x in des_time])
        rep_cond = ro.StrVector([x for x in des_reps])

        R.assign('countsTable', rdf)
        R.assign('design', design_frame)

    # create the count data set and normalize to library size
    # transform with variance stabilizing transformation
    # only keep genes with an average of more than one read mapping
    E.info("calculating size factors and dispersion")
    R('''notZero <- (rowMeans(countsTable) > 1)''')
    R('''cds <- newCountDataSet(countsTable[notZero, ], design)''')
    R('''cds_size <- estimateSizeFactors(cds)''')
    R('''cds_disp <- estimateDispersions(cds_size, method="blind")''')

    E.info("applying variance stabilizing transformation")
    R('''vst <- varianceStabilizingTransformation(cds_disp)''')

    # format data set to long format with condition and replicate labels
    # convert to a numpy array
    R('''replicates <- c(%s)''' % rep_cond.r_repr())
    R('''times <- c(%s)''' % time_cond.r_repr())
    if conditions:
        R('''conditions <- c(%s)''' % cond_cond.r_repr())
        R('''trans_vst = data.frame(t(exprs(vst)), '''
          '''times, replicates, conditions)''')
    else:
        R('''trans_vst = data.frame(t(exprs(vst)), times, replicates)''')

    # load data and convert to pandas object
    data_file = pandas2ri.ri2py(R["trans_vst"])

    return data_file
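# Usage sketch (illustrative). 'counts.tsv.gz' is a hypothetical count table.
# With conditions=None the design is the Cartesian product of time_points and
# reps, so the table's column order must follow that ordering; with
# conditions set, the columns must be named condition.time.replicate.
vst_long = deseqNormalize('counts.tsv.gz',
                          time_points=['0', '1', '3', '6', '12'],
                          reps=['R1', 'R2', 'R3'])
# Returns a samples x (genes + times + replicates) dataframe of
# variance-stabilized expression values.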
def call_fitter( site_inputs_training, y_training, site_inputs_validation, hprm, ): assert y_training.ndim == 1 path_R_files = os.path.join( paths.outputs, 'R_files/', ) os.makedirs( path_R_files, exist_ok=True, ) ### Data data_training = { **{ simplify_inpt_name(inpt, trsfm, prm, location): site_inputs_training[inpt, trsfm, prm, location].values for inpt, trsfm, prm, location in site_inputs_training }, 'target': y_training.values, } data_validation = { simplify_inpt_name(inpt, trsfm, prm, location): site_inputs_validation[inpt, trsfm, prm, location].values for inpt, trsfm, prm, location in site_inputs_validation } # Convert arrays pandas2ri.activate() df_train = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_training)) df_test = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_validation)) pandas2ri.deactivate() # Save converted files r.assign("data_train", df_train) r("save(data_train, file='{0}/temp_dat_for_r_train.gzip', compress=TRUE)". format(path_R_files)) r.assign("data_test", df_test) r("save(data_test, file='{0}/temp_dat_for_r_test.gzip', compress=TRUE)". format(path_R_files)) nb_unique = {k: len(np.unique(v)) for k, v in site_inputs_training.items()} string_formula = make_gam_formula( site_inputs_training.columns, nb_unique, hprm, ) ### Launch the R script path2script = os.path.join( os.path.dirname(__file__), 'load_fit_predict_savePredictions.R', ) args = [string_formula, path_R_files] cmd = ['Rscript', path2script] + args # Python will quote what must be quoted in subprocess.check_output print('launch Rscript') x = subprocess.check_output(cmd, universal_newlines=True) print(x) y_hat_training = r['read.table']( "{0}/predictions_from_r_train.gzip".format(path_R_files)) y_hat_training = pandas2ri.rpy2py(y_hat_training) y_hat_training = y_hat_training.values y_hat_validation = r['read.table']( "{0}/predictions_from_r_test.gzip".format(path_R_files)) y_hat_validation = pandas2ri.rpy2py(y_hat_validation) y_hat_validation = y_hat_validation.values return y_hat_training, y_hat_validation
def covarFilter(infile, time_points, replicates, quantile): ''' Filter gene list based on the distribution of the sums of the covariance of each gene. This is highly recommended to reduce the total number of genes used in the dynamic time warping clustering to reduce the computational time. The threshold is placed at the intersection of the expected and observed value for the given quantile. ''' time_points.sort() time_rep_comb = [x for x in itertools.product(time_points, replicates)] time_cond = ro.StrVector([x[0] for x in time_rep_comb]) rep_cond = ro.StrVector([x[1] for x in time_rep_comb]) df = pd.read_table(infile, sep="\t", header=0, index_col=0) df.drop(['replicates'], inplace=True, axis=1) df.drop(['times'], inplace=True, axis=1) df = df.fillna(0.0) # convert data frame and import into R namespace # py2ri requires activation pandas2ri.activate() R.assign('diff_data', pandas2ri.py2ri(df)) E.info("loading data frame") # need to be careful about column headers and transposing data frames R('''trans_data <- data.frame(diff_data)''') R('''times <- c(%s)''' % time_cond.r_repr()) R('''replicates <- c(%s)''' % rep_cond.r_repr()) # calculate the covariance matrix for all genes # sum each gene's covariance vector E.info("calculating sum of covariance of expression") R('''covar.mat <- abs(cov(trans_data))''') R('''sum.covar <- rowSums(covar.mat)''') R('''exp.covar <- abs(qnorm(ppoints(sum.covar),''' '''mean=mean(sum.covar), sd=sd(sum.covar)))''') R('''sum.covar.quant <- quantile(sum.covar)''') R('''exp.covar.quant <- quantile(exp.covar)''') E.info("filter on quantile") R('''filtered_genes <- names(sum.covar[sum.covar > ''' '''sum.covar.quant[%(quantile)i]''' ''' & sum.covar > exp.covar.quant[%(quantile)i]])''' % locals()) R('''filtered_frame <- data.frame(diff_data[, filtered_genes],''' '''times, replicates)''') # load data and convert to pandas object filtered_frame = pandas2ri.ri2py(R["filtered_frame"]).T return filtered_frame
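# Usage sketch (illustrative). `quantile` indexes R's default quantile()
# output (1=0%, 2=25%, 3=50%, 4=75%, 5=100%), so quantile=4 keeps genes whose
# summed covariance exceeds both the observed and the expected 75th
# percentile. 'vst_expression.tsv' is a hypothetical input file in the long
# format produced by deseqNormalize() above.
filtered = covarFilter('vst_expression.tsv',
                       time_points=['0', '1', '3', '6', '12'],
                       replicates=['R1', 'R2', 'R3'],
                       quantile=4)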
import numpy as np
import pandas as pd
from rpy2.robjects import r, pandas2ri
import datetime as dt
from sklearn import preprocessing as prep

# set directory
import os
os.chdir('/Users/valeriebradley/github/libdems/projection/')

pandas2ri.activate()  # to translate R objects into pandas dataframes


def recode_covar_data():
    data_path = '/Users/valeriebradley/Documents/LibDems/data/'
    r['load'](data_path + "model_sample_data_cleaned.RData")
    dr_covars = r.dr_covars
    dr_covars['VANID'] = dr_covars['VANID'].astype(int)
    dr_covars.set_index('VANID', inplace=True)

    # calculate registration timing
    eday = '2017-06-08'
    dr_covars['year_reg'] = [
        dt.datetime.strptime(date, '%Y-%m-%d').date().year
        for date in dr_covars['DATE_OF_UPDATE'].values
    ]
    dr_covars['wks_from_reg_to_eday'] = [
        (dt.datetime.strptime(eday, '%Y-%m-%d') -
         dt.datetime.strptime(date, '%Y-%m-%d')) / dt.timedelta(days=1)
        for date in dr_covars['DATE_OF_UPDATE'].values
    ]
import os
import sys
import itertools
import gzip
import numpy as np
from Bio import SeqIO
import argparse
import subprocess
import pandas as pd
from collections import OrderedDict
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, Formula, FloatVector
from rpy2.rinterface import RRuntimeError
import math

pandas2ri.activate()  # allow conversion between pandas and R dataframes

# define R packages
nlme = importr('nlme')
base = importr('base')
stats = importr('stats')
#qv = importr('qvalue')

# define formulae
fmla = Formula('value ~ 1 + cond1')
rndm = Formula('~ 1 | samples')
nullfmla = Formula('value ~ 1')
nullrndm = Formula('~1 | samples')

samps = [
    'samp1A', 'samp1B', 'samp1C', 'samp1D', 'samp2A', 'samp2B', 'samp2C',
def _infer_network(self, data): """ Infer the network. Args: data (pd.DataFrame): data to be used for the inference. """ # quantization step with optimal k based on BIC quantized = data.apply(lambda column: k_means_vector_quantization( column.values.reshape(-1, 1), k_min=self.k_min, k_max=self.k_max, k_step=self.k_step, **self.parameters), axis=0) entities = data.columns number_of_entities = len(entities) # activate implicit conversion from pandas to R objects pandas2ri.activate() fun_chisq = importr('FunChisq') # preparing variables to pass to FunChisq independent_variables = None dependent_variables = np.array([], dtype=int) ne_range = range(1, number_of_entities + 1) for index, entity in enumerate(entities): r_index = index + 1 dependent_variables = np.hstack([ dependent_variables, np.array( list(filter(lambda e_index: e_index != r_index, ne_range))) ]) if independent_variables is None: independent_variables = np.vstack( [np.array(r_index) for _ in range(number_of_entities - 1)]) else: independent_variables = np.vstack([ independent_variables, np.vstack([ np.array(r_index) for _ in range(number_of_entities - 1) ]) ]) # running FunChisq interactions = ro.conversion.rpy2py( fun_chisq.test_interactions(quantized.T.values, list(independent_variables), pd.Series(dependent_variables), entities.values)) # test correction if self.correction in CORRECTIONS: significants = CORRECTIONS[self.correction]( interactions['p.value'], self.confidence_threshold) interactions = interactions.iloc[significants] interactions.columns = [ 'gene1', 'gene2', 'p-value', 'statistic', 'estimate' ] # if undirected keep only interaction with higher importance if # both directions are significant if self.undirected is True: interactions = interactions.apply( lambda row: pd.Series(sort_interaction_entities(row)), axis=1) interactions.columns = [ 'gene1', 'gene2', 'p-value', 'statistic', 'estimate' ] interactions['grouping'] = [ '{}_{}'.format(*sorted(pair)) for pair in zip(interactions['gene1'], interactions['gene2']) ] selected_interactions = interactions.groupby([ 'grouping' ])['p-value'].transform(min) == interactions['p-value'] interactions = interactions[selected_interactions] # prepare the interactions interactions = interactions[['gene1', 'gene2', 'statistic']] interactions.columns = ['e1', 'e2', 'intensity'] self.graph = InteractionTable(df=interactions).to_graph( undirected=self.undirected) logger.debug('inferred with {}'.format(self.method))