def get_IRIS_package_versions(logger):
    """
    Compare installed and CRAN versions of the IRIS R packages used in ISPAQ.

    The embedded R session is asked for the installed version, the version
    available from the repository, and the list of outdated packages.

    :param logger: accepted for interface compatibility; not used in this function
    :return: pandas DataFrame with columns 'package', 'installed', 'CRAN'
             and 'upgrade' (True when R lists the package as outdated)
    """
    IRIS_packages = ['seismicRoll', 'IRISSeismic', 'IRISMustangMetrics']

    def _r_vector_as_list(r_expression):
        # Evaluate an R expression and convert the resulting vector to a list
        return pandas2ri.ri2py(robjects.r(r_expression)).tolist()

    # Versions that are installed locally and versions available for download
    installed_versions = _r_vector_as_list(
        "installed.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")
    cran_versions = _r_vector_as_list(
        "available.packages()[c('seismicRoll','IRISSeismic','IRISMustangMetrics'),'Version']")

    # Names of installed packages for which R reports a newer version
    outdated = _r_vector_as_list("old.packages()[,'Package']")
    needs_upgrade = [name in outdated for name in IRIS_packages]

    table = pd.DataFrame({'package': IRIS_packages,
                          'installed': installed_versions,
                          'CRAN': cran_versions,
                          'upgrade': needs_upgrade})

    # Select the columns explicitly so their order does not depend on the dict
    return table[['package', 'installed', 'CRAN', 'upgrade']]
def load_data() -> pd.DataFrame:
    """Load the ``chredlin`` dataset from the R package ``faraway``.

    Returns:
        The dataset as a pandas DataFrame indexed by the R row names, with an
        added ``log_income`` column holding the natural log of ``income``.
    """
    importr('faraway')
    r.data('chredlin')

    frame = pandas2ri.ri2py(r.chredlin)
    row_labels = pandas2ri.ri2py(r.chredlin.rownames)

    frame = frame.set_index(row_labels)
    frame['log_income'] = np.log(frame['income'])
    return frame
def apply_PSD_metric(r_stream, *args, **kwargs):
    """
    Run IRISMustangMetrics::PSDMetric and convert its results to pandas objects.

    :param r_stream: an r_stream object
    :param args: extra positional arguments forwarded to the R function
    :param kwargs: extra keyword arguments forwarded to the R function
    :return: tuple (metrics dataframe, corrected PSD dataframe, PDF)
    """
    psd_metric = robjects.r('IRISMustangMetrics::PSDMetric')
    # args and kwargs shouldn't be needed in theory
    r_results = psd_metric(r_stream, *args, **kwargs)

    # Element 0: list of metrics, flattened into a dataframe
    df = pandas2ri.ri2py(_R_metricList2DF(r_results[0]))
    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)

    # TODO: What to do about the list of spectraMetrics (element 1)?
    # TODO: We would need a new R_spectrumMetricList2DF function to process this further.

    # Element 2: correctedPSD is returned as a dataframe
    correctedPSD = pandas2ri.ri2py(r_results[2])
    # Convert columns from R POSIXct to python UTCDateTime
    correctedPSD.starttime = correctedPSD.starttime.apply(UTCDateTime)
    correctedPSD.endtime = correctedPSD.endtime.apply(UTCDateTime)

    # Element 3: PDF
    PDF = pandas2ri.ri2py(r_results[3])

    return (df, correctedPSD, PDF)
def convert_fit_to_python(self, fit):
    """Extract the coefficients of an R model fit as a pandas Series.

    :param fit: R fit object whose ``coefficients`` element (read with
        ``rx2``) is a named vector
    :return: pd.Series mapping coefficient name to coefficient value
    """
    coeffs_r = fit.rx2('coefficients')
    coeffs = pandas2ri.ri2py(coeffs_r)
    coeff_names = pandas2ri.ri2py(coeffs_r.names).tolist()
    # Only the coefficients are returned, so the fitted values are not
    # converted: that conversion was pure overhead with no consumer.
    return pd.Series(dict(zip(coeff_names, coeffs)))
def race_predict(df):
    """Run wru::predict_race at county and at tract level and print both results.

    :param df: voter-file DataFrame; must provide at least the 'county' and
        'age' columns used here (other columns are consumed by the R package)
    :return: None; the converted prediction tables are printed

    NOTE(review): ``census_key`` is read from the enclosing module scope.
    """
    # todo: why are there missing counties?
    # `county == county` is False for NaN, so rows with a missing county go too
    voters = df.query('(county != "None") and (county == county)')
    voters.set_index([[1]], inplace=True)

    pandas2ri.activate()
    wru = importr('wru')  # https://github.com/kosukeimai/wru

    voters['age'] = voters['age'].apply(lambda years: round(years))

    census_files = (
        ('county', 'data_files/census_data_all_states_county.pkl'),
        ('tract', 'data_files/census_data_all_states_tract.pkl'),
    )
    for geo_level, pickle_path in census_files:
        census_data = joblib.load(pickle_path)
        predicted = wru.predict_race(voter_file=voters, census_geo=geo_level,
                                     census_key=census_key, sex=True, age=True,
                                     census_data=census_data)
        print(pandas2ri.ri2py(predicted))
def test_sum_stats_save_load(history: History):
    """Summary statistics of mixed types must survive a store/load round trip."""
    arr = sp.random.rand(10)
    arr2 = sp.random.rand(10, 2)

    # TODO: check why iris fails
    first_stats = {"ss1": .1, "ss2": arr2, "ss3": example_df(),
                   "rdf0": r["faithful"]}
    second_stats = {"ss12": .11, "ss22": arr, "ss33": example_df(),
                    "rdf": r["mtcars"]}

    particle_list = [
        Particle(m=0,
                 parameter=Parameter({"a": 23, "b": 12}),
                 weight=.2,
                 accepted_sum_stats=[stats],
                 accepted_distances=[.1])
        for stats in (first_stats, second_stats)
    ]
    history.append_population(0, 42, Population(particle_list), 2,
                              ["m1", "m2"])

    weights, sum_stats = history.get_weighted_sum_stats_for_model(0, 0)
    assert (weights == 0.5).all()

    loaded_first = sum_stats[0]
    assert loaded_first["ss1"] == .1
    assert (loaded_first["ss2"] == arr2).all()
    assert (loaded_first["ss3"] == example_df()).all().all()
    assert (loaded_first["rdf0"] == pandas2ri.ri2py(r["faithful"])).all().all()

    loaded_second = sum_stats[1]
    assert loaded_second["ss12"] == .11
    assert (loaded_second["ss22"] == arr).all()
    assert (loaded_second["ss33"] == example_df()).all().all()
    assert (loaded_second["rdf"] == pandas2ri.ri2py(r["mtcars"])).all().all()
def read_data(self, data=None, df_name=None):
    """Populate ``self.data`` and set ``self.data_type`` accordingly.

    With ``data`` omitted, the source named by ``self.data_source`` is read
    according to ``self.data_type``:

    * ``'dataframe'`` -- read as CSV with pandas;
    * ``'Rdata'``     -- loaded into R, then the first data frame found under
      ``df_name`` or one of the fallback names is converted to pandas
      (``self.data_type`` becomes ``'dataframe'``);
    * ``'json'``      -- ``self.data`` becomes the list of ``*.json`` file
      paths found under every directory in ``self.data_source``.

    With ``data`` given, it is stored and its kind is inferred from its type.

    :param data: optional pandas DataFrame, list, or rpy2 DataFrame
    :param df_name: name of the data frame inside the Rdata file; defaults
        to ``self.df_name``
    :raises LookupError: no candidate data frame exists in the Rdata file
    :raises ValueError: unsupported ``self.data_type`` or type of ``data``
    """
    if df_name is None:
        df_name = self.df_name

    if data is None:
        if self.data_type == 'dataframe':
            self.data = pd.read_csv(self.data_source)
        elif self.data_type == 'Rdata' and rpy2_imported:
            robjects.r['load'](self.data_source)
            self.data = None
            # Try the caller's name first, then the built-in fallbacks. A
            # separate loop variable keeps the requested df_name intact.
            for candidate in [df_name, "vz", "vcdb", "healthcare"]:
                if candidate is None:
                    continue
                try:
                    self.data = pandas2ri.ri2py(robjects.r[candidate])
                except LookupError:
                    continue
                self.data_type = "dataframe"
                break
            if self.data is None:
                raise LookupError("Could not find dataframe name in Rdata file. please specify with df_name=<name>")
        elif self.data_type == 'json':
            self.data = []
            for path in self.data_source:
                self.data += [os.path.join(dirpath, f)
                              for dirpath, dirnames, files in os.walk(path)
                              for f in files if f.endswith('.json')]
        else:
            raise ValueError("Data type not supported. If datatype is Rdata, please make sure rpy2 loaded correctly.")
    else:
        self.data = data
        if type(data) is pd.core.frame.DataFrame:
            self.data_type = 'dataframe'
        elif type(data) is list:
            self.data_type = 'json'
        # Check the flag first: the name `rpy2` only exists when the import
        # succeeded, and the intended outcome otherwise is the ValueError.
        elif rpy2_imported and type(data) is rpy2.robjects.vectors.DataFrame:
            self.data = pandas2ri.ri2py(self.data)
            self.data_type = 'dataframe'
        else:
            raise ValueError("type of data boject is unrecognized. If data object is Rdata, please make sure rpy2 loaded correctly.")
def items(self, n_rows_cached=100, include_rsid=None):
    """
    Retrieve generator of variants, one by one. Although variants are returned
    in the order as they are stored in the BGEN file, when there are variants
    with the same positions their order is not guaranteed.

    :param n_rows_cached: number of index rows fetched from the .bgi index
        (and therefore loaded from the BGEN file) per batch
    :param include_rsid: optional iterable of rsids; when given, only these
        variants are loaded, otherwise each batch is loaded by position range
    :return: generator yielding one pandas Series per variant, with the
        'chromosome' entry renamed to an integer 'chr' and a 'dosages' entry
        computed as the probabilities weighted by [0, 1, 2]
    """
    # retrieve positions
    if include_rsid is not None:
        # Double embedded single quotes so that no rsid value can terminate
        # the SQL string literal it is placed in.
        quoted_rsids = ', '.join(
            "'{}'".format('{}'.format(x).replace("'", "''"))
            for x in include_rsid)
        stm = 'select distinct rsid, position from Variant where rsid in ({}) order by file_start_position asc'.format(
            quoted_rsids)
    else:
        stm = 'select distinct rsid, position from Variant order by file_start_position asc'

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it has to be closed explicitly to release the index file.
    conn = sqlite3.connect(self.bgi_path)
    try:
        cur = conn.cursor()
        cur.execute(stm)

        iteration = 0
        while True:
            if iteration > 0:
                # Drop the previous batch before loading the next one
                # (presumably to bound memory held on the R side).
                cached_data_struct = cached_data.__sexp__
                del cached_data
                del cached_data_struct
                gc.collect()

            positions = cur.fetchmany(size=n_rows_cached)
            if not positions:
                break

            rsids = [x[0] for x in positions]
            positions = [x[1] for x in positions]

            if include_rsid is None:
                ranges = pd.DataFrame({
                    'chromosome': [self.chr_number],
                    'start': [positions[0]],
                    'end': [positions[-1]],
                })
                cached_data = self.rbgen.bgen_load(self.bgen_path, ranges)
            else:
                cached_data = self.rbgen.bgen_load(self.bgen_path, rsids=StrVector(rsids))

            all_variants = pandas2ri.ri2py(cached_data[0])
            all_probs = pandas2ri.ri2py(cached_data[4])

            iteration += 1

            for row_idx, (rsid, row) in enumerate(all_variants.iterrows()):
                dosage_row = row.rename({'chromosome': 'chr'})
                dosage_row['chr'] = int(dosage_row.chr)
                dosage_row['dosages'] = np.dot(all_probs[row_idx, :, :], [0, 1, 2])

                yield dosage_row
    finally:
        conn.close()
def extract_dataframe_from_R(dataframe_name):
    """Copy an object from the R session into a labelled pandas DataFrame.

    :param dataframe_name: name (R expression) of the object in the R session
    :return: DataFrame holding the object's values, indexed by its R row
        names, with its R column names converted to float32 as column labels
    """
    def _evaluate(expression):
        # Run the expression in R and convert the result to a Python object
        return pandas2ri.ri2py(r(expression))

    values = _evaluate(dataframe_name)
    row_labels = _evaluate("rownames(" + dataframe_name + ")")
    column_labels = np.float32(_evaluate("colnames(" + dataframe_name + ")"))

    return pd.DataFrame(data=values, columns=column_labels, index=row_labels)
def fit_and_predict(self, train, horizon):
    """Fit a Holt-Winters model in R and forecast ``horizon`` steps ahead.

    :param train: training series handed to R's ``ts`` (converted by the
        activated pandas2ri conversion)
    :param horizon: number of steps to forecast
    :return: tuple (fit, forecast) -- the R data frames of fitted values and
        of the forecast, converted with ``pandas2ri.ri2py``
    """
    r_string = """
        function(data, frequency, horizon){
            library(forecast)
            ts_data <- ts(data, frequency=frequency)
            fit <- HoltWinters(ts_data)
            fitted_df <- data.frame(fit$fitted)
            forecast <- forecast(fit, h = horizon)
            forecast_df <- data.frame(forecast)
            output <- list(fitted_df, forecast_df)
            return(output)
        }
    """
    r_func = robjects.r(r_string)

    # Run R. The conversion is switched on globally, so it has to be switched
    # off again even when the R call or the conversion raises.
    pandas2ri.activate()
    try:
        output_list = r_func(train, self.frequency, horizon)
        fit = pandas2ri.ri2py(output_list[0])
        forecast = pandas2ri.ri2py(output_list[1])
    finally:
        pandas2ri.deactivate()

    return fit, forecast
def predict(self, xtest):
    """Predict the class of every row of ``xtest`` with the fitted R model.

    The original author describes the prediction as a majority vote.

    Parameters
    ----------
    xtest : pd.DataFrame
        features for test set

    Returns
    -------
    pd.Series
        predicted class per row of ``xtest`` (aligned to ``xtest.index``),
        shifted down by one so that class numbers start at 0
    """
    # Pick the pandas -> R converter that matches the installed pandas
    to_r = pandas2ri.py2ri if new_pandas_flag else com.convert_to_r_dataframe
    pred = self.rf_pred(self.rf, to_r(xtest))

    if new_pandas_flag:
        # Element 1 holds the row labels, element 0 the predicted classes
        genes = pandas2ri.ri2py(pred[1])
        pred_class = pandas2ri.ri2py(pred[0])
    else:
        genes, pred_class = zip(*com.convert_robj(pred).items())

    result = pd.DataFrame({'pred_class': pred_class}, index=genes)
    result = result.reindex(xtest.index)
    result -= 1  # for some reason the class numbers start at 1
    return result['pred_class']
def fit_and_predict(self, train, horizon):
    """Fit a TBATS model in R and forecast ``horizon`` steps ahead.

    ``self.frequency`` is passed as an integer vector: with a single value
    the series is built with ``ts``, with several values it is built with
    ``msts`` using them as seasonal periods.

    :param train: training series handed to R (converted by the activated
        pandas2ri conversion)
    :param horizon: number of steps to forecast
    :return: tuple (fit, forecast) -- the R data frames of fitted values and
        of the forecast, converted with ``pandas2ri.ri2py``
    """
    r_string = """
        function(data, frequency, horizon){
            library(forecast)
            if(length(frequency) == 1){
                ts_data <- ts(data, frequency=frequency)
            }else{
                ts_data <- msts(data, seasonal.periods=frequency)
            }
            fit <- tbats(ts_data)
            fitted_df <- data.frame(fit$fitted.values)
            forecast <- forecast(fit, h = horizon)
            forecast_df <- data.frame(forecast)
            output <- list(fitted_df, forecast_df)
            return(output)
        }
    """
    r_func = robjects.r(r_string)

    # The conversion is switched on globally, so it has to be switched off
    # again even when the R call or the conversion raises.
    pandas2ri.activate()
    try:
        output_list = r_func(train, robjects.IntVector(self.frequency), horizon)
        fit = pandas2ri.ri2py(output_list[0])
        forecast = pandas2ri.ri2py(output_list[1])
    finally:
        pandas2ri.deactivate()

    return fit, forecast
def read_spss_to_df(self):
    """Read the SPSS file at ``self._file_path`` through R.

    The file is read with R's ``foreign::read.spss``; as a side effect R also
    writes a CSV copy next to the input (same path with the last four
    characters replaced by ``.csv``).

    Input  -> NULL
    Output -> tuple of a pandas DataFrame and a numpy array with the variable
              labels (i.e. feature descriptions), both reduced to ASCII

    NOTE: written for Python 2 (relies on the ``unicode`` builtin).
    """
    from rpy2.robjects import r
    from string import Template
    from rpy2.robjects import pandas2ri
    import unicodedata

    def _ascii_bytes(text):
        # Decompose accented characters, then drop whatever is not ASCII
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    sav_path = self._file_path  # or "./1 - 110778/110778.sav"
    csv_path = sav_path[:-4] + ".csv"

    r_template = Template('''
        library(foreign)
        library(plyr)
        df <- read.spss ("$origin_file", to.data.frame=TRUE)
        desc <- attr(df,"variable.labels")
        write.csv(df, file="$output_file", na="")
    ''')
    # Substitute the input and output paths, then run in R's global environment
    r(r_template.substitute(origin_file=sav_path, output_file=csv_path))

    # Convert the R data frame and translate unicode cells to ASCII
    df = pandas2ri.ri2py(r('df'))
    df = df.applymap(lambda x: _ascii_bytes(x) if type(x) == unicode else x)

    # Same treatment for the variable labels
    # http://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
    desc = pandas2ri.ri2py(r('desc'))
    for j, label in enumerate(desc):
        if type(label) == np.unicode_:
            desc[j] = str(_ascii_bytes(label))
    desc = desc.astype(np.string_)

    return df, desc
def _predict_one(self, fitted):
    """
    Combine the most recent conditional variances and squared residuals of a
    fit with its coefficients.

    Parameters
    ----------
    fitted
        R fit object exposing the slots 'h.t' and 'residuals'

    Returns
    -------
    The dot product of the fit's coefficients (without "mu", if present)
    with the vector made of the last ``self.garch_lags`` values of 'h.t',
    the last ``self.arch_lags`` squared residuals, and a constant 1 for
    "omega".
    """
    def _last(values, count):
        # values[-count:] would return *all* values for count == 0, so slice
        # from an explicit, non-negative start instead.
        return values[max(len(values) - count, 0):]

    # lagged conditional variances
    h = _last(pandas2ri.ri2py(fitted.slots['h.t']), self.garch_lags)
    h = pd.Series(data=h, index=self.garch_names)

    # lagged squared residuals
    eps = _last(pandas2ri.ri2py(fitted.slots['residuals']), self.arch_lags)**2
    eps = pd.Series(data=eps, index=self.arch_names)

    # 1 to be multiplied with omega
    omega = pd.Series(data=[1], index=["omega"])

    # all together
    data = pd.concat((h, eps, omega))
    res = self.get_coef(fitted).drop("mu", errors="ignore").dot(data)
    return res
def convert_fit_to_python(self, fit):
    """Extract the coefficients of an R model fit as a pandas Series.

    :param fit: R fit object whose ``coefficients`` element (read with
        ``rx2``) is a named vector
    :return: pd.Series mapping coefficient name to coefficient value
    """
    coeffs_r = fit.rx2('coefficients')
    coeffs = pandas2ri.ri2py(coeffs_r)
    coeff_names = pandas2ri.ri2py(coeffs_r.names).tolist()
    # Only the coefficients are returned, so the fitted values are not
    # converted: that conversion was pure overhead with no consumer.
    return pd.Series(dict(zip(coeff_names, coeffs)))
def ref_estimate_cell_counts(input_r_object_dir, algorithm, reference, library, output_csv):
    """Reference based cell type estimates.

    :param input_r_object_dir: directory holding 'QCObjects.rds' (meffil) or
        'RGSet.rds' (minfi / IDOL)
    :param algorithm: 'meffil', 'minfi' or 'IDOL'; IDOL runs on the minfi
        R library
    :param reference: reference passed to the meffil estimator
    :param library: library passed to the IDOL estimator
    :param output_csv: path of the CSV file to write; its parent directory is
        created when missing
    """
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri, numpy2ri
    pandas2ri.activate()
    from pymethylprocess.meffil_functions import est_cell_counts_meffil, est_cell_counts_minfi, est_cell_counts_IDOL

    # A bare file name has no directory part; slicing up to rfind('/') == -1
    # would otherwise create a directory named after the truncated file name.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    read_r_object = robjects.r('readRDS')
    robjects.r('library({})'.format(algorithm if algorithm != 'IDOL' else 'minfi'))

    if algorithm == 'meffil':
        qc_list = read_r_object(join(input_r_object_dir, 'QCObjects.rds'))
        cell_counts = est_cell_counts_meffil(qc_list, reference)
    else:
        rgset = read_r_object(join(input_r_object_dir, 'RGSet.rds'))
        # This branch is only reached for non-meffil algorithms, so the minfi
        # estimator has to be selected by 'minfi'.
        if algorithm == 'minfi':
            cell_counts = est_cell_counts_minfi(rgset)
        else:
            cell_counts = est_cell_counts_IDOL(rgset, library)

    # find where samples intersect
    pandas2ri.ri2py(robjects.r('as.data.frame')(cell_counts)).to_csv(output_csv)
def test_sum_stats_save_load(history: History):
    """Summary statistics of mixed types must survive a store/load round trip."""
    arr = sp.random.rand(10)
    arr2 = sp.random.rand(10, 2)

    first_stats = {"ss1": .1, "ss2": arr2, "ss3": example_df(),
                   "rdf0": r["iris"]}
    second_stats = {"ss12": .11, "ss22": arr, "ss33": example_df(),
                    "rdf": r["mtcars"]}

    particle_list = [
        Particle(0, Parameter({"a": 23, "b": 12}), .2, [.1], [stats], [], True)
        for stats in (first_stats, second_stats)
    ]
    history.append_population(0, 42, Population(particle_list), 2,
                              ["m1", "m2"])

    weights, sum_stats = history.get_sum_stats(0, 0)
    assert (weights == 0.5).all()

    loaded_first = sum_stats[0]
    assert loaded_first["ss1"] == .1
    assert (loaded_first["ss2"] == arr2).all()
    assert (loaded_first["ss3"] == example_df()).all().all()
    assert (loaded_first["rdf0"] == pandas2ri.ri2py(r["iris"])).all().all()

    loaded_second = sum_stats[1]
    assert loaded_second["ss12"] == .11
    assert (loaded_second["ss22"] == arr).all()
    assert (loaded_second["ss33"] == example_df()).all().all()
    assert (loaded_second["rdf"] == pandas2ri.ri2py(r["mtcars"])).all().all()
def plot_qc_metrics(self, output_dir):
    """Plot QC results from ENmix pipeline and possible minfi. Still experimental.

    Writes '<output_dir>/dist.jpg' (a 3x2 grid of beta-value distributions:
    all probes, Infinium I probes, Infinium II probes) and
    '<output_dir>/qcReport.pdf', and issues further ENmix/minfi plot calls.

    Parameters
    ----------
    output_dir
        Where to store plots."""
    self.enmix.plotCtrl(self.RGset)

    grdevice = importr("grDevices")
    geneplotter = importr("geneplotter")
    base = importr('base')

    # Split the beta matrix by probe design type taken from the annotation
    annotation = pandas2ri.ri2py(
        robjects.r['as'](self.minfi.getAnnotation(self.RGset), 'data.frame'))
    beta_py = pandas2ri.ri2py(self.beta)
    beta1 = numpy2ri.py2ri(beta_py[annotation["Type"] == "I"])
    beta2 = numpy2ri.py2ri(beta_py[annotation["Type"] == "II"])

    grdevice.jpeg(output_dir + '/dist.jpg', height=900, width=600)
    base.par(mfrow=robjects.vectors.IntVector([3, 2]))
    self.enmix.multidensity(self.beta, main="Multidensity")
    self.enmix.multifreqpoly(self.beta, xlab="Beta value")
    per_type_panels = (
        (beta1, "Multidensity: Infinium I"),
        (beta2, "Multidensity: Infinium II"),
    )
    for subset, title in per_type_panels:
        self.enmix.multidensity(subset, main=title)
        self.enmix.multifreqpoly(subset, main=title, xlab="Beta value")
    grdevice.dev_off()

    self.minfi.qcReport(self.RGset, pdf="{}/qcReport.pdf".format(output_dir))
    self.minfi.mdsPlot(self.RGset)
    self.minfi.densityPlot(self.RGset, main='Beta', xlab='Beta')
def lmm_analysis(dataframe, filename, output, target):
    """
    Compare a mixed model (random intercept per Tad) against a plain linear
    model of ``target ~ size_tad`` and save the ANOVA table.

    :param dataframe: (pandas DataFrame) data containing the ``target``,
        'size_tad' and 'Tad' columns
    :param filename: (string) prefix of the result file
        '<filename>_glmm_stats.txt' (tab separated)
    :param output: (string) directory receiving the two residual-diagnostic
        PNG files
    :param target: (string) name of the response column; also used in the
        names of the diagnostic files
    """
    r_df = pandas2ri.py2ri(dataframe)

    r_source = """
    require("DHARMa")
    require(lme4)
    require("MASS")

    function(data, name, target){
        mod <- lmer(%(target)s ~ size_tad + (1|Tad), data=data)
        nulmod <- lm(%(target)s ~ size_tad, data=data)
        simulationOutput <- simulateResiduals(fittedModel = mod, n = 250)
        png(paste(name, "/mod_dignostics_%(target)s.png", sep=""), height=1080, width=1920)
        par(mfrow=c(2, 2))
        plot(simulationOutput)
        dev.off()
        simulationOutput <- simulateResiduals(fittedModel = nulmod, n = 250)
        png(paste(name, "/nulmod_dignostics_%(target)s.png", sep=""), height=1080, width=1920)
        par(mfrow=c(2, 2))
        plot(simulationOutput)
        dev.off()
        return(anova(mod, nulmod, test="Chisq"))
    }
    """ % {'target': target}
    stat_s = r(r_source)

    res = stat_s(r_df, output, target)
    print(res)

    stats_table = pandas2ri.ri2py(res)
    stats_table.to_csv("%s_glmm_stats.txt" % filename, sep="\t", index=False)
def krige(self, i=0, v=None, step=1, res=True, plot_v=False, plot_k=True,
          animated=False, **plot_kwargs):
    """
    Krige the dataframe with a single data column or a column index number

    Parameters
    -------
    self : Event object with at least one data column

    kwargs
    -------
    i : int data column index number (defaults to 0)
    v : variogram (with ``gamma`` and ``dist`` columns) to use in determining
        sill and range; computed from the data when None (default)
    step : grid interval to krige on (in km)
    res : bool detrend points before computing kriged values - default True
    plot_v : bool plot variogram - default False (currently not used here)
    plot_k : bool plot kriged values - default True
    animated : bool return axis for animation - default False
    **plot_kwargs (cmap, s, latlon, basemap, shpfile, POT, locs, colors)

    Returns
    -------
    The result of ``plot_krige`` when ``plot_k`` and ``animated`` are both
    set; None when ``plot_k`` is set without ``animated``; otherwise
    k : Dataframe containing output from r-krige function.
    In every case the kriged dataframe is also stored on ``self.k``.

    Raises
    ------
    ValueError
        if the variogram never exceeds the partial sill, so that no range
        can be determined
    """
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    rfuncs = import_r_tools()

    if 'X' not in self.ll_cols:
        self.set_ll()

    if res:
        if not hasattr(self, 'res'):
            self.detrend()
        df = self.res
    else:
        df = self.df

    cols = self.data_cols
    r_df = df.loc[:, ['X', 'Y', cols[i]]].dropna(how='any')

    # `not v` raises for a DataFrame (ambiguous truth value), so a variogram
    # supplied by the caller could never be used; test for None explicitly.
    if v is None:
        v = pandas2ri.ri2py(rfuncs.get_variogram(r_df))

    model = 'Sph'
    psill = r_df.var()[cols[i]]

    # The range is the first lag distance at which the semivariance exceeds
    # the partial sill.
    for j in range(len(v)):
        if v.gamma[j] > psill:
            rng = v.dist[j]
            break
    else:
        raise ValueError("variogram never exceeds the partial sill; "
                         "cannot determine the range")

    k = pandas2ri.ri2py(rfuncs.get_krige(r_df, psill, model, rng, step=step))
    k['lat'] = k.y/110.574
    k['lon'] = k.x/(111.320*(k['lat']*pi/180).apply(cos))
    self.k = k

    if plot_k and animated:
        return self.plot_krige(i, k, rng, step=step, res=res,
                               animated=animated, **plot_kwargs)
    elif plot_k and not animated:
        self.plot_krige(i, k, rng, step=step, res=res,
                        animated=animated, **plot_kwargs)
    else:
        return k
def test_fit_with_pandas_data(self, Model, dataframe):
    """fit() must hand X and y to the R function and forward extra kwargs."""
    X, y = dataframe

    model = Model(scriptname='myscript', funcname='myfunc', some='kwarg')
    model.fit(X, y)

    recorded_call = model.r['myfunc'].call_args
    positional = recorded_call[0]
    keywords = recorded_call[1]

    assert (ri2py(positional[0]).values == X.values).all()
    assert (ri2py(positional[1]) == y).all()
    assert keywords['some'] == 'kwarg'
def get_features(self, d=None, thresh=.01, sigma=3, min_size=4, const=5,
                 return_dict=False, buffer=False):
    '''
    Use r package SpatialVx to identify features.

    Each pair of consecutive slices of ``self.box`` is compared; pairs for
    which the R summaries cannot be computed are skipped.

    Parameters
    ----------
    d: dict to add results to (keyed by ``self.time[i]``); a new dict is
       created when None (default)
    thresh: .01
    sigma: 3
    min_size: 4
    const: 5
    return_dict: return the dict of dataframes instead of a Panel
    buffer: False

    Return
    ------
    p: pd.Panel containing parameters characterizing the features found
       (passed through ``self.add_buffer`` when ``buffer`` is set), or the
       dict itself when ``return_dict`` is set
    '''
    from rpy2 import robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    # A `{}` default would be shared, and filled, across calls.
    if d is None:
        d = {}

    SpatialVx = importr('SpatialVx')
    rsummary = robjects.r.summary
    r_tools = import_r_tools()

    ll = np.array([self.lon.flatten('F'), self.lat.flatten('F')]).T

    for i in range(self.box.shape[0]-1):
        hold = SpatialVx.make_SpatialVx(self.box[i, :, :], self.box[i+1, :, :], loc=ll)
        look = r_tools.FeatureFinder_gaussian(hold, nx=self.box.shape[2], ny=self.box.shape[1],
                                              thresh=thresh, smoothpar=sigma,
                                              **(dotvars(min_size=min_size)))
        # Best effort: skip this pair when R cannot summarise it, but let
        # KeyboardInterrupt / SystemExit propagate.
        try:
            x = rsummary(look, silent=True)[0]
        except Exception:
            continue
        px = pandas2ri.ri2py(x)
        df0 = pd.DataFrame(px, columns=['centroidX', 'centroidY', 'area', 'OrientationAngle',
                                        'AspectRatio', 'Intensity0.25', 'Intensity0.9'])
        df0['Observed'] = list(df0.index+1)

        m = SpatialVx.centmatch(look, criteria=3, const=const)
        p = pandas2ri.ri2py(m[12])
        df1 = pd.DataFrame(p, columns=['Forecast', 'Observed'])

        l = SpatialVx.FeatureMatchAnalyzer(m)
        try:
            p = pandas2ri.ri2py(rsummary(l, silent=True))
        except Exception:
            continue
        df2 = pd.DataFrame(p, columns=['Partial Hausdorff Distance', 'Mean Error Distance', 'Mean Square Error Distance',
                                       'Pratts Figure of Merit', 'Minimum Separation Distance', 'Centroid Distance',
                                       'Angle Difference', 'Area Ratio', 'Intersection Area', 'Bearing',
                                       'Baddeleys Delta Metric', 'Hausdorff Distance'])
        df3 = df1.join(df2)

        d.update({self.time[i]: pd.merge(df0, df3, how='outer')})

    if return_dict:
        return(d)
    p = pd.Panel(d)
    if buffer:
        return(self.add_buffer(p))
    return(p)
def cfit_to_df(dgelist, c_fit):
    """Join edgeR test results with log-CPM values, one row per gene.

    :param dgelist: edgeR DGEList used to compute log counts-per-million
    :param c_fit: edgeR fit passed to ``topTags``
    :return: DataFrame indexed by 'gene_id' holding the unsorted,
        BH-adjusted topTags table outer-joined with the log-CPM table
    """
    r_rownames = robj.r('rownames')

    def _labelled_frame(r_object):
        # Convert to a DataFrame and carry the R row names over as index
        frame = as_data_frame(r_object)
        frame.index = pandas2ri.ri2py(r_rownames(r_object))
        return frame

    cpm_df = _labelled_frame(edgeR.cpm(dgelist, log=True))
    tt_df = _labelled_frame(
        edgeR.topTags(c_fit, n=np.inf, adjust_method='BH', sort_by='none'))

    tt_df = tt_df.join(cpm_df, how='outer')
    tt_df.index.name = 'gene_id'
    return tt_df
def DEA(counts, design_r, contrasts=None, adjust_method='BH'):
    """
    Differential expression analysis with limma-voom.

    contrasts needs to be a dictionary with string keys and values to be
    used as design contrasts, or a list of int values to be used as
    (1-based) design coefficients.

    Returns a tuple: an OrderedDict of topTable results (one DataFrame per
    contrast / coefficient) and a pd.DataFrame of the voom-normalized counts
    labelled like ``counts``.
    """
    coefs = isinstance(contrasts, list)
    if coefs:
        if not all(isinstance(k, int) for k in contrasts):
            raise ValueError('coefficient list should be all integers')
        # Name each requested coefficient after its design-matrix column
        coefnames = list(ro.r.colnames(design_r))
        contrasts = OrderedDict([(coefnames[c - 1], c) for c in contrasts])
    elif (not all(isinstance(k, str) for k in contrasts)
          or not all(isinstance(v, str) for v in contrasts.values())):
        raise ValueError('contrast dict should be all string pairs')

    # import R limma package
    limma = importr('limma')

    # tranform counts with voom, then fit the linear model
    voomedCounts_r = limma.voom(counts, design=design_r, plot=True, normalize="quantile")
    fit_r = limma.eBayes(limma.lmFit(voomedCounts_r, design_r))
    coefficients_r = fit_r.rx2('coefficients')  # fit_r$coefficients

    if coefs:
        fit_contrasts_r = fit_r
    else:
        contrasts_r, contrasts_p = prepareContrasts(design_r, contrasts.values(), RReturnOnly=False)
        fit_contrasts_r = limma.eBayes(limma.contrasts_fit(fit_r, contrasts_r))

    print(ro.r.summary(fit_contrasts_r))

    # Full results
    results = OrderedDict()
    for res in contrasts:
        result_r = limma.topTable(fit_contrasts_r, coef=contrasts[res], n=len(counts),
                                  adjust_method=adjust_method)
        table = pandas2ri.ri2py(result_r)
        table.index = ro.r.rownames(result_r)
        results[res] = table
        print('# sig', res, '->', (table['adj.P.Val'] <= 0.05).sum())

    normalized = pd.DataFrame(pandas2ri.ri2py(voomedCounts_r.rx2('E')),
                              columns=counts.columns, index=counts.index)
    return results, normalized
def predict(self, indep_vars):
    """Run the fitted R network on new data.

    The data and the fitted model are published in R's global environment
    as ``test`` and ``fit`` before the prediction expression is evaluated.

    :param indep_vars: pandas DataFrame of independent variables
    :return: converted prediction for the "rprop+" and "ADAPTgdwm"
        algorithms; None for any other ``self.algorithm``
    """
    ro.globalenv['test'] = pandas2ri.py2ri(indep_vars)
    ro.globalenv['fit'] = self.fitted_model

    # R expression that produces the prediction, per training algorithm
    prediction_expressions = {
        "rprop+": "compute(fit,test)$net.result",
        "ADAPTgdwm": "sim(fit$net, test)",
    }
    expression = prediction_expressions.get(self.algorithm)
    if expression is None:
        return None
    return pandas2ri.ri2py(ro.r(expression))
def get_wunifrac_distance(phyloseq_d):
    """Weighted, normalized UniFrac distances between the samples.

    :param phyloseq_d: R phyloseq object
    :return: square DataFrame of distances labelled by sample name on both axes
    """
    R_phyloseq = importr('phyloseq')
    R_base = importr('base')

    distances = R_phyloseq.UniFrac(phyloseq_d, weighted=True, normalized=True,
                                   fast=True, parallel=False)
    values = numpy2ri.ri2py(R_base.as_matrix(distances))

    row_labels = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    column_labels = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    return pd.DataFrame(values, index=row_labels, columns=column_labels)
def get_distance(phyloseq_d, dist_method):
    """Pairwise sample distances computed by phyloseq::distance.

    :param phyloseq_d: R phyloseq object
    :param dist_method: distance method name handed to phyloseq
    :return: square DataFrame of distances labelled by sample name on both axes
    """
    R_phyloseq = importr('phyloseq')
    R_base = importr('base')

    distances = R_phyloseq.distance(phyloseq_d, method=dist_method)
    values = numpy2ri.ri2py(R_base.as_matrix(distances))

    row_labels = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    column_labels = pandas2ri.ri2py(R_phyloseq.sample_names(phyloseq_d))
    return pd.DataFrame(values, index=row_labels, columns=column_labels)
def ets_1(train, test, hor=24, freq=24):
    """Fit an ETS model on ``train`` and apply it to ``test``.

    :param train: pandas Series used to select and fit the model
    :param test: pandas Series the fitted model is applied to
    :param hor: accepted but not used in this function
    :param freq: seasonal frequency of the R time series
    :return: tuple (train_pred, test_pred) of the fitted values of both
        models as pandas Series indexed like the inputs
    """
    pandas2ri.activate()
    forecast = importr('forecast')   # forecast package
    ts = ro.r.ts                     # R time series constructor
    fitted = ro.r('fitted')          # exports the values fitted by a model

    def _fitted_series(model, index):
        # rebuild a pandas Series from R's float vector of fitted values
        return pd.Series(pandas2ri.ri2py(fitted(model)), index=index)

    r_train_ts = ts(train, frequency=freq)
    r_test_ts = ts(test, frequency=freq)

    fit_train = forecast.ets(r_train_ts)                  # best model on train set
    fit_test = forecast.ets(r_test_ts, model=fit_train)   # same model on test set

    train_pred = _fitted_series(fit_train, train.index)
    test_pred = _fitted_series(fit_test, test.index)
    return train_pred, test_pred
def assays(self, rs4_assays):
    """Fill ``self._assays`` from the assays object of an R experiment.

    Every named entry of the object's ``listData`` slot is stored under its
    name: S4 entries are rebuilt from their ``x``/``i``/``p`` slots with
    ``SingleCellExperiment.DCGtoCSR`` (presumably a column-compressed sparse
    matrix -- confirm), plain R matrices are wrapped in a ``csr_matrix``.
    Entries of any other type are ignored.

    :param rs4_assays: R S4 object exposing a ``listData`` slot
    """
    list_vector = pandas2ri.ri2py(rs4_assays.slots["listData"])
    self._assays = dict()

    for assay, label in zip(list_vector, list_vector.names):
        assay_type = type(assay)

        if assay_type == robjects.methods.RS4:
            slots = assay.slots
            non_zero_elements = slots["x"]
            row_numbers = pandas2ri.ri2py(slots["i"])
            column_pointers = pandas2ri.ri2py(slots["p"])
            # Number of rows = length of the first entry of Dimnames
            dimnames = list(pandas2ri.ri2py(slots["Dimnames"]))
            nrows = len(dimnames[0])
            self._assays[label] = SingleCellExperiment.DCGtoCSR(
                non_zero_elements, row_numbers, column_pointers, nrows)
            continue

        if assay_type == robjects.vectors.Matrix:
            self._assays[label] = csr_matrix(pandas2ri.ri2py(assay))
def getRAnoval(formula, data):
    '''
    Fit a linear model in R (via rpy2), run an ANOVA on it and a Tukey HSD
    post-hoc test on the 'genotype' factor.

    returns a dict with the 'genotype' p-value ('pvalue') and F statistic
    ('fvalue') of the ANOVA table and the HSD comparison table ('posthoc')
    '''
    linear_model = robjects.r.lm(formula=formula, data=data)
    anova_r = robjects.r.anova(linear_model)

    hsd = agr.HSD_test(linear_model, 'genotype', group=False, console=False)
    comparisons = pd.DataFrame(pandas2ri.ri2py(hsd.rx2('comparison')))

    anova_table = pd.DataFrame(pandas2ri.ri2py(anova_r))
    p_column = anova_table['Pr(>F)']
    f_column = anova_table['F value']

    return {'pvalue': p_column['genotype'],
            'fvalue': f_column['genotype'],
            'posthoc': comparisons}
def train_elastic_net_wrapper(features_data_, features_, d_, data_annotation_, x_w=None, prune=True, nested_folds=10):
    """Build the R inputs for ``train_elastic_net`` and convert its results.

    :param features_data_: mapping from feature id to that feature's values
    :param features_: DataFrame with an ``id`` column listing the features
    :param d_: mapping/DataFrame with an "individual" column and a column
        named after ``data_annotation_.gene_id`` (the response)
    :param data_annotation_: object whose ``gene_id`` names the response
    :param x_w: optional per-feature penalty factors
    :param prune: accepted but not used in this function
    :param nested_folds: number of train/test folds
    :return: tuple of the first two results of the R call, converted
    """
    feature_ids = features_.id.values

    # The array is (features x individuals); flattened row by row and read
    # by R column by column with ncol = number of features, it becomes an
    # (individuals x features) matrix.
    stacked = numpy.array([features_data_[v] for v in feature_ids])
    dimnames = robjects.ListVector(
        [(1, robjects.StrVector(d_["individual"])),
         (2, robjects.StrVector(feature_ids))])
    x = robjects.r["matrix"](robjects.FloatVector(stacked.flatten()),
                             ncol=features_.shape[0], dimnames=dimnames)
    y = robjects.FloatVector(d_[data_annotation_.gene_id])

    # py2ri chokes on None, so the penalty factor is only passed when given.
    # (observation weights, not explanatory variable weight :( , x_weight = x_w)
    call_kwargs = {}
    if x_w is not None:
        call_kwargs['penalty_factor'] = x_w
    call_kwargs['n_train_test_folds'] = robjects.FloatVector([nested_folds])

    res = train_elastic_net(y, x, **call_kwargs)
    return pandas2ri.ri2py(res[0]), pandas2ri.ri2py(res[1])
def _parse_assayData(assayData, assay):
    """Parse Rpy2 assayData (Environment object)

    assayData: Rpy2 Environment object.
    assay: An assay name indicating the data to be loaded.

    Return a parsed expression dataframe (Pandas), with the R row names as
    index and the R column names as columns.
    """
    pandas2ri.activate()

    r_matrix = assayData[assay]  # rpy2 expression matrix object

    values = pandas2ri.ri2py(r_matrix)
    row_labels = pandas2ri.ri2py(r.rownames(r_matrix))
    column_labels = pandas2ri.ri2py(r.colnames(r_matrix))

    return pd.DataFrame(values, index=row_labels, columns=column_labels)
def bumfit(p_vals, tau):
    """Fit a Bum model to p-values in R and return its summary estimates.

    :param p_vals: p-values (anything accepted by ``pd.Series``)
    :param tau: second argument handed to the R ``summary`` call
    :return: tuple (converted 'estimates' slot of the summary, converted
        'ahat' slot of the fitted model)
    """
    r_pvals = pandas2ri.py2ri(pd.Series(p_vals))
    model_summary = obase.summary(dunn.Bum(r_pvals), tau)

    # R's `@` operator, used to read S4 slots
    slot = base.__dict__["@"]
    estimates = slot(model_summary, "estimates")
    bum = slot(model_summary, "bum")

    ahat, lhat, pihat = (pandas2ri.ri2py(slot(bum, name))
                         for name in ("ahat", "lhat", "pihat"))
    q = pandas2ri.ri2py(estimates)

    # Parameter table; built here but not part of the return value
    p = pd.DataFrame([ahat, lhat, pihat], index=['ahat', 'lhat', 'pihat'])

    return q, ahat
def run_fcs(ticker, debugTF=False, funcName='rForecast', **optx):
    """Forecast the closing price of ``ticker`` with an R function.

    The R script './_alan_ohlc_fcs.r' is sourced and the function named
    ``funcName`` from R's global environment is called with the closing
    prices, the last 'pbdate' as as-of date, and the forecast options.

    :param ticker: ticker symbol passed to ``pull_stock_data``
    :param debugTF: print debugging information when true
    :param funcName: name of the R function to call
    :param optx: overrides for the default forecast options
    :return: tuple (dd, dwm, datax): first R result as DataFrame, second R
        result as DataFrame (empty unless option 'dwmTF' is True), and the
        input price data; the two result frames get a 'ticker' column
    :raises NameError: ``funcName`` is not defined in R's global environment
    """
    # get data
    datax = pull_stock_data(ticker)
    asof = int(datax['pbdate'].iloc[-1])
    if debugTF is True:
        print(datax.tail())

    # get r-code
    pandas2ri.activate()
    rstring = 'source("./_alan_ohlc_fcs.r")'
    r(rstring)

    # convert to r-data
    df = pandas2ri.py2ri(datax['close'][:])

    # run r-function
    opts = {
        'nfcs': 30,
        'dwmTF': True,
        'autoArima': False,
        'difTF': True,
        'funcname': 'rAR',
        'logTF': True,
        'plevel': 0.7,
        'freq': 'W'
    }
    opts.update(optx)
    optR = subDict(opts, [
        'nfcs', 'plevel', 'funcname', 'autoArima', 'logTF', 'difTF', 'freq',
        'fcsLst', 'dwmTF'
    ])
    if debugTF:
        # write() instead of `print >>` keeps this valid on Python 2 and 3
        sys.stderr.write("==Input Args:{}\n".format(optR))
        sys.stderr.write("==asof {},df:\n{}\n".format(
            asof, datax['close'][-5:]))

    # Fail with a clear message instead of an unbound local further down
    if funcName not in robj.globalenv:
        raise NameError(
            "R function '{}' is not defined in the R global environment".format(funcName))
    funcArg = robj.globalenv[funcName]
    ret = funcArg(df, asof, debugTF=debugTF, **optR)

    if opts['dwmTF'] is True:
        dwm = pandas2ri.ri2py(ret[1])
        dwm['ticker'] = ticker
    else:
        dwm = pd.DataFrame()
    dd = pandas2ri.ri2py(ret[0])
    dd['ticker'] = ticker
    return (dd, dwm, datax)
def apply_PSD_metric(r_stream, *args, **kwargs):
    """
    Invoke the PSDMetric and convert the R dataframe result into a Pandas dataframe.

    :param r_stream: an r_stream object
    :param (optional kwarg) evalresp= pandas dataframe of FAP from evalresp (freq,amp,phase)
    :return: tuple of GeneralValueMetrics, corrected PSD, and PDF

    Other positional and keyword arguments are accepted but not forwarded.
    """
    R_function = robjects.r('IRISMustangMetrics::PSDMetric')

    # The conversion is switched on globally, so it has to be switched off
    # again even when the R call or a conversion raises.
    pandas2ri.activate()
    try:
        # look for optional parameter evalresp=pd.DataFrame
        evalresp = kwargs.get('evalresp')
        if evalresp is not None:
            r_evalresp = pandas2ri.py2ri(evalresp)  # convert to R dataframe
            r_listOfLists = R_function(r_stream, evalresp=r_evalresp)
        else:
            r_listOfLists = R_function(r_stream)

        r_metriclist = r_listOfLists[0]
        if r_metriclist:
            r_dataframe = _R_metricList2DF(r_metriclist)
            df = pandas2ri.ri2py(r_dataframe)
            # Convert columns from R POSIXct to python UTCDateTime
            df.starttime = df.starttime.apply(UTCDateTime)
            df.endtime = df.endtime.apply(UTCDateTime)
        else:
            # PSDMetric returns no PSD derived metrics
            df = pd.DataFrame()

        # correctedPSD is returned as a dataframe
        r_correctedPSD = r_listOfLists[2]
        PSDCorrected = pandas2ri.ri2py(r_correctedPSD)
        # Convert columns from R POSIXct to python UTCDateTime
        PSDCorrected.starttime = PSDCorrected.starttime.apply(UTCDateTime)
        PSDCorrected.endtime = PSDCorrected.endtime.apply(UTCDateTime)

        r_PDF = r_listOfLists[3]
        PDF = pandas2ri.ri2py(r_PDF)
    finally:
        pandas2ri.deactivate()

    return (df, PSDCorrected, PDF)
def getRKrusWall(formula, data):
    '''
    Run a Kruskal-Wallis test in R (via rpy2) followed by Dunn's post-hoc
    test with 'bh' adjustment.

    returns a dict with the p-value ('pvalue'), the test statistic as a
    DataFrame ('chi-squared') and the Dunn result table sorted by
    'Comparison' ('posthoc')
    '''
    def _as_frame(r_object):
        # R object -> pandas DataFrame
        return pd.DataFrame(pandas2ri.ri2py(r_object))

    kruskal = statsR.kruskal_test(formula=formula, data=data)
    p_value = _as_frame(kruskal.rx2('p.value'))[0][0]

    dunn_result = fsa.dunnTest(formula, data=data, method='bh')
    post_hoc = _as_frame(dunn_result.rx2('res'))

    chi_squared = _as_frame(kruskal.rx2('statistic'))

    return {
        'pvalue': p_value,
        'chi-squared': chi_squared,
        'posthoc': post_hoc.sort_values(by=['Comparison'])
    }
def SCCA_r(X, Y, n_components, pen):
    """Sparse CCA through the R package PMA.

    R copies of both inputs are published in R's global environment as ``X``
    and ``Y``; the CCA call itself receives the original ``X`` and ``Y``.

    :param X: first data matrix
    :param Y: second data matrix
    :param n_components: number of canonical components (``K``)
    :param pen: pair (penalty for X, penalty for Y)
    :return: tuple ((u, v) loadings as numpy matrices, correlations), taken
        from result elements 1, 2 and 15 of the R call
    """
    ri.globalenv['X'] = pandas2ri.py2ri(pd.DataFrame(X))
    ri.globalenv['Y'] = pandas2ri.py2ri(pd.DataFrame(Y))

    penalty_x, penalty_z = pen[0], pen[1]
    out = PMA.CCA(x=X, z=Y, K=n_components, niter=100, standardize=False,
                  penaltyx=penalty_x, penaltyz=penalty_z)

    u_loadings = pandas2ri.ri2py(out[1])
    v_loadings = pandas2ri.ri2py(out[2])
    cors = pandas2ri.ri2py(out[15])

    return (np.asmatrix(u_loadings), np.asmatrix(v_loadings)), cors
def computeRLEFactors(counts):
    """ Compute normalization size factors
    using the RLE method described in EdgeR and returns then as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector (edgeR's RLE factors
            multiplied by the column sums of the counts)
    """
    # The conversion is switched on globally, so it has to be switched off
    # again even when an R call raises.
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        edger = RimportLibrary("edgeR")
        as_matrix = r["as.matrix"]
        dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
        pandas_sf = pandas2ri.ri2py(dds)
        pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    finally:
        pandas2ri.deactivate()
    return pandas_sf * pandas_cm
def loadFile(fname, varname=None):
    """
    Read a data frame from an R data file.

    fname : rdata or rds filename to be loaded
    varname : variable name inside rdata; when omitted the file is assumed
              to be in rds format

    Returns the data as a pandas DataFrame; a "date" column, if present, is
    parsed to datetimes and an "idPolair" column is made categorical.
    """
    if varname is None:
        # assume it is in rds format
        r_object = ro.r['readRDS'](fname)
    else:
        ro.r['load'](fname)
        r_object = ro.r[varname]

    full_data = pandas2ri.ri2py(r_object)

    if "date" in full_data.columns:
        full_data["date"] = pandas.to_datetime(full_data.date)
    if "idPolair" in full_data.columns:
        full_data["idPolair"] = full_data.idPolair.astype("category")

    return full_data
def computeMnnBatchCorrection(counts):
    """
    Computes batch correction to a list of batches (data frames)
    where each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.

    The index and columns of every input frame are copied onto the
    corresponding corrected frame.

    :param counts: a list of matrices of counts
    :return: a list of batch corrected matrices of counts
    """
    pandas2ri.activate()
    try:
        as_matrix = r["as.matrix"]
        # Remember the labels: they are re-applied to the converted results.
        meta = [(x.index, x.columns) for x in counts]
        r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
        RimportLibrary("scran")
        r_call = """
            function(counts) {
               norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
               return(lapply(norm_counts$corrected, as.data.frame))
            }
        """
        r_func = r(r_call)
        norm_counts = []
        for (index, columns), r_frame in zip(meta, r_func(r_counts)):
            norm_c = pandas2ri.ri2py(r_frame)
            norm_c.index = index
            norm_c.columns = columns
            norm_counts.append(norm_c)
    finally:
        # Undo the global conversion switch even when an R call fails.
        pandas2ri.deactivate()
    return norm_counts
def computeSumFactors(counts, scran_clusters=True):
    """
    Compute normalization factors using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.

    When ``scran_clusters`` is true and there are at least 50 columns,
    the columns are first clustered with ``quickCluster`` and the pool
    sizes are derived from the smallest cluster; otherwise the pool sizes
    are derived from the number of columns.

    :param counts: a matrix of counts (genes as rows)
    :param scran_clusters: whether to pre-cluster with scran's quickCluster
    :return: the normalization factors as a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        scran = RimportLibrary("scran")
        as_matrix = r["as.matrix"]
        r_matrix = as_matrix(r_counts)

        # `pool_limit` drives the pool sizes: the smallest cluster when
        # clustering is used, the total number of cells otherwise.
        extra_args = {}
        pool_limit = n_cells
        if scran_clusters and n_cells >= 50:
            r_clusters = scran.quickCluster(r_matrix,
                                            min(n_cells/10, 10),
                                            method="igraph")
            pool_limit = min(Counter(r_clusters).values())
            extra_args["clusters"] = r_clusters

        sizes = list(range(min(int(pool_limit/4), 10),
                           min(int(pool_limit/2), 50), 5))
        dds = scran.computeSumFactors(r_matrix, sizes=sizes, **extra_args)
        pandas_sf = pandas2ri.ri2py(dds)
    finally:
        # Undo the global conversion switch even when an R call fails.
        pandas2ri.deactivate()
    return pandas_sf
def get_rdata(url):
    """
    Download an RData file, load it into R and return its content as
    row dictionaries.

    The download is saved under a local path made of ``"rdata"`` followed
    by ``url`` with the two known host prefixes stripped.  Every object
    created by R's ``load`` is converted to a DataFrame and flattened to
    a list with one ``{column: value}`` dict per row.

    :param url: location of the RData file
    :return: dict mapping each loaded R object name to its list of rows
    """
    # For testing, probably want to do this a different way in production TODO
    # NOTE(review): the local path is derived from `url` without any
    # sanitising -- confirm callers only pass trusted URLs.
    local_path = "rdata" + url.replace(
        "http://data.war-on-ice.net", "").replace(
        "http://war-on-ice.com", "")

    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()

    # RData is a binary format: write it in binary mode and let the
    # context manager close the handle even if the write fails.
    with open(local_path, "wb") as fp:
        fp.write(payload)

    rdata = {}
    for name in r.load(local_path):
        frame = pandas2ri.ri2py(r[name])
        # convert to DataFrame
        if not isinstance(frame, pd.DataFrame):
            frame = pd.DataFrame(frame)

        rows = []
        for column in frame:
            for position, value in enumerate(frame[column]):
                # Grow the row list lazily while walking the first column.
                if position >= len(rows):
                    rows.append({})
                rows[position][column] = value
        rdata[name] = rows
    return rdata
def run_simple(A, B):
    """
    Two-group differential expression of ``A`` versus ``B`` with
    limma/voom, driven through rpy2.

    The columns of ``A`` form group 0 and the columns of ``B`` group 1.
    The ``topTable`` result for the last design column is returned with
    columns 0, 3 and 4 renamed to ``lfc``, ``pval`` and ``padj`` and an
    extra ``slp`` column: ``log10(pval)``, sign-flipped to
    ``-log10(pval)`` where ``lfc > 0``.

    :param A: count DataFrame of the first group
    :param B: count DataFrame of the second group
    :return: pandas DataFrame indexed by the row names of the R table
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro

    r = ro.r
    pandas2ri.activate()
    # The returned handles are not used; the calls are kept because the
    # r.DGEList / r.voom / ... lookups below presumably rely on the
    # packages having been loaded.
    importr('limma')
    importr('edgeR')

    # `axis` must be passed by keyword: it is keyword-only in pandas >= 2.0.
    counts = pd.concat([A, B], axis=1)
    groups = r.factor(r.c(*([0] * A.shape[1] + [1] * B.shape[1])))
    ro.globalenv['exp'] = groups
    design = r('model.matrix(~exp)')

    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)
    tt = r.topTable(fit, coef=r.ncol(design), number=1e12)
    ttidx = r['row.names'](tt)
    tt = pandas2ri.ri2py(tt)

    # Rename by position on a plain list; integer keys on a
    # string-indexed Series are ambiguous (label vs. position).
    names = list(tt.columns)
    names[0] = 'lfc'
    names[3] = 'pval'
    names[4] = 'padj'
    tt.columns = names

    tt['slp'] = np.log10(tt['pval'])
    tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
    tt.index = ttidx
    return tt
def variogram(self, i=0, plot_v=True, **kwargs):
    """
    Compute a variogram for one data column through the R helper
    ``get_iSVG``.

    Parameters
    ----------
    self : Event object with at least one data column
    i : int
        index into ``self.data_cols`` of the column to use (defaults to 0)
    plot_v : bool
        when true, plot ``gamma`` against ``dist`` from the result
    **kwargs
        forwarded to the R function
        (target_np, alpha, tol_hor, max_bnd, last_max)

    Returns
    -------
    v : Dataframe containing output from r-variogram function
    """
    from rpy2.robjects import pandas2ri

    pandas2ri.activate()
    r_tools = import_r_tools()

    # presumably set_ll() adds the 'X'/'Y' columns selected below -- confirm
    if 'X' not in self.ll_cols:
        self.set_ll()

    frame = self.df
    wanted = ['X', 'Y', self.data_cols[i]]
    # Rows with a missing value in any of the three columns are dropped.
    complete_rows = frame.loc[:, wanted].dropna(how='any')

    v = pandas2ri.ri2py(r_tools.get_iSVG(complete_rows, 3, **kwargs))

    if plot_v:
        v.plot(x='dist', y='gamma', marker='o', figsize=(8, 4))
    return v
def find_extrema(se, window=5, span_points=25):
    """
    Smooth a series with R's ``loess`` and locate the local extrema of
    the smoothed curve.

    A point is a high (low) when the fitted value equals the centred
    rolling maximum (minimum) over ``window`` points.

    :param se: pandas Series; its index is used as the x variable
    :param window: size of the centred rolling window
    :param span_points: number of points per loess span; the span passed
                        to R is ``span_points / len(se)``
    :return: ``(fitted, lows, highs)`` -- the fitted Series and the rows
             of the working frame (columns x, y, fitted, max, min) at the
             lows and at the highs
    """
    df = pd.DataFrame({'x': se.index, 'y': se})

    # float() keeps this a true division under Python 2 as well, where
    # int / int would floor the span to 0.
    span = float(span_points) / len(df)
    lo = stats.loess('y~x', df, span=span, na_action=stats.na_exclude)

    # we have to use predict(lo) instead of lo.rx2('fitted') here, the
    # latter does not include NAs
    fitted = pd.Series(pandas2ri.ri2py(stats.predict(lo)), index=df.index)

    # pd.rolling_max / pd.rolling_min were removed from pandas; the
    # .rolling() accessor is the replacement.
    rolling = fitted.rolling(window, center=True)
    max_ = rolling.max()
    min_ = rolling.min()

    df['fitted'] = fitted
    df['max'] = max_
    df['min'] = min_

    highs = df[(max_ - fitted) <= 0]
    lows = df[(min_ - fitted) >= 0]
    return fitted, lows, highs
def read_rdata(rdata_fullpath, table_name):
    """
    Load an RData file and return its ``model_summary`` object as a
    pandas DataFrame with NA values replaced by 0.

    NA counts per column are printed before the fill, and again after it
    for any column that still contains NA.

    :param rdata_fullpath: path of the RData file; backslashes are turned
                           into forward slashes before it is given to R
    :param table_name: currently unused -- the object read is always
                       ``model_summary``.  NOTE(review): presumably meant
                       to select the object; confirm before wiring it in.
    :return: the pandas DataFrame
    """
    from rpy2.robjects import pandas2ri, r
    pandas2ri.activate()

    def report_nulls(frame, template):
        # Print one line per column that contains NA values.
        for col in frame.columns:
            nullcount = sum(pandas.isnull(frame[col]))
            if nullcount > 0:
                print(template % (nullcount, col))

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Loading %s" % rdata_fullpath_forR)

    # read in the data from the R session with python
    r['load'](rdata_fullpath_forR)
    table_df = pandas2ri.ri2py(r['model_summary'])

    report_nulls(table_df, " Found %5d NA values in column %s")
    table_df = table_df.fillna(0)
    report_nulls(table_df, " -> Found %5d NA values in column %s")

    print("Read %d lines from %s" % (len(table_df), rdata_fullpath))
    return table_df
def mca(distance_matrix, dim=2):
    """
    Calculate MCA coordinates using R's FactoMineR.

    Every column of ``distance_matrix.H`` is turned into an R factor, the
    factors are combined in a data frame and ``MCA(..., graph=FALSE)`` is
    run on it.

    :param distance_matrix: object whose ``H`` attribute is the haplotype
                            DataFrame
    :param dim: currently unused
    :return: ``(coord, None)`` where ``coord`` is the ``ind$coord``
             component of the MCA result
    """
    from fatools.lib.utils import acquire_R, release_R
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    acquire_R()
    try:
        # build up haplotype dataframe
        haplotypes = distance_matrix.H
        robjects.globalenv['haplo_data'] = pandas2ri.py2ri(haplotypes)

        # R columns are 1-based
        marker_len = len(haplotypes.columns)
        arguments = ','.join('as.factor(haplo_data[,%d])' % x
                             for x in range(1, marker_len + 1))
        robjects.r('haplo_df <- data.frame(%s)' % arguments)
        robjects.r('library(FactoMineR)')
        mca_res = robjects.r('MCA(haplo_df, graph=FALSE)')

        # get the individual coordinate
        coord = pandas2ri.ri2py(mca_res.rx('ind')[0].rx('coord')[0])
    finally:
        # Pair every acquire_R() with a release_R(), even when R fails.
        release_R()
    return (coord, None)
def logCountsWithFactors(counts, size_factors):
    """
    Uses the R package scater to log a matrix of counts (genes as rows)
    and a vector of size factor using the method normalize().

    The index and columns of ``counts`` are copied onto the result.

    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return: the normalized log counts (genes as rows)
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        # NOTE(review): the docstring mentions scater but the library
        # loaded here is "scran" -- confirm which one is intended.
        RimportLibrary("scran")
        r_call = """
            function(counts, size_factors){
              sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
              sizeFactors(sce) = size_factors
              sce = normalize(sce)
              norm_counts = logcounts(sce)
              return(as.data.frame(norm_counts))
            }
        """
        r_func = r(r_call)
        r_norm_counts = r_func(r_counts, size_factors)
        pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
        pandas_norm_counts.index = indexes
        pandas_norm_counts.columns = columns
    finally:
        # Undo the global conversion switch even when an R call fails.
        pandas2ri.deactivate()
    return pandas_norm_counts
def getEvalresp(network=None, station=None, location=None, channel=None,
                time=None, minfreq=None, maxfreq=None, nfreq=None,
                units=None, output="fap"):
    """
    Returns a pandas dataframe with instrument response data from the
    IRIS DMC evalresp webservice.

    :param network: sncl network (string)
    :param station: sncl station (string)
    :param location: sncl location (string)
    :param channel: sncl channel (string)
    :param time: ObsPy UTCDateTime object specifying the time at which
                 the response is evaluated.
    :param minfreq: Optional minimum frequency at which the response is
                    evaluated.
    :param maxfreq: Optional maximum frequency at which the response is
                    evaluated.
    :param nfreq: Optional number of frequencies at which response will
                  be evaluated.
    :param units: Optional code specifying unit conversion.
    :param output: Output type ['fap'|'cs'].
    :return: pandas dataframe of response metadata.
    """
    iris_client = robjects.r('new("IrisClient")')

    # Translate the Python arguments into their R counterparts.
    r_time = R_POSIXct(time)
    (r_minfreq, r_maxfreq, r_nfreq, r_units, r_output) = _R_args(
        minfreq, maxfreq, nfreq, units, output)

    # Query the service and hand the result back as a pandas dataframe.
    r_response = _R_getEvalresp(iris_client, network, station, location,
                                channel, r_time, r_minfreq, r_maxfreq,
                                r_nfreq, r_units, r_output)
    return pandas2ri.ri2py(r_response)
def read_r_to_python(path_I, path_II):
    """Read two RDS files and convert them into a Python data frame and array.

    Unicode text is reduced to ASCII: it is NFKD-normalised and every
    character without an ASCII representation is dropped.

    This method is DEPRECATED due to class <R_Python_Unilever>"""
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    import unicodedata

    def strip_to_ascii(text):
        # http://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

    def clean_cell(cell):
        # Exact type match on purpose: subclasses of unicode are left alone.
        if type(cell) == unicode:
            return strip_to_ascii(cell)
        return cell

    # first file -> pandas data frame with ASCII-only text cells
    frame = pandas2ri.ri2py(r.readRDS(path_I))
    frame = frame.applymap(clean_cell)

    # second file -> array whose unicode entries are rewritten in place
    desc = pandas2ri.ri2py(r.readRDS(path_II))
    for position, entry in enumerate(desc):
        if type(entry) == np.unicode_:
            desc[position] = str(strip_to_ascii(entry))
    desc = desc.astype(np.string_)

    return frame, desc
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression.

    Column names of ``data_frame`` are split on "." and read as
    ``<condition>.<time>.<replicate>``.  A likelihood-ratio test of the
    full design against the design without the times:condition term is
    run, and dispersion and MA plots are written to
    ``<res_dir>/<header>-dispersions.png`` and
    ``<res_dir>/<header>-MAplot.png``.

    :param data_frame: count table, genes as index
    :param header: label used in the log message and the plot file names
    :param alpha: passed to ``plotMA`` (formatted with three decimals)
    :param res_dir: directory receiving the plots
    :return: pandas DataFrame of the results ordered by ``padj``
    '''
    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns

    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)

    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector(list(data_frame.index))

    # setup counts table and design frame
    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    try:
        R('''times <- as.factor(%s)''' % des_times.r_repr())
        R('''reps <- c(%s)''' % des_reps.r_repr())
        R('''condition <- c(%s)''' % des_cond.r_repr())
        R('''design <- data.frame(times, reps, condition)''')
        R('''counts <- data.frame(%s)''' % counts.r_repr())
        R('''genes <- c(%s)''' % genes.r_repr())
        R('''rownames(counts) <- genes''')
        R('''rownames(design) <- colnames(counts)''')

        # use DESeq() with LRT and reduced formula.  Use effect
        # size moderation
        R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
          '''colData=design, '''
          '''design=~reps + times + condition + times:condition)''')
        R('''dds <- DESeq(dds, test="LRT", '''
          '''reduced=~reps + times + condition, betaPrior=T)''')
        R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
        R('''res.df <- data.frame(res)''')

        # generate dispersion and MA plots
        R('''png("%s/%s-dispersions.png")''' % (res_dir, header))
        R('''plotDispEsts(dds)''')
        R('''dev.off()''')
        R('''png("%s/%s-MAplot.png")''' % (res_dir, header))
        R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
        R('''dev.off()''')
    finally:
        # Restore R's output even when a step above fails; otherwise
        # all later R output in this session stays diverted.
        R('''sink(file=NULL)''')

    df = pandas2ri.ri2py(R['res.df'])
    return df
def treeCutting(infile, expression_file, cluster_file, cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix.

    The distance matrix is read from ``infile`` (tab separated, first
    column as index, missing values set to 0), clustered with
    ``flashClust`` and cut with ``cutreeDynamic`` (minClusterSize=50).
    The colour labels are written to ``cluster_file``.

    :param infile: path of the distance matrix
    :param expression_file: currently unused
    :param cluster_file: path the colour labels are written to
    :param cluster_algorithm: linkage method handed to ``flashClust``
    :param deepsplit: value of ``deepSplit`` for ``cutreeDynamic``
    :return: DataFrame indexed by gene_id with one ``cluster`` column
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.fillna(0.0)
    genes_r = ro.StrVector(list(df.index))

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)
    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    # Explicit mappings instead of locals(): the templates stay valid
    # when local variables are renamed.
    R('''sink(file='%(wgcna_out)s')''' % {"wgcna_out": wgcna_out})
    try:
        R('''suppressPackageStartupMessages(library("WGCNA"))''')
        R('''suppressPackageStartupMessages(library("flashClust"))''')

        E.info("clustering data by %s linkage" % cluster_algorithm)

        R('''rownames(distance_data) <- gene_ids''')
        R('''clustering <- flashClust(as.dist(distance_data),'''
          ''' method='%(cluster_algorithm)s')''' %
          {"cluster_algorithm": cluster_algorithm})

        if deepsplit:
            cut_call = ('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
                        '''minClusterSize=50, deepSplit=T)''')
        else:
            cut_call = ('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
                        '''minClusterSize=50, deepSplit=F)''')
        R(cut_call)

        R('''color_cut <- labels2colors(cluster_cut)''')
        R('''write.table(color_cut, file = '%(cluster_file)s','''
          '''sep="\t")''' % {"cluster_file": cluster_file})
        R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
          '''color_cut))''')
        R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
        R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
          '''cluster_matched$cluster)''')
    finally:
        # Restore R's output even when a step above fails; otherwise
        # all later R output in this session stays diverted.
        R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
def pandas_load(name):
    '''
    Loads .rdata file (R dataframe file) and returns it as Pandas dataframe.

    Only objects created by this ``load`` call are considered; among
    them the first one in ``ls()`` order is returned.

    :param name: .rdata filename (eg: 'subset.Rdata')
    :return: pandas dataframe object
    :raises IndexError: when the file created no object
    '''
    pandas2ri.activate()

    # load() returns the names of the objects it created.  Restricting
    # the choice to those names keeps objects left over from earlier
    # loads from being picked up by accident.
    loaded_names = set(r.load(name))
    candidates = [obj for obj in r.ls() if obj in loaded_names]

    df = pandas2ri.ri2py(r[candidates[0]])
    return df
def clust_read(self):
    """
    Pull the ``widths``, ``clus.avg.widths`` and ``avg.width`` components
    out of ``self.clust_obj`` and store them on the instance.

    Sets:
      self.cl_width : converted ``clus.avg.widths``
      self.avg_wid  : converted ``avg.width``
      self.silinfo  : frame with, per cluster, the unique ``win_id``
                      values taken from the index of ``widths``
    """
    base = importr('base')
    # R's `$` accessor, looked up in the package's attribute dictionary
    dollar = base.__dict__['$']

    r_widths = dollar(self.clust_obj, 'widths')
    r_cluster_avg = dollar(self.clust_obj, 'clus.avg.widths')
    r_overall_avg = dollar(self.clust_obj, 'avg.width')

    # Convert to pandas objects
    self.cl_width = pandas2ri.ri2py(r_cluster_avg)
    self.avg_wid = pandas2ri.ri2py(r_overall_avg)

    # TODO: change -- alternative: pandas2ri.ri2py_dataframe(r_widths)
    widths = com.convert_robj(r_widths)

    # Move the row labels into a 'win_id' column next to 'cluster'.
    tidy = (widths.reset_index()
                  .set_index('cluster')
                  .rename(columns={'index': 'win_id'})
                  .reset_index())

    # One row per cluster holding the unique win_id values.
    self.silinfo = tidy.groupby('cluster')['win_id'].unique().to_frame()
def run2(counts, formula, normcounts=None):
    """
    limma/voom differential expression for an arbitrary design formula,
    driven through rpy2.

    The design data is built from the column index levels of ``counts``
    (``counts.columns.names``).  One ``topTable`` is produced for every
    design column except the first; columns 0, 3 and 4 of each table are
    renamed to ``lfc``, ``pval`` and ``padj`` and an ``slp`` column
    (``log10(pval)``, sign-flipped where ``lfc > 0``) is added.  When the
    design has more than two columns, every result column is prefixed
    with the design column name.

    :param counts: count DataFrame whose columns carry the design factors
    :param formula: R model formula, e.g. ``"~group"``
    :param normcounts: optional path; when given, the voom object is
                       written there with ``write.table``
    :return: the per-coefficient tables concatenated column-wise
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro

    r = ro.r
    pandas2ri.activate()
    # The returned handles are not used; the calls are kept because the
    # r.DGEList / r.voom / ... lookups below presumably rely on the
    # packages having been loaded.
    importr('limma')
    importr('edgeR')

    design_matrix = counts.T.reset_index()[counts.columns.names]
    ro.globalenv['design.matrix'] = design_matrix
    design = r('as.data.frame(model.matrix(' + formula +
               ', data=design.matrix))')

    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    ro.globalenv['v'] = v
    if normcounts is not None:
        r('write.table(v, "' + normcounts +
          '",sep="\t",quote = F,col.names = NA)')

    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)

    n_columns = r.ncol(design)[0]
    design_names = r.colnames(design)
    print(n_columns)

    rv = []
    # Start at 1 to skip the first design column.
    for i in range(1, n_columns):
        colname = design_names[i]
        # `i` indexes the rpy2 vector from 0, but R counts coefficients
        # from 1: the coefficient named `colname` is number i + 1.
        tt = r.topTable(fit, coef=i + 1, number=1e12)
        ttidx = r['row.names'](tt)
        tt = pandas2ri.ri2py(tt)

        # Rename by position on a plain list; integer keys on a
        # string-indexed Series are ambiguous (label vs. position).
        names = list(tt.columns)
        names[0] = 'lfc'
        names[3] = 'pval'
        names[4] = 'padj'
        tt.columns = names

        tt['slp'] = np.log10(tt['pval'])
        tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(
            tt.loc[tt['lfc'] > 0, 'pval'])

        if n_columns > 2:
            # prepend colname to columns - only if there are more factors
            tt.columns = ['{}_{}'.format(colname, x) for x in tt.columns]

        tt.index = ttidx
        rv.append(tt)

    return pd.concat(rv, axis=1)
def transform(self, method="vst", inplace=True):
    '''
    Perform transformation on counts table.

    Current methods are:
     - "vst":  DESeq2 variance stabilising transformation
     - "rlog": DESeq2 rlog transformation

    The index and column labels of ``self.table`` are carried over to
    the transformed table.

    :param method: "vst" or "rlog"
    :param inplace: when true, replace ``self.table`` and return None;
                    otherwise return a clone holding the transformed table
    :raises AssertionError: when ``method`` is not supported
    '''
    assert method in ["vst", "rlog"], "method must be one of [vst, rlog]"

    method2function = {"vst": "varianceStabilizingTransformation",
                       "rlog": "rlog"}
    t_function = method2function[method]

    r_transform = R('''
    function(df){

    suppressMessages(library('DESeq2'))

    design = data.frame(row.names = colnames(df),
                        condition = seq(1, length(colnames(df))))

    dds <- suppressMessages(DESeqDataSetFromMatrix(
             countData= df, colData = design, design = ~condition))

    transformed <- suppressMessages(%(t_function)s(dds))
    transformed_df <- as.data.frame(assay(transformed))

    return(transformed_df)
    }''' % {"t_function": t_function})

    original = self.table
    r_counts = pandas2ri.py2ri(original)
    df = pandas2ri.ri2py(r_transform(r_counts))

    # The row names get lost during the conversion, and R rewrites
    # characters such as "-" in column names as ".".  Restore both label
    # sets from the input table instead of replacing "." with "-", which
    # would also alter names that legitimately contain dots.
    df.index = original.index
    df.columns = original.columns

    if inplace:
        self.table = df
    else:
        tmp_counts = self.clone()
        tmp_counts.table = df
        return tmp_counts
def computeSizeFactors(counts):
    """
    Computes size factors using DESeq for the counts matrix given as
    input (Genes as rows and spots as columns).
    Returns the computed size factors as a vector.

    :param counts: a matrix of counts (genes as rows)
    :return: the normalization factors as a vector
    """
    pandas2ri.activate()
    try:
        r_counts = pandas2ri.py2ri(counts)
        deseq2 = RimportLibrary("DESeq2")
        size_factors = deseq2.estimateSizeFactorsForMatrix(r_counts)
        pandas_sf = pandas2ri.ri2py(size_factors)
    finally:
        # Undo the global conversion switch even when an R call fails.
        pandas2ri.deactivate()
    return pandas_sf
def getTraveltime(latitude, longitude, depth, staLatitude, staLongitude):
    """
    Returns a pandas dataframe with seismic traveltime data from the
    IRIS DMC traveltime web service.

    :param latitude: Latitude of seismic event.
    :param longitude: Longitude of seismic event.
    :param depth: Depth of seismic event (passed through unchanged).
    :param staLatitude: Latitude of seismic station.
    :param staLongitude: Longitude of seismic station.
    :return: pandas dataframe with columns: ``distance, depth, phaseName,
             travelTime, rayParam, takeoff, incident, puristDistance,
             puristName``.
    """
    iris_client = robjects.r('new("IrisClient")')

    # Query the service, then convert the R result for the caller.
    r_result = _R_getTraveltime(iris_client, latitude, longitude, depth,
                                staLatitude, staLongitude)
    return pandas2ri.ri2py(r_result)