def write_raov(self, fname, aov_name, stat_name='NULL'):
    """Save ANOVA and descriptive data R objects as a csv file
    using a custom R function"""
    robjects.r("""source('~/R_Functions/WriteAOV.R')""")
    toeval = "WriteAOV({0}, {1}, data={2})".format("'" + fname + "'", aov_name, stat_name)
    robjects.r(toeval)
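# Hedged usage sketch for write_raov: WriteAOV.R is the author's custom R
# function, so the argument order is inferred from the format string above,
# not from its actual signature. 'fit.aov' and 'desc.stats' are hypothetical
# R object names that would need to exist in the R global environment first.
#
#   self.write_raov('anova_results.csv', 'fit.aov', stat_name='desc.stats')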
def get_correlations_for_tickers(tickers, show_exception=False):
    corrs = []
    start_time = datetime.datetime.now()
    first = True
    for ticker in tickers:
        if not first:
            time_left = get_time_left(
                start_time, len(corrs), scipy.special.comb(len(tickers), 2)
            )
            print 'Finding Correlations for %s. Time remaining: %f minutes' % (ticker, time_left.seconds / 60)
        first = False
        try:
            t_data = get_t_data(ticker)
        except Exception as e:
            if show_exception:
                print "throwing exception", e
            continue
        for ticker_2 in tickers:
            if ticker_2 == ticker:
                continue
            try:
                tdata_2 = get_t_data(ticker_2)
            except Exception as e:
                if show_exception:
                    print "throwing exception", e, ticker_2
                continue
            if len(t_data) != len(tdata_2):
                t_data, tdata_2 = du.remap_data(t_data, tdata_2)
            corr = get_correlation(t_data, tdata_2)[0]
            ident = '%s/%s' % (ticker, ticker_2)
            corrs.append((ident, corr))
        r('gc()')
        gc.collect()
        gc.collect()
    return corrs
def learnModel(self, X, Y):
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkArray(X)
    Parameter.checkArray(Y)
    if numpy.unique(Y).shape[0] < 2:
        raise ValueError("Vector of labels must be binary, currently numpy.unique(Y) = " + str(numpy.unique(Y)))

    # If Y is 1D make it 2D
    if Y.ndim == 1:
        Y = numpy.array([Y]).T

    XY = self._getDataFrame(X, Y)
    formula = robjects.Formula('class ~ .')
    self.learnModelDataFrame(formula, XY)

    gc.collect()
    robjects.r('gc(verbose=TRUE)')
    robjects.r('memory.profile()')
    gc.collect()

    if self.printMemStats:
        logging.debug(self.getLsos()())
        logging.debug(ProfileUtils.memDisplay(locals()))
def rocbees(ARGVS):
    '''ROC curves and the beeswarm plot from the beeswarm R package'''
    beeswarm = importr('beeswarm')
    Cairo = importr('Cairo')
    ROC = importr('ROC')
    filename = ARGVS['file']
    data = ARGVS['data']
    title = ARGVS['title']
    category = ARGVS['opts']
    filewrite = ROOT_PATH + '/media/tmp/' + filename
    resp = []
    expr = []
    names = data.keys()
    for name in names:
        resp.append(data[name]['resp'])
        expr.append(data[name]['expression'])
    robjects.r('''
    approx3 <- function(x, y = NULL, theta = 0.001) {
        xy <- xy.coords(x, y)
        dx <- diff(xy$x)/(max(xy$x) - min(xy$x))
        dy <- diff(xy$y)/(max(xy$y) - min(xy$y))
        angle <- atan2(dy, dx)
        diff.angle <- diff(angle)%%pi
        abs.diff.angle <- pmin(diff.angle, pi - diff.angle)
        keep <- c(TRUE, abs.diff.angle > theta, TRUE)
        xy$x <- xy$x[keep]
        xy$y <- xy$y[keep]
        xy
    }
    aronroc <- function(x, truth, type = "l", xlab = expression(1 - specificity),
                        ylab = "Sensitivity", ...) {
        require(ROC)
        r <- rocdemo.sca(truth, x)
        xy <- list(x = 1 - r@spec, y = r@sens)
        xy.trimmed <- approx3(xy)
        plot(xy.trimmed, type = type, xlab = xlab, ylab = ylab, ...)
        invisible(xy.trimmed)
    }
    plotResps <- function (filename, expr, resp, category, main='') {
        expr = as.numeric(expr)
        resp = as.character(resp)
        CairoPNG(filename=filename, width = 800, height = 400)
        par(oma = c(0,0,1,0))
        layout(matrix(1:2, nrow = 1), widths = c(1,1))
        beeswarm(expr ~ resp, col=c(1:length(unique(resp))),
                 pch=16, xlab='Response Categories', ylab='Expression')
        par(xpd = NA)
        aronroc(expr, resp == category)
        title(main, outer=TRUE)
        dev.off()
    }
    ''')
    try:
        robjects.r['plotResps'](filename=filewrite, expr=expr, resp=resp,
                                category=category, main=title)
        return filename
    except Exception:  # was a bare except
        return 'Error'
def run_code(self, code_str,
             use_input=False,
             use_output=False,
             excluded_inputs=set(),
             excluded_outputs=set()):
    """run_code runs a piece of code as a VisTrails module.
    use_input and use_output control whether to use the input port and
    output port dictionaries as local variables inside the execution."""
    import vistrails.core.packagemanager

    def fail(msg):
        raise ModuleError(self, msg)

    def cache_this():
        self.is_cacheable = lambda *args, **kwargs: True

    if use_input:
        inputDict = dict([(k, self.getInputFromPort(k))
                          for k in self.inputPorts if k not in excluded_inputs])
        for k, v in inputDict.iteritems():
            robjects.globalEnv[k] = v
    robjects.r(code_str)
    if use_output:
        for k in self.outputPorts:
            if k not in excluded_outputs and k in robjects.globalEnv:
                self.setResult(k, robjects.globalEnv[k])
def ilmoitus_tilastot(self, vaalipiiri=False):
    valinta = ["puolue_lyh", ]
    if vaalipiiri:
        valinta.append("vaalipiiri")
    summat = robjects.r('''function(df) summarise(df,
        ilmoittaneita = length(df$etunimi),
        rahoitus_tot = sum(df$rahoitus_kaikki),
        kulut_tot = sum(df$kulut_kaikki),
        omat_varat = sum(df$omat_varat),
        lainat = sum(df$lainat),
        yksityinen_tuki = sum(df$yksityinen_tuki),
        yritys_tuki = sum(df$yritys_tuki),
        puolue_tuki = sum(df$puolue_tuki),
        puolueyhdistys_tuki = sum(df$puolueyhdistys_tuki),
        valitettu_tuki = sum(df$valitetty_tuki),
        muu_tuki = sum(df$muu_tuki))
    ''')
    data_puolueet = plyr.ddply(self._ilmoitukset, robjects.StrVector(valinta), summat)
    data_puolueet = base.merge(data_puolueet,
                               self.ehdokas_tilastot(vaalipiiri=vaalipiiri))
    data_puolueet.colnames[-1] = "ehdokkaita_tot"
    osuudet = robjects.r('function(a, b) return(a / b)')
    data_puolueet = data_puolueet.cbind(
        data_puolueet,
        base.round(osuudet(data_puolueet.rx("ilmoittaneita"),
                           data_puolueet.rx("ehdokkaita_tot")), 2),
        base.round(osuudet(data_puolueet.rx("rahoitus_tot"),
                           data_puolueet.rx("ilmoittaneita")), 2),
        robjects.IntVector([2011, ]))
    data_puolueet.colnames[-3] = "ilmoittaneita_pros"
    data_puolueet.colnames[-2] = "rahoitus_suht"
    data_puolueet.colnames[-1] = "vuosi"
    return data_puolueet
def Rconverter(Robj, dataframe=False):
    """
    Convert an object in R's namespace to one suitable
    for ipython's namespace.

    For a data.frame, it tries to return a structured array.
    It first checks for colnames, then names.
    If all are NULL, it returns np.asarray(Robj), else
    it tries to construct a recarray

    Parameters
    ----------
    Robj: an R object returned from rpy2
    """
    is_data_frame = ro.r('is.data.frame')
    colnames = ro.r('colnames')
    rownames = ro.r('rownames')  # with pandas, these could be used for the index
    names = ro.r('names')

    if dataframe:
        as_data_frame = ro.r('as.data.frame')
        cols = colnames(Robj)
        _names = names(Robj)
        if cols != ri.NULL:
            Robj = as_data_frame(Robj)
            names = tuple(np.array(cols))
        elif _names != ri.NULL:
            names = tuple(np.array(_names))
        else:  # failed to find names
            return np.asarray(Robj)
        Robj = np.rec.fromarrays(Robj, names=names)
    return np.asarray(Robj)
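# A minimal sketch exercising Rconverter on a small R data.frame (assumes the
# same imports as the function: rpy2.robjects as ro, numpy as np). The exact
# return shape depends on how the installed rpy2 version iterates over data
# frames, so treat this as illustrative rather than guaranteed.
df = ro.r('data.frame(x=1:3, y=c(10.0, 20.0, 30.0))')
rec = Rconverter(df, dataframe=True)
print(rec.dtype.names)  # expected: ('x', 'y'), taken from the R colnames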
def geno_to_rqtl_function(self):
    # TODO: Need to figure out why some genofiles have the wrong format and don't convert properly
    print("Adding some custom helper functions to the R environment")
    ro.r("""
       trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) }

       getGenoCode <- function(header, name = 'unk'){
         mat = which(unlist(lapply(header,function(x){ length(grep(paste('@',name,sep=''), x)) })) == 1)
         return(trim(strsplit(header[mat],':')[[1]][2]))
       }

       GENOtoCSVR <- function(genotypes = 'BXD.geno', out = 'cross.csvr', phenotype = NULL, sex = NULL, verbose = FALSE){
         header = readLines(genotypes, 40)                                           # Assume a geno header is not longer than 40 lines
         toskip = which(unlist(lapply(header, function(x){ length(grep("Chr\t", x)) })) == 1)-1  # Major hack to skip the geno headers
         genocodes <- c(getGenoCode(header, 'mat'), getGenoCode(header, 'het'), getGenoCode(header, 'pat'))  # Get the genotype codes
         type <- getGenoCode(header, 'type')
         genodata <- read.csv(genotypes, sep='\t', skip=toskip, header=TRUE, na.strings=getGenoCode(header,'unk'), colClasses='character', comment.char = '#')
         cat('Genodata:', toskip, " ", dim(genodata), genocodes, '\n')
         if(is.null(phenotype)) phenotype <- runif((ncol(genodata)-4))               # If there isn't a phenotype, generate a random one
         if(is.null(sex)) sex <- rep('m', (ncol(genodata)-4))                        # If there isn't a sex phenotype, treat all as males
         outCSVR <- rbind(c('Pheno', '', '', phenotype),                             # Phenotype
                          c('sex', '', '', sex),                                     # Sex phenotype for the mice
                          cbind(genodata[,c('Locus','Chr', 'cM')], genodata[, 5:ncol(genodata)]))  # Genotypes
         write.table(outCSVR, file = out, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=',')  # Save it to a file
         require(qtl)
         cross = read.cross(file=out, 'csvr', genotypes=genocodes)                   # Load the created cross file using R/qtl read.cross
         if(type == 'riset') cross <- convert2riself(cross)                          # If its a RIL, convert to a RIL in R/qtl
         return(cross)
       }
    """)
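# Hedged usage sketch: once geno_to_rqtl_function() has run, the GENOtoCSVR
# helper lives in the R global environment and can be called from Python.
# The file names below are hypothetical and the qtl R package must be installed.
#
#   cross = ro.r['GENOtoCSVR']('BXD.geno', 'cross.csvr')
#   ro.r['summary'](cross)  # inspect the resulting R/qtl cross object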
def _normalize_tabbed(self, rfile):
    """input: raw tabbed matrix file (with column and row headers)
    return: normalized tabbed matrix file (with column and row headers)"""
    nfile = Ipy.TMP_DIR + '/norm.' + random_str() + '.tab'
    rcmd = 'source("%s")\nMGRAST_preprocessing(file_in="%s", file_out="%s", produce_fig="FALSE")\n' % (
        Ipy.LIB_DIR + '/preprocessing.r', rfile, nfile)
    ro.r(rcmd)
    return nfile
def annotated_refs(in_file, ref, config, out_file):
    """Use BioMart at Ensembl to add descriptions to each row.
    """
    rpy2.r.assign('in.file', in_file)
    rpy2.r.assign('out.file', out_file)
    rpy2.r.assign("org", ref.get("ensembl_name", ""))
    rpy2.r('''
      library(biomaRt)
      options(stringsAsFactors=FALSE)
      in.tbl <- read.csv(in.file, header=TRUE)
      in.tbl$pctsimilar <- in.tbl$hitidentities / in.tbl$hitlength
      print(summary(in.tbl))
      nohits <- sum(in.tbl$hit == "")
      total <- length(in.tbl$hit)
      print(c("No hits", nohits, "Percent hit", (total - nohits) / total))
      if (org != "") {
        txs <- unique(in.tbl$hit)
        mart <- useMart("ensembl", dataset=org)
        attrs <- c("ensembl_transcript_id", "embl", "description")
        filters <- c("ensembl_transcript_id")
        mart.result <- getBM(attributes=attrs, filters=filters, values=txs, mart=mart)
        names(mart.result) <- c("hit", "genbank.id", "description")
        final <- merge(in.tbl, mart.result, by="hit", all.x=TRUE)
        final.sort <- final[order(final$query),]
        print(head(final.sort))
        write.csv(final.sort, out.file, row.names=FALSE, na="")
      }
    ''')
def save_model(self, file_name="stm_model.RData"):
    '''
    Save the fitted model as an R object.

    Parameters:
    file_name - string
        The name of the file where the data will be saved
    '''
    if not self.trained:
        print "The model has not been fitted yet."
    else:
        modsave = {"mu": robjects.ListVector(self.mu),
                   "sigma": self.sigma,
                   "beta": robjects.ListVector({"beta": self.logbeta,
                                                "logbeta": self.logbeta}),
                   "settings": robjects.ListVector(self.settings),
                   "vocab": self.vocab,
                   "convergence": self.convergence,
                   "theta": self.theta,
                   "eta": self.eta,
                   "invsigma": self.invsigma}
        robjects.globalenv["modsave"] = robjects.ListVector(modsave)
        # was `filename`, an undefined name; the parameter is `file_name`
        robjects.r("save(modsave, file='" + file_name + "')")
        print "STM model saved as R object."
def start(self):
    filePath = QFileInfo(self.outName).absoluteFilePath()
    filePath.replace("\\", "/")
    file_name = QFileInfo(self.outName).baseName()
    driver_list = self.driver.split("(")
    self.driver = driver_list[0]
    self.driver.chop(1)
    extension = driver_list[1].right(5)
    extension.chop(1)
    if self.driver == "GeoTIFF":
        self.driver = "GTiff"
    elif self.driver == "Erdas Imagine Images":
        self.driver = "HFA"
    elif self.driver == "Arc/Info ASCII Grid":
        self.driver = "AAIGrid"
    elif self.driver == "ENVI Header Labelled":
        self.driver = "ENVI"
    elif self.driver == "JPEG-2000 part 1":
        self.driver = "JPEG2000"
    elif self.driver == "Portable Network Graphics":
        self.driver = "PNG"
    elif self.driver == "USGS Optional ASCII DEM":
        self.driver = "USGSDEM"
    if not filePath.endsWith(extension, Qt.CaseInsensitive) and self.driver != "ENVI":
        filePath = filePath.append(extension)
    if not filePath.isEmpty():
        if self.driver in ("AAIGrid", "JPEG2000", "PNG", "USGSDEM"):
            r_code = "saveDataset(dataset=copyDataset(create2GDAL(dataset=%s, type='Float32'), driver='%s'), filename='%s')" % (
                unicode(self.layerName), unicode(self.driver), unicode(filePath))
            robjects.r(r_code)
        else:
            r_code = "writeGDAL(dataset=%s, fname='%s', drivername='%s', type='Float32')" % (
                unicode(self.layerName), unicode(filePath), unicode(self.driver))
            robjects.r(r_code)
    rlayer = QgsRasterLayer(unicode(filePath), unicode(file_name))
    return rlayer
def test_anova():
    "Test ANOVA"
    from rpy2.robjects import r
    r_require('car')

    ds = datasets.get_uv()
    ds.to_r('ds')

    # fixed effects
    aov = test.anova('fltvar', 'A*B', ds=ds)
    print aov
    fs = run_on_lm_fitter('fltvar', 'A*B', ds)
    r_res = r("Anova(lm(fltvar ~ A * B, ds, type=2))")
    assert_f_tests_equal(aov.f_tests, r_res, fs, 'Anova')

    # random effects
    aov = test.anova('fltvar', 'A*B*rm', ds=ds)
    print aov
    fs = run_on_lm_fitter('fltvar', 'A*B*rm', ds)
    r('test.aov <- aov(fltvar ~ A * B + Error(rm / (A * B)), ds)')
    print r('test.summary <- summary(test.aov)')
    r_res = r['test.summary'][1:]
    assert_f_tests_equal(aov.f_tests, r_res, fs, 'rmaov')

    # not fully specified model with random effects
    assert_raises(NotImplementedError, test.anova, 'fltvar', 'A*rm', ds=ds)

    # empty cells
    dss = ds.sub("A%B != ('a2', 'b2')")
    assert_raises(NotImplementedError, test.anova, 'fltvar', 'A*B', ds=dss)
    assert_raises(NotImplementedError, run_on_lm_fitter, 'fltvar', 'A*B', ds=dss)
    dss = ds.sub("A%B != ('a1', 'b1')")
    assert_raises(NotImplementedError, test.anova, 'fltvar', 'A*B', ds=dss)
    assert_raises(NotImplementedError, run_on_lm_fitter, 'fltvar', 'A*B', ds=dss)
def draw_quality_plot(db_file, plot_file, position_select, title):
    """Draw a plot of remapped qualities using ggplot2.

    Remapping information is pulled from the sqlite3 database using sqldf
    according to the position select attribute, which is a selection phrase
    like '> 50' or '=28'.

    plyr is used to summarize data by the original and remapped score for all
    selected positions.

    ggplot2 plots a heatmap of remapped counts at each (original, remap)
    coordinate, with a x=y line added for reference.
    """
    robjects.r.assign('db.file', db_file)
    robjects.r.assign('plot.file', plot_file)
    robjects.r.assign('position.select', position_select)
    robjects.r.assign('title', title)
    robjects.r('''
      library(sqldf)
      library(plyr)
      library(ggplot2)
      sql <- paste("select * from data WHERE position", position.select, sep=" ")
      exp.data <- sqldf(sql, dbname=db.file)
      remap.data <- ddply(exp.data, c("orig", "remap"), transform, count=sum(count))
      p <- ggplot(remap.data, aes(orig, remap)) +
           geom_tile(aes(fill = count)) +
           scale_fill_gradient(low = "white", high = "steelblue", trans="log") +
           opts(panel.background = theme_rect(fill = "white"), title=title) +
           geom_abline(intercept=0, slope=1)
      ggsave(plot.file, p, width=6, height=6)
    ''')
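# Hedged usage sketch for draw_quality_plot; the database file, output file,
# and selection phrase below are hypothetical, and the sqldf/plyr/ggplot2 R
# packages must be installed (note: opts()/theme_rect() date this code to
# ggplot2 versions before 0.9).
#
#   draw_quality_plot('remap-quality.db', 'remap-high.png', '> 50',
#                     'Remapped qualities at positions > 50')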
def _plot_stats(self, bam_name):
    robjects.r.assign('rep_cnt', numpy2ri.numpy2ri(self.frag_rep.keys()))
    robjects.r.assign('rep_freq', numpy2ri.numpy2ri(self.frag_rep.values()))
    robjects.r.assign('size_distr', numpy2ri.numpy2ri(self.frag_size.keys()))
    robjects.r.assign('size_freq', numpy2ri.numpy2ri(self.frag_size.values()))
    robjects.r.assign('nb_frag', self.nb_frag)
    robjects.r.assign('main', bam_name)
    robjects.r("""
rep_cnt = as.integer(rep_cnt)
Od = order(rep_cnt)
rep_freq = as.integer(rep_freq)[Od]*1e-6
rep_cnt = rep_cnt[Od]
I100 = rep_cnt<100
rep_cnt = c(rep_cnt[I100],100)
rep_freq = c(rep_freq[I100],sum(rep_freq[!I100]))
size_distr = as.integer(size_distr)
Od = order(size_distr)
size_freq = as.integer(size_freq)[Od]/nb_frag
size_distr = size_distr[Od]
par(mfrow=c(2,1),lwd=2,cex=1.1,cex.main=1.3,cex.lab=1.1,cex.axis=.8,oma=c(0,0,3,0),mar=c(5,5,1,1),las=1,pch=20)
plot(rep_cnt,rep_freq,type='s',main='Fragment redundancy',xlab='Nb of copies',ylab='Frequency (millions)',
     log='y',xlim=c(1,100),xaxt='n',ylim=c(1e-6,nb_frag*1e-6))
abline(h=nb_frag*1e-6,col='red')
text(50,nb_frag*1e-6,nb_frag,col='red',pos=1)
axis(side=1,at=seq(10,100,by=10),labels=c(seq(10,90,by=10),">100"))
plot(size_distr,size_freq,type='s',main='Fragment size distribution',xlab='Size',ylab='Density')
title(main=main,outer=T)
""")
def transformDFByRFunction(self, df, function, library="", *args):
    r_df = pandas_df_to_r_df(df)
    if library != "":
        import_ = r.r("library(%s)" % library)
    function_ = r.r(function)
    r_updated_df = function_(r_df, *args)
    return r_matrix_to_dataframe(r_updated_df)
def openRecodificar(self):
    # Open the dialog
    self.dialogUi = self.d_recodificar
    self.dialogUi.setWindowTitle("Recodificar")
    self.dialogUi.show()

    # Initialize the comboboxes
    def f(x):
        print x
    rinterface.set_writeconsole(f)
    n_items = "length(names(datos))"
    n_items = robjects.r(n_items)
    n_items = n_items[0]
    self.dialogUi.cb_variable_reco.clear()
    self.dialogUi.cb_variable_reco.addItem("**Variable no seleccionada**")
    for i in range(n_items):
        item_factor = "names(datos)[" + str(i + 1) + "]"
        item_factor = robjects.r(item_factor)
        self.dialogUi.cb_variable_reco.addItem(str(item_factor[0]))
    self.dialogUi.cb_variable_reco.currentIndexChanged.connect(self.changeComboBox)
    rinterface.set_writeconsole(rinterface.consolePrint)

    # Signals
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("accepted()"),
                           self.accept, QtCore.Qt.UniqueConnection)
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("rejected()"),
                           self.cancel, QtCore.Qt.UniqueConnection)
def estimate_clusters_numbers(self, X):
    nr, nc = X.shape
    X_trainr = ro.r.matrix(X, nrow=nr, ncol=nc)
    ro.r.assign("matrix_val", X_trainr)
    ro.r("NUMC = 2:20")
    ro.r("out <- SIMLR_Estimate_Number_of_Clusters(t(log(1+matrix_val)), NUMC = NUMC, cores.ratio = 0)")
    self.estimated_clusters = ro.r("out$K1")
def read_spss_to_df(self):
    """Use R functions to read SPSS files.

    Input: none (uses self._file_path).
    Output: a tuple of a pandas DataFrame and an np array of descriptions
    of the column names (i.e. feature descriptions).
    """
    from rpy2.robjects import r
    from string import Template
    from rpy2.robjects import pandas2ri
    import unicodedata

    file_location = self._file_path  # e.g. "./1 - 110778/110778.sav"
    file_location_csv = file_location[:-4] + ".csv"
    r_code = Template('''
        library(foreign)
        library(plyr)
        df <- read.spss("$origin_file", to.data.frame=TRUE)
        desc <- attr(df, "variable.labels")
        write.csv(df, file="$output_file", na="")
    ''')
    # Substitute input and output file with variables from python
    r_code = r_code.substitute(origin_file=file_location,
                               output_file=file_location_csv)
    r(r_code)  # Run the above R code in the R global environment
    df = pandas2ri.ri2py(r('df'))  # convert from R data frame into pandas data frame
    # Translate unicode encoding into ascii encoding
    df = df.applymap(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
                     if type(x) == unicode else x)
    desc = pandas2ri.ri2py(r('desc'))  # convert into python variable
    for j, ele in enumerate(desc):
        if type(desc[j]) == np.unicode_:
            # http://stackoverflow.com/questions/1207457/convert-a-unicode-string-to-a-string-in-python-containing-extra-symbols
            desc[j] = str(unicodedata.normalize('NFKD', desc[j]).encode('ascii', 'ignore'))
    desc = desc.astype(np.string_)
    return df, desc
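# Hedged usage sketch (the reader class name is hypothetical; the method only
# relies on self._file_path pointing at an SPSS .sav file and on the foreign
# and plyr R packages being installed):
#
#   reader = SpssReader('./survey.sav')
#   df, desc = reader.read_spss_to_df()
#   print df.head()
#   print desc[:5]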
def openPercentiles(self):
    self.dialogUi = self.d_percentiles
    self.dialogUi.setWindowTitle("Percentiles")
    self.dialogUi.show()

    # Initialize the comboboxes
    def f(x):
        print x
    rinterface.set_writeconsole(f)
    n_items = "length(names(datos))"
    n_items = robjects.r(n_items)
    n_items = n_items[0]
    self.dialogUi.cb_variable.clear()
    self.dialogUi.cb_factor.clear()
    self.dialogUi.cb_factor.addItem(str("Sin factor"))
    for i in range(n_items):
        item_factor = "names(datos)[" + str(i + 1) + "]"
        item_factor = robjects.r(item_factor)
        self.dialogUi.cb_variable.addItem(str(item_factor[0]))
        self.dialogUi.cb_factor.addItem(str(item_factor[0]))
    rinterface.set_writeconsole(rinterface.consolePrint)

    # Signals
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("accepted()"),
                           self.acceptPercentiles, QtCore.Qt.UniqueConnection)
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("rejected()"),
                           self.cancel, QtCore.Qt.UniqueConnection)
def openHistograma(self):
    self.dialogUi = self.d_histograma
    self.dialogUi.setWindowTitle("Histograma")
    self.dialogUi.show()

    # Open the multi-selector
    self.listMulti = []
    self.dialogUi.var_select.clear()
    QtCore.QObject.connect(self.dialogUi.pushMultiSelector, QtCore.SIGNAL("clicked()"),
                           self.openMultiSelector, QtCore.Qt.UniqueConnection)

    # Initialize the comboboxes
    def f(x):
        print x
    rinterface.set_writeconsole(f)
    n_items = "length(names(datos))"
    n_items = robjects.r(n_items)
    n_items = n_items[0]
    self.dialogUi.cb_variable.clear()
    for i in range(n_items):
        item_factor = "names(datos)[" + str(i + 1) + "]"
        item_factor = robjects.r(item_factor)
        self.dialogUi.cb_variable.addItem(str(item_factor[0]))
    rinterface.set_writeconsole(rinterface.consolePrint)

    # Signals
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("accepted()"),
                           self.acceptHistograma, QtCore.Qt.UniqueConnection)
    QtCore.QObject.connect(self.dialogUi.buttonBox, QtCore.SIGNAL("rejected()"),
                           self.cancel, QtCore.Qt.UniqueConnection)
def create_dataframe(self):
    """create & save a R dataframe in dataframe field"""
    if self.model is not None:
        ModelField = Pool().get('ir.model.field')
        model_fields = ModelField.search([('model', '=', self.model)])
        model = Pool().get(self.model.model)
        records = model.search([])
        fields_info = [FieldInfo(field.name, field.ttype)
                       for field in model_fields]
        df = dataframe(records, fields_info)
        self.data = buffer(pickle.dumps(df))
        self.save()
    elif self.script is not None:
        # clean R workspace
        # robjects.r['source'] could be used instead of robjects.r
        robjects.r("""rm(list = ls(all.names=TRUE))""")
        try:
            # run code uploaded by users
            try:
                robjects.r(self.script.code)
            except RRuntimeError, err:
                self.raise_user_error('r_error', (err,))
            globalenv = robjects.r["globalenv"]()
            try:
                obj = globalenv['out']
            except LookupError:
                obj = None
            if isinstance(obj, robjects.DataFrame):
                self.data = buffer(pickle.dumps(obj))
            else:
                self.data = None
        finally:
def read(self, filename):
    """Parse content and metadata of markdown files"""
    QUIET = self.settings.get('RMD_READER_KNITR_QUIET', True)
    ENCODING = self.settings.get('RMD_READER_KNITR_ENCODING', 'UTF-8')
    CLEANUP = self.settings.get('RMD_READER_CLEANUP', True)
    RENAME_PLOT = self.settings.get('RMD_READER_RENAME_PLOT', True)
    logger.debug("RMD_READER_KNITR_QUIET = %s", QUIET)
    logger.debug("RMD_READER_KNITR_ENCODING = %s", ENCODING)
    logger.debug("RMD_READER_CLEANUP = %s", CLEANUP)
    logger.debug("RMD_READER_RENAME_PLOT = %s", RENAME_PLOT)
    # replace single backslashes with double backslashes
    filename = filename.replace('\\', '\\\\')
    # parse Rmd file - generate md file
    md_filename = filename.replace('.Rmd', '.aux').replace('.rmd', '.aux')
    if RENAME_PLOT:
        chunk_label = os.path.splitext(os.path.basename(filename))[0]
        logger.debug('Chunk label: %s', chunk_label)
        robjects.r('''
            opts_knit$set(unnamed.chunk.label="{unnamed_chunk_label}")
            render_markdown()
            hook_plot <- knit_hooks$get('plot')
            knit_hooks$set(plot=function(x, options) hook_plot(paste0("{{filename}}/", x), options))
        '''.format(unnamed_chunk_label=chunk_label))
    knitr.knit(filename, md_filename, quiet=QUIET, encoding=ENCODING)
    # read md file - create a MarkdownReader
    md_reader = readers.MarkdownReader(self.settings)
    content, metadata = md_reader.read(md_filename)
    # remove md file
    if CLEANUP:
        os.remove(md_filename)
    return content, metadata
def adjust_pvalue(input_path, method):
    '''Uses R to adjust the pvalues'''
    try:
        if method == "None":
            return
        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = open(os.devnull, "w")
        r_script = """
        t5 = as.matrix(read.table(\"""" + input_path + """\", sep="\t", header=T, row.names=1))
        rptemp <- t5
        rp1 <- apply(abs(t5), 2, function(x) {p.adjust(x, \"""" + method + """\")})
        for (i in 1:nrow(t5)){
            for (j in (1:ncol(t5))) {
                if (rptemp[i,j] < 0) { rp1[i,j] <- -1*rp1[i,j]}
            }
        }
        write.table(rp1, \"""" + input_path + """\", sep="\t")
        """
        robjects.r(r_script)
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    except Exception, e:
        logger.error("R CRASHED")
        logger.error(traceback.print_exc())
        logger.error(str(e))
def adjust_detailed_pvalue(input_path, method):
    try:
        if method == "None":
            return
        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = open(os.devnull, "w")
        r_script = """
        t5 <- as.data.frame(read.table(\"""" + input_path + """\", sep="\t", header=TRUE))
        rp1 <- p.adjust(abs(t5$P.value))
        for (i in 1:length(t5$P.value)) {
            if(t5$P.value[i] < 0) { rp1[i] <- -1*rp1[i] }
        }
        t5 <- cbind(t5, P.adj=rp1)
        write.table(t5, \"""" + input_path + """\", sep="\t", row.names=F, quote=F)
        """
        robjects.r(r_script)
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
    except Exception, e:
        logger.error("R CRASHED")
        logger.error(traceback.print_exc())
        logger.error(str(e))
def testRS4Auto_Type(self):
    robjects.r("library(stats4)")

    class MLE(robjects.methods.RS4):
        __metaclass__ = robjects.methods.RS4Auto_Type
        __rname__ = "mle"
        __rpackagename__ = "stats4"
def save_rdata(ids, model_name, filename):
    """save data from model and one level of joined data
    (one2one, one2many, many2many and many2one)"""
    list_map = {}
    add_to_map(set(ids), model_name, list_map)
    df = {}
    tmpdir = os.path.dirname(filename)
    imagedir = tmpdir + "/images"
    os.mkdir(imagedir)
    for mod_name, id_list in list_map.iteritems():
        model = Pool().get(mod_name)
        records = model.search([('id', 'in', list(id_list))])
        fields_info = [FieldInfo(name, ttype._type)
                       for name, ttype in model._fields.iteritems()
                       if ttype._type in py2r]
        df[mod_name] = dataframe(records, fields_info)
        print "saving in Rdata: ", mod_name, list(id_list)
        # save images
        for name, ttype in model._fields.iteritems():
            if ttype._type == "binary" and name[-4:] == "_map":
                for record in records:
                    value = getattr(record, name)
                    imgpath = os.path.join(
                        imagedir,
                        (str(record) + '_' + name).replace(',', '_').replace('.', '_') + '.png')
                    print "SAVING ", imgpath
                    imgfile = open(imgpath, 'wb')
                    imgfile.write(value)
    for mod_name, dfr in df.iteritems():
        robjects.r.assign(mod_name, dfr)
    robjects.r("save(list=c(" +
               ','.join(["'" + mod_name + "'" for mod_name, dfr in df.iteritems()]) +
               "), file='" + filename + "')")
def testRS4_TypeAccessors(self):
    robjects.r['setClass']("R_A", robjects.r('list(foo="numeric")'))
    robjects.r['setMethod']("length", signature="R_A",
                            definition=robjects.r("function(x) 123"))

    class R_A(methods.RS4):
        __metaclass__ = methods.RS4_Type
        __slots__ = ('get_length', 'length')
        __accessors__ = (('length', None, 'get_length', False, 'get the length'),
                         ('length', None, None, True, 'length'))

        def __init__(self):
            obj = robjects.r['new']('R_A')
            self.__sexp__ = obj.__sexp__

    class A(R_A):
        __rname__ = 'R_A'

    ra = R_A()
    self.assertEquals(123, ra.get_length()[0])
    self.assertEquals(123, ra.length[0])
    a = A()
    self.assertEquals(123, a.get_length()[0])
    self.assertEquals(123, a.length[0])
def testRS4_TypeAccessors(self): robjects.r["setClass"]("R_A", robjects.r('list(foo="numeric")')) robjects.r["setMethod"]("length", signature="R_A", definition=robjects.r("function(x) 123")) class R_A(methods.RS4): __metaclass__ = methods.RS4_Type __slots__ = ("get_length", "length") __accessors__ = ( ("length", None, "get_length", False, "get the length"), ("length", None, None, True, "length"), ) def __init__(self): obj = robjects.r["new"]("R_A") self.__sexp__ = obj.__sexp__ class A(R_A): __rname__ = "R_A" ra = R_A() self.assertEqual(123, ra.get_length()[0]) self.assertEqual(123, ra.length[0]) a = A() self.assertEqual(123, a.get_length()[0]) self.assertEqual(123, a.length[0])
def updateRObjects(self):
    splayers = currentRObjects()
    for widget in self.widgets:
        if isinstance(widget, (SpComboBox, SpListWidget)):
            sptypes = widget.spTypes()
            for sptype in sptypes:
                for layer in splayers.keys():
                    if splayers[layer] == sptype.strip() or sptype.strip() == "all":
                        value = layer
                        widget.addItem(value)
                    if splayers[layer] in VECTORTYPES \
                            and (sptype.strip() == "data.frame" or sptype.strip() == "all"):
                        value = layer + "@data"
                        widget.addItem(value)
                    if splayers[layer] in VECTORTYPES or splayers[layer] == "data.frame":
                        for item in list(robjects.r('names(%s)' % (layer))):
                            if splayers[layer] == "data.frame":
                                value = layer + "$" + item
                            else:
                                value = layer + "@data$" + item
                            if str(robjects.r('class(%s)' % (value))[0]) == sptype.strip() \
                                    or sptype.strip() == "all":
                                widget.addItem(value)
import os

from IPython.display import Image
import rpy2.robjects as robjects
import pandas as pd
from rpy2.robjects import pandas2ri
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter

read_delim = robjects.r('read.delim')
seq_data = read_delim('sequence.index', header=True, stringsAsFactors=False)
# In R:
#   seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE)
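# A minimal sketch of pulling seq_data into pandas with the converters already
# imported above. rpy2 3.x exposes rpy2py on the conversion module; older
# versions used pandas2ri.ri2py instead, so adjust for your installed version.
from rpy2.robjects import conversion

with localconverter(default_converter + pandas2ri.converter):
    seq_df = conversion.rpy2py(seq_data)
print(seq_df.columns[:5])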
def init_r_system():
    r('require("missForest")')
    r('require("mice")')  # the CRAN package name is lowercase, not "MICE"
    r('require("EMB")')
    r('require("Amelia")')
    r('require("matrix_completion")')
    r('require("softImpute")')
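# Hedged note: require() returns FALSE rather than raising an error when a
# package is missing, so failures above pass silently. A stricter variant
# could check the return value, e.g.:
#
#   if not r('require("missForest")')[0]:
#       raise ImportError("R package 'missForest' is not installed")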
def pca_outliers(adata, min_genes_per_cell=5, verbose=True): """ Function to filter outliers using scater PCA on quality measures """ import numpy as np import rpy2.robjects as ro import anndata2ri import scanpy as sc from rpy2.robjects import pandas2ri from scipy.sparse import issparse import rpy2.rinterface_lib.callbacks import logging if not verbose: rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) ro.r('library(scater)') pandas2ri.activate() anndata2ri.activate() print("Loading objects into R") if issparse(adata.X): ro.globalenv['rawMatrix'] = adata.X.T.todense() else: ro.globalenv['rawMatrix'] = adata.X.T ro.globalenv['variables'] = adata.var_names.copy() ro.globalenv['observations'] = adata.obs[['total_counts']] print('Calculate PCA outliers') ro.r('') ro.r('pd <- DataFrame(data = observations)') ro.r('colnames(rawMatrix) <- rownames(pd)') ro.r('rownames(rawMatrix) <- variables') ro.r( 'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)' ) ro.r('sce <- calculateQCMetrics(sce)') ro.r('sce <- runPCA(sce, use_coldata = TRUE, detect_outliers = TRUE)') ro.r('cat("Nr of outliers detected:", sum(sce$outlier), sep=" ")') ro.r('outlier2 = sce@colData@rownames[sce$outlier]') ro.r( 'plotReducedDim(sce, use_dimred="PCA", shape_by = "outlier", size_by = "total_counts", colour_by = "total_features_by_counts")' ) outlier2 = ro.r('outlier2') adata = adata[np.invert(np.in1d(adata.obs_names, outlier2))].copy() sc.pp.filter_genes(adata, min_cells=min_genes_per_cell) return adata
def pca_covariates(adata, covariates=['total_counts'], verbose=False): """ Function to output R^2 of covariates against PCA projection """ import numpy as np import pandas as pd import rpy2.robjects as ro import anndata2ri import scanpy as sc from rpy2.robjects import pandas2ri from scipy.sparse import issparse import rpy2.rinterface_lib.callbacks import logging if not verbose: rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR) import seaborn as sns import matplotlib.pyplot as plt ro.r('library(scater)') pandas2ri.activate() anndata2ri.activate() print("Loading objects into R") if issparse(adata.X): ro.globalenv['rawMatrix'] = np.log1p(adata.X.T.todense()) else: ro.globalenv['rawMatrix'] = np.log1p(adata.X.T) ro.globalenv['observations'] = adata.obs[covariates] print('Calculate PCA covariates') ro.r('pd <- DataFrame(data = observations)') #ro.r('print(pd[1:5,])') ro.r('colnames(rawMatrix) <- rownames(pd)') ro.r( 'sce <- SingleCellExperiment(assays = list(counts = as.matrix(rawMatrix) ), colData = pd)' ) commandString = 'getVarianceExplained(sce, exprs_values = "counts", variables = c(' variables = ['"data.' + i + '"' for i in covariates] commandString = commandString + ','.join(variables) + ') )' print("using the R command") print(commandString) vals = ro.r(commandString) medians = np.argsort(-np.median(vals, 0)) medianVals = -np.sort(-np.median(vals, 0)) vals = pd.DataFrame(vals[:, medians]) #print(covariates) #print(medians) vals.columns = np.asarray(covariates)[medians] plt.rcParams['figure.figsize'] = (8, 8) f, ax = plt.subplots(1) for nn, mm in zip(vals.columns, medianVals): sns.kdeplot(vals[nn], ax=ax, label=nn, clip=(mm, 97), gridsize=100) ax.set_xscale("symlog") #plt.xlim(0,100) ax.legend(title="Covariates", loc='best') adata.uns['pca_covariates'] = vals return adata
# %% import rpy2.robjects as ro from rpy2.robjects.packages import importr import sys sys.path.append("../") from fumipo_stat.util import py2r base = importr('base') ro.r('print("Hello R")') ro.r("x <- c(0, 1, 2, 3, 4)") ro.r("y <- c(0, 1, 0, 2, 3)") result = ro.r("lm(y~x)") print(result) print(base.summary(result)) assert list(base.summary(result).rx(4)[0].names.rx2(1)) == ["(Intercept)", "x"] assert list(base.summary(result).rx(4)[0].names.rx2(2)) == [ "Estimate", "Std. Error", "t value", "Pr(>|t|)" ] assert list(base.summary(result).rx(4)[0].rownames) == ["(Intercept)", "x"] # %% import numpy as np import pandas as pd from rpy2.robjects import pandas2ri, numpy2ri from rpy2.robjects.conversion import localconverter ro.r("x <- c(0, 1, 2, 3, 4)") ro.r("y <- c(0, 1, 0, 2, 3)")
def load_edgeR(): """ Load edgeR library into R. """ robj.r("library(edgeR)")
import re
import subprocess
import sys
from collections import OrderedDict

import pandas as pd
import rpy2.robjects as robjects

output = sys.argv[1]
softwares = OrderedDict()
# TODO: need to replace hard coded software with retrieving ones in the toolsinfo
softwares["Pipeline"] = 'r20160208'
skewer_ver = subprocess.check_output('echo `/mnt/software/skewer/skewer -v`',
                                     shell=True).decode("utf-8")
softwares["skewer"] = re.split("[: ]+", skewer_ver)[2]  # filter(None, re.split("[: ]+", skewer_ver))[2]
star_ver = subprocess.check_output("/mnt/software/STAR-dir/STAR-2.4.2a/STAR --version",
                                   stderr=subprocess.STDOUT, shell=True).decode("utf-8")
softwares["STAR"] = star_ver.split("_")[1]
rsem_ver = subprocess.check_output("/mnt/software/rsem-dir/rsem-1.2.22/rsem-calculate-expression -version",
                                   shell=True).decode("utf-8")
softwares["RSEM"] = rsem_ver.split()[-1]

version = robjects.r("""
function (p) {
    paste(packageVersion(p), collapse=".")
}""")
softwares["R"] = list(version("base"))[0]
softwares["EdgeR"] = list(version("edgeR"))[0]
softwares["EBSeq"] = list(version("EBSeq"))[0]

softwares_df = pd.DataFrame(softwares, index=[0]).T
softwares_df.columns = ['Version']
softwares_html = softwares_df.to_html(classes="table table-bordered table-hover",
                                      escape=False)
with open(output, 'w') as f_out:
    f_out.write(softwares_html)
def gaussian_setup(X, Y, run_CV=True): """ Some calculations that can be reused by methods: lambda.min, lambda.1se, lambda.theory and Reid et al. estimate of noise """ n, p = X.shape Xn = X / np.sqrt((X**2).sum(0))[None, :] numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('Y', Y) numpy2ri.deactivate() rpy.r('X=as.matrix(X)') rpy.r('Y=as.numeric(Y)') l_theory = np.fabs(Xn.T.dot(np.random.standard_normal( (n, 500)))).max(1).mean() * np.ones(p) if run_CV: numpy2ri.activate() rpy.r.assign('X', X) rpy.r.assign('Y', Y) rpy.r('X=as.matrix(X)') rpy.r('Y=as.numeric(Y)') rpy.r('G = cv.glmnet(X, Y, intercept=FALSE, standardize=FALSE)') rpy.r( 'sigma_reid = selectiveInference:::estimate_sigma(X, Y, coef(G, s="lambda.min")[-1]) # sigma via Reid et al.' ) rpy.r("L = G[['lambda.min']]") rpy.r("L1 = G[['lambda.1se']]") L = rpy.r('L') L1 = rpy.r('L1') sigma_reid = rpy.r('sigma_reid')[0] numpy2ri.deactivate() return L * np.sqrt(X.shape[0]) * 1.0001, L1 * np.sqrt( X.shape[0]) * 1.0001, l_theory, sigma_reid else: return None, None, l_theory, None
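# Hedged usage sketch for gaussian_setup (assumes the glmnet and
# selectiveInference R packages have already been loaded into the R session,
# as the cv.glmnet and estimate_sigma calls above require):
#
#   X = np.random.standard_normal((100, 20))
#   Y = X[:, 0] + np.random.standard_normal(100)
#   lam_min, lam_1se, l_theory, sigma_reid = gaussian_setup(X, Y)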
def model_fit(model_options, X, y, *args, **kwargs):
    if model_options.model_name == '1':
        clf = sl.Lasso(alpha=model_options.lambda_value)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '2':
        clf = sl.ElasticNet(alpha=model_options.lambda_value,
                            l1_ratio=model_options.ratio_value)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '3':
        clf = sl.Lars(copy_X=True, eps=model_options.lambda_value,
                      fit_intercept=True, fit_path=True, normalize=True,
                      positive=False, precompute='auto', verbose=False)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '4':
        from sklearn.gaussian_process import GaussianProcessRegressor
        clf = GaussianProcessRegressor(kernel=model_options.kernel,
                                       n_restarts_optimizer=3,
                                       random_state=2018)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '5':
        from sklearn.gaussian_process import GaussianProcessRegressor
        clf = GaussianProcessRegressor(kernel=model_options.kernel,
                                       normalize_y=True,  # was 'T', a truthy string
                                       n_restarts_optimizer=3,
                                       random_state=2018)
        clf.fit(X, y[:, 0])
        return clf
    elif model_options.model_name == '6':
        clf = sl.LogisticRegression(penalty=model_options.normSelection,
                                    C=1 / model_options.lambda_value)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '7':
        clf = sl.MultiTaskLasso(alpha=model_options.lambda_value)
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '8':
        X_input = Input(model_options.input_shape)
        X_ = ZeroPadding2D((3, 3))(X_input)
        X_ = Conv2D(32, (7, 7), strides=(1, 1), name='conv0')(X_)
        X_ = BatchNormalization(axis=3, name='bn0')(X_)
        X_ = Activation('relu')(X_)
        X_ = MaxPooling2D((2, 2), name='max_pool')(X_)
        X_ = Flatten()(X_)
        X_ = Dense(1, activation='sigmoid', name='fc')(X_)
        clf = Model(inputs=X_input, outputs=X_)
        clf.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
        clf.fit(X, y, epochs=20, batch_size=50, verbose=1,
                validation_data=(model_options.X_train_valid,
                                 model_options.y_train_valid))
        return clf
    elif model_options.model_name == '9':
        from statsmodels.tsa.ar_model import AR
        clf = AR(X).fit(maxlag=int(model_options.lambda_value))
        return clf
    elif model_options.model_name == '10':
        clf = robjects.r('''mod1 = cubist(x = trainx, y = trainy, committees = 10)''')
        return clf
    elif model_options.model_name == '11':
        # Create a tensor regressor estimator
        if model_options.tensorReg_type == '1':
            clf = KruskalRegressor(weight_rank=model_options.rank + 1, tol=10e-7,
                                   n_iter_max=100, reg_W=1, verbose=0)
        elif model_options.tensorReg_type == '2':
            clf = TuckerRegressor(weight_ranks=[model_options.rank + 1,
                                                model_options.rank + 1],
                                  tol=10e-7, n_iter_max=100, reg_W=1, verbose=0)
        # Fit the estimator to the data
        clf.fit(X, y)
        return clf
    elif model_options.model_name == '12':
        Z = kwargs.get('Z', None)
        clf = InsituEnsemble(model_options.lambda_value, X, y, Z)
        return clf
    elif model_options.model_name == '13':
        from sklearn import svm
        clf = svm.SVC(C=model_options.lambda_value)
        clf.fit(X, y)
        return clf
## A script for extracting info about the patients used in the analysis

## Load necessary modules
from rpy2 import robjects as ro
import numpy as np
import os
ro.r('library(survival)')
import re

## This call will only work if you are running python from the command line.
## If you are not running from the command line manually type in your paths.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))))

## There were three clinical files with nonredundant data. V4.0 is in general
## the most up to date, but it is possible for data in the other files to be
## more up to date. As a result, clinical data will be merged.
f = open(
    os.path.join(BASE_DIR, 'tcga_data', 'UCEC', 'clinical',
                 'nationwidechildrens.org_clinical_follow_up_v4.0_ucec.txt'))

## get the column indexes needed
columns = f.readline().split('\t')
patient_column = columns.index('bcr_patient_barcode')
alive_column = columns.index('last_contact_days_to')
death_column = columns.index('death_days_to')
f.readline()
f.readline()
data = [i.split('\t') for i in f]

## A patient can be listed multiple times in the file. The most recent listing
## (furthest down in the file), contains the most recent
def run(self): """ Run the regression using R """ # Source R script to define the function import rpy2.robjects as ro from rpy2.robjects import pandas2ri from .r_code.r_utilities import ewasresult2py, df_pandas2r r_code_folder = Path(__file__).parent / "r_code" filename = str(r_code_folder / "ewas_r.R") ro.r.source(filename) # Print warnings as they occur ro.r("options(warn=1)") # Lists of regression variables (NULL if empty) bin_vars = ro.StrVector(self.regression_variables["binary"]) cat_vars = ro.StrVector(self.regression_variables["categorical"]) cont_vars = ro.StrVector(self.regression_variables["continuous"]) if len(bin_vars) == 0: bin_vars = ro.NULL if len(cat_vars) == 0: cat_vars = ro.NULL if len(cont_vars) == 0: cont_vars = ro.NULL # Lists of covariates (NULL if empty) dtypes = _get_dtypes(self.data) bin_covars = ro.StrVector( [v for v in self.covariates if (dtypes.loc[v] == "binary")] ) cat_covars = ro.StrVector( [v for v in self.covariates if (dtypes.loc[v] == "categorical")] ) cont_covars = ro.StrVector( [v for v in self.covariates if dtypes.loc[v] == "continuous"] ) if len(bin_covars) == 0: bin_covars = ro.NULL if len(cat_covars) == 0: cat_covars = ro.NULL if len(cont_covars) == 0: cont_covars = ro.NULL # Allow nonvarying covariates by default to match python ewas (warn instead of error) allowed_nonvarying = ro.StrVector(self.covariates) # Run with or without survey design info if self.survey_design_spec is None: # Reset the index on data so that the first column is "ID" (note 'data' becomes a local variable) data = self.data.reset_index(drop=False) data = data[ [ "ID", ] + [c for c in data.columns if c != "ID"] ] with ro.conversion.localconverter( ro.default_converter + pandas2ri.converter ): data_r = df_pandas2r(data) result = ro.r.ewas( d=data_r, bin_vars=bin_vars, cat_vars=cat_vars, cont_vars=cont_vars, y=self.outcome_variable, bin_covars=bin_covars, cat_covars=cat_covars, cont_covars=cont_covars, regression_family=self.family, allowed_nonvarying=allowed_nonvarying, min_n=self.min_n, ) else: # Merge weights into data and get weight name(s) (Note 'data' becomes a local variable) if self.survey_design_spec.single_weight: weights = self.survey_design_spec.weight_name data = pd.merge( self.data, self.survey_design_spec.weight_values, left_index=True, right_index=True, how="left", ) elif self.survey_design_spec.multi_weight: weights = self.survey_design_spec.weight_names data = pd.merge( self.data, pd.DataFrame(self.survey_design_spec.weight_values), left_index=True, right_index=True, how="left", ) else: raise ValueError("Weights must be provided") # Gather optional parts of survey parameters kwargs = dict() # Cluster IDs if self.survey_design_spec.has_cluster: kwargs["ids"] = f"{self.survey_design_spec.cluster_name}" data[ self.survey_design_spec.cluster_name ] = self.survey_design_spec.cluster_values else: kwargs["ids"] = ro.NULL # Strata if self.survey_design_spec.has_strata: kwargs["strata"] = f"{self.survey_design_spec.strata_name}" data[ self.survey_design_spec.strata_name ] = self.survey_design_spec.strata_values # fpc if self.survey_design_spec.has_fpc: kwargs["fpc"] = f"{self.survey_design_spec.fpc_name}" data[ self.survey_design_spec.fpc_name ] = self.survey_design_spec.fpc_values_original # Single cluster setting ro.r( f'options("survey.lonely.psu"="{self.survey_design_spec.single_cluster}")' ) # Reset the index on data so that the first column is "ID" data = data.reset_index(drop=False) data = data[ [ "ID", ] + [c for c in data.columns if c != "ID"] ] with 
ro.conversion.localconverter(
                ro.default_converter + pandas2ri.converter
            ):
                data_r = df_pandas2r(data)

            if self.survey_design_spec.multi_weight:
                # Must convert python dict of var:weight name to a named list in R
                weights = ro.ListVector(weights)

            result = ro.r.ewas(
                d=data_r,
                bin_vars=bin_vars,
                cat_vars=cat_vars,
                cont_vars=cont_vars,
                y=self.outcome_variable,
                bin_covars=bin_covars,
                cat_covars=cat_covars,
                cont_covars=cont_covars,
                regression_family=self.family,
                allowed_nonvarying=allowed_nonvarying,
                min_n=self.min_n,
                weights=weights,
                subset=self.survey_design_spec.subset_array,
                drop_unweighted=self.survey_design_spec.drop_unweighted,
                **kwargs,
            )
        result = ewasresult2py(result)

        # Ensure correct dtypes (floats may be objects if they are all NaN)
        float_cols = [
            "Beta",
            "SE",
            "Variable_pvalue",
            "LRT_pvalue",
            "Diff_AIC",
            "pvalue",
        ]
        result[float_cols] = result[float_cols].astype("float64")

        self.result = result.reset_index(drop=False)
        self.run_complete = True
import numpy as np import pandas as pd # Rpy import rpy2.robjects as rpy from rpy2.robjects import numpy2ri rpy.r( 'suppressMessages(library(selectiveInference)); suppressMessages(library(knockoff))' ) # R libraries we will use rpy.r(""" estimate_sigma_data_splitting = function(X,y, verbose=FALSE){ nrep = 10 sigma_est = 0 nest = 0 for (i in 1:nrep){ n=nrow(X) m=floor(n/2) subsample = sample(1:n, m, replace=FALSE) leftover = setdiff(1:n, subsample) CV = cv.glmnet(X[subsample,], y[subsample], standardize=FALSE, intercept=FALSE, family="gaussian") beta_hat = coef(CV, s="lambda.min")[-1] selected = which(beta_hat!=0) if (verbose){ print(c("nselected", length(selected))) } if (length(selected)>0){ LM = lm(y[leftover]~X[leftover,][,selected]) sigma_est = sigma_est+sigma(LM)
def __call__(self, **kw):
    if kw.get('input_type') == 'Table':
        filename = kw.get('table')
        assert os.path.exists(str(filename)), "File not found: '%s'" % filename
        robjects.r("""
        Mdata = read.delim('%s',row.names=1)
        conds = sapply(strsplit(colnames(Mdata),".",fixed=T),"[[",1)
        """ % filename)
        conds = robjects.r("conds").rx()
    else:
        from QuantifyTable import QuantifyTablePlugin
        assembly = genrep.Assembly(kw.get('assembly'))
        chrmeta = assembly.chrmeta or "guess"
        kw['score_op'] = 'sum'
        signals1 = kw['Group1']['signals1']
        signals2 = kw['Group2']['signals2']
        if not isinstance(signals1, (list, tuple)):
            signals1 = [signals1]
        if not isinstance(signals2, (list, tuple)):
            signals2 = [signals2]
        signals = signals1 + signals2
        kw['SigMulti'] = {'signals': signals}  # to pass it to QuantifyTable plugin
        table = QuantifyTablePlugin().quantify(**kw)
        stracks = []
        norm_factors = []
        for sig in signals:
            assert os.path.exists(str(sig)), "Signal file not found: '%s'." % sig
            _t = track(sig, chrmeta=chrmeta)
            if 'normalization' in _t.info:
                _nf = float(_t.info['normalization'])
            elif 'nreads' in _t.info:
                _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
            else:
                _nf = 1
            stracks.append(_t)
            norm_factors.append(_nf)
        t = track(table, chrmeta=chrmeta,
                  fields=['chr', 'start', 'end', 'name'] +
                         ['score%d' % x for x in range(len(signals))])
        _f = [f for f in t.fields if f.startswith('score')]
        de_list = list(t.read(fields=['name'] + _f))
        t.close()
        os.remove(table)
        # Turn all scores into integers
        de_matrix = numpy.asarray(
            [[int(float(s) * norm_factors[k] + .5) for k, s in enumerate(x[1:])]
             for x in de_list],
            dtype=numpy.float)
        rownames = numpy.asarray([x[0] for x in de_list])
        colnames = numpy.asarray([s.name for s in stracks])
        # if all prefixes are identical within a group, keep this prefix as group identifier.
        if len(list(set([x.split('.')[0] for x in colnames[:len(signals1)]]))) == 1 \
                and len(list(set([x.split('.')[0] for x in colnames[len(signals1):]]))) == 1:
            group1 = colnames[0].split('.')[0]
            group2 = colnames[-1].split('.')[0]
        else:
            group1 = "Group1"
            group2 = "Group2"
        conds = [group1] * len(signals1) + [group2] * len(signals2)
        robjects.r.assign('Mdata', numpy2ri(de_matrix))
        robjects.r.assign('row_names', robjects.StrVector(rownames))
        robjects.r.assign('col_names', robjects.StrVector(colnames))
        robjects.r.assign('conds', robjects.StrVector(conds))
        robjects.r("""
        Mdata = as.data.frame(Mdata,row.names=row_names)
        colnames(Mdata) = col_names
        """)
    robjects.r("""
    library(DESeq)
    if (all(table(conds)>=3)){        # if >3 replicates in all conditions
        method = 'per-condition'      # for each group estimate the variance from its replicates
        sharingMode = 'gene-est-only' # use the per-gene variance estimates only
    } else if (any(table(conds)>1)){  # if few replicates
        method = 'pooled'             # use all groups with replicates to estimate the variance
        sharingMode = 'maximum'       # use the max of the GLM fit and the estimated variance
    } else {                          # if no replicates
        method = 'blind'              # pools all groups together to estimate the variance
        sharingMode = 'fit-only'      # use only the GLM fit across the pooled variance
    }
    cds = newCountDataSet(Mdata, conds)
    cds = estimateSizeFactors(cds)
    test = try({
        cds = estimateDispersions(cds, method=method, fitType='parametric', sharingMode=sharingMode)
    })
    if(class(test) == "try-error") {
        cds = estimateDispersions(cds, method=method, fitType='local', sharingMode=sharingMode)
    }
    """)
    groups = list(set(conds))
    couples = itertools.combinations(groups, 2)
    output = self.temporary_path(fname='DE')
    for c in couples:
        out = "%s_%s-%s.txt" % ((output,) + tuple(c))
        robjects.r("""
        res = nbinomTest(cds, '%s', '%s')
        write.table(res[order(res[,8]),], '%s', row.names=F, quote=F, sep='\t')
        """ % (c[0], c[1], out))
        if kw.get('complete') is None:
            clean = self.clean_deseq_output(out, c)
            shutil.move(clean, out)
        self.new_file(out, 'differential_expression')
    return self.display_time()
            initial_size=N_Xpreb, population_id=0)
    ]
    if debug:
        dd = msprime.DemographyDebugger(
            population_configurations=population_configurations,
            migration_matrix=migration_matrix,
            demographic_events=demographic_events)
        dd.print_history()
        return
    return population_configurations, migration_matrix, demographic_events


# Calculate fROH from empirical data
rread_command = "read.table(file = \"" + eroh_file + "\", header = T)"
r_df = robjects.r(rread_command)
pd_df = pd.DataFrame.from_dict(
    {key: np.asarray(r_df.rx2(key)) for key in r_df.names})

# calculate cROH for each individual
ecroh_pd_df = pd_df.groupby("IID").sum()

# calculate average fROH for the empirical data
eFROH = mean(ecroh_pd_df['KB'] * 1000 / len_chr)
print("Empirical FROH is " + str(eFROH))

# Find the number of variants in empirical data
for line in open(sroh_file):
    if "out of" in line:
        evariants = int(line.split(" ", 1)[0])
print("%d variants in empirical dataset" % (evariants))
from rpy2.robjects.packages import importr from rpy2.robjects import Formula, Environment import rpy2.robjects as ro from rpy2.robjects import FloatVector, ListVector, IntVector, StrVector, NULL stats = importr('stats') base = importr('base') # Create matrix in R v = ro.FloatVector([1.1, 2.2, 3.3, 4.4, 5.5, 6.6]) m = ro.r.matrix(v, nrow=2) m = ro.r['matrix'](v, nrow=2) ctl = FloatVector( [4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53, 5.33, 5.14]) trt = FloatVector( [4.81, 4.17, 4.41, 3.59, 5.87, 3.83, 6.03, 4.89, 4.32, 4.69]) group = base.gl(2, 10, 20, labels=["Ctl", "Trt"]) weight = ctl + trt ro.globalenv["weight"] = weight ro.globalenv["group"] = group lm_D9 = stats.lm("weight ~ group") print(stats.anova(lm_D9)) lm_D90 = stats.lm("weight ~ group - 1") print(base.summary(lm_D90)) res = ro.StrVector(['abc', 'def']) v = ro.FloatVector([1.1, 2.2, 3.3, 4.4, 5.5, 6.6]) m = ro.r['matrix'](v, nrow=2) letters = ro.r['letters'] rcode = 'paste(%s, collapse="-")' % (letters.r_repr()) res = ro.r(rcode)
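# A short follow-on sketch: the fitted coefficients of lm_D9 can be pulled out
# through the same importr-based stats package used above.
coefs = stats.coef(lm_D9)
print(list(coefs.names))  # e.g. ['(Intercept)', 'groupTrt']
print(list(coefs))        # the corresponding estimates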
def setup():
    r("install.packages('batchmeans', repos='http://cran.us.r-project.org')")
    r.require('batchmeans')
"""
Python binding for the copula function from R using Rpy.
"""
from __future__ import division
import numpy as np
import statistics as st
from scipy.interpolate import interp1d
from scipy.stats import kendalltau, pearsonr, spearmanr
from stats import scoreatpercentile
from rpy2.robjects import r
import rpy2.robjects.numpy2ri
rpy2.robjects.numpy2ri.activate()

r("library('copula')")


class Copula():
    """
    This class estimates the parameters of a copula and generates joint
    random variables for those parameters.

    This class has the following three copulas:
        Clayton
        Frank
        Gumbel

    Example:
        x = np.random.normal(size=20)
        y = np.random.normal(size=20)
        foo = Copula(x, y, 'frank')
        u, v = foo.generate_uv(1000)
ts_log_decompose.dropna(inplace=True)
test_stationarity(ts_log_decompose)

# Let's start some modelling! First, inspect the ACF and PACF
# to figure out the p and q values for our ARIMA model.
lag_acf = acf(ts_log_diff, nlags=20)
lag_pacf = pacf(ts_log_diff, nlags=20, method='ols')

plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(ts_log_diff)), linestyle='--', color='gray')
plt.axhline(y=1.96 / np.sqrt(len(ts_log_diff)), linestyle='--', color='gray')
plt.title('Autocorrelation Function')

plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0, linestyle='--', color='gray')
plt.axhline(y=-1.96 / np.sqrt(len(ts_log_diff)), linestyle='--', color='gray')
plt.axhline(y=1.96 / np.sqrt(len(ts_log_diff)), linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
plt.show()

# Auto-ARIMA via R's forecast package
pandas2ri.activate()
ro.r('install.packages("forecast")')
ro.r('library(forecast)')
rdf = pandas2ri.py2ri(ts_log_diff)
ro.globalenv['r_timeseries'] = rdf
pred = ro.r('as.data.frame(forecast(auto.arima(r_timeseries),h=5))')
pred
def execute(snplist, chr):
    out = open(options.o + ".dist", "w")
    count1, count2, count = 0, 0, 1
    rhash, alist, blist = [], [], []
    ## loop through first SNP
    for i in snplist:
        if count % (float(options.r) / 10) == 0:
            print count, "positions processed, now at position:", i.rstrip()
        count += 1
        pos = int(i.split()[1])
        ## loop through second SNP
        for j in snplist[count1:]:
            pos1 = int(j.split()[1])
            ## get allele frequencies
            if pos1 <= pos:
                continue
            afhash = doubleallelecodes(i, j, individual)
            if afhash == "NA":
                print "afhash", i, j
                continue
            ## calculate r2 from allele frequencies
            fi, fj, ftotal, allele1, allele2 = afhash
            rsq = rsquared(fi, fj, ftotal)
            if rsq == "NA":
                print "rsquared", fi, fj
                continue
            rhash.append(rsq)
            ## store positions of SNP1
            alist.append(pos)
            ## store positions of SNP2
            blist.append(pos1)
            #print inversion,rsq,pos,pos1
            out.write("\t".join(map(str, [chr, pos, pos1, rsq, allele1, allele2])) + "\n")
        #print count1,i.rstrip()
        count1 += 1

    #### make x-axis labels based on the assumption that the labels go from 0-1
    #### and now you need to scale the whole genome relative to these borders
    binlist, labellist = [], []
    ## last SNP pos
    upper = max(alist + blist)
    ## first SNP pos
    lower = min(alist + blist)
    ## make sure that there is at least ONE SNP
    if upper - lower != 0:
        ## here calculate the relative step of one basepair
        step = 1 / float(upper - lower)
        invcoo = inversionrect(chr, step)
        ## set step size to 2mb
        co = 2000000
        ## this is the stepsize between the ticks
        stepsize = co * step
        ## this is the start position
        start = (co - lower) * step
        ## bin the steps in a list
        binlist.append(start)
        labellist.append(str(co / 1000000) + "mb")
        co += 2000000
        start += stepsize
        ## append ticks until the step is larger than the position of the last SNP
        while (co < upper):
            labellist.append(str(co / 1000000) + "mb")
            binlist.append(start)
            co += 2000000
            start += stepsize

    ## convert python to R
    cp = robjects.vectors.FloatVector(rhash)
    al = robjects.vectors.IntVector(alist)
    bl = robjects.vectors.IntVector(blist)
    bins = robjects.vectors.FloatVector(binlist)
    labels = robjects.vectors.StrVector(labellist)
    r.assign('values', cp)
    r.assign('al', al)
    r.assign('bl', bl)
    r.assign('bins', bins)
    r.assign('labels', labels)

    ## open graphics device and load libraries
    r('library("LDheatmap")')
    r('png("' + options.o + "_" + chr + '.png",width=5000,height=5000)')

    ## convert distance list to distance matrix
    r('x.names <- sort(unique(c(al, bl)))')
    r('x.dist <- matrix(0, length(x.names), length(x.names))')
    r('dimnames(x.dist) <- list(x.names, x.names)')
    r('x.ind <- rbind(cbind(match(al, x.names), match(bl, x.names)),cbind(match(bl, x.names), match(al, x.names)))')
    r('x.dist[x.ind] <- rep(values, 2)')
    #print r('t(arev(x.dist))')

    ## make LDheatmap grid object based on the r2 values. Use the topo color
    ## palette and put the chromosome and inversion in the title. Also print
    ## the number of SNPs used. Rotate the whole heatmap by 270 degrees.
    r('M<-LDheatmap(x.dist,sort(unique(c(al, bl)),decreasing=F),color=topo.colors(20),flip=T,geneMapLabelX=10000,title="")')
    ## add an X-axis above the heatmap and use the labels generated above
    r('la<-LDheatmap.addGrob(M, grid.xaxis(label=labels,at=bins,main=F,gp=gpar(cex=10),name="axis"),height=0)')
    ## add inversion breakpoints
    if invcoo != "NA":
        invcount = 0
        alphabet = ["a", "b", "c", "d", "e", "f", "g", "h"]
        for coord in invcoo:
            #print coord
            ## add red line for the inversion boundaries
            r('l' + alphabet[invcount + 1] + '<-LDheatmap.addGrob(l' + alphabet[invcount] +
              ', grid.lines(x=c(' + str(coord[0]) + ',' + str(coord[1]) + '),y=' +
              str(1.1 + (invcount / float(5))) + ',gp=gpar( lwd=8,col="red")),height=' +
              str(0.1 + (invcount / float(500))) + ')')
            ## add label for the inversion
            r('l' + alphabet[invcount + 2] + '<-LDheatmap.addGrob(l' + alphabet[invcount + 1] +
              ', grid.text("' + str(coord[2]) + '",x=' + str(coord[0]) + ',y=' +
              str(1.3 + (invcount / float(5))) + ',gp = gpar(cex = 5)),height=' +
              str(0.1 + (invcount / float(500))) + ')')
            invcount += 2
    ## make everything white.
    r('grid.edit("axis", gp = gpar(col = "white"))')
    ## and then just make the ticks and the labels black
    r('grid.edit(gPath("axis", "labels"), gp = gpar(col = "black"))')
    r('grid.edit(gPath("axis", "ticks"), gp = gpar(col = "black",lwd=4))')
    ## resize the linewidth of the segments
    r('grid.edit(gPath("geneMap", "segments"), gp = gpar(lwd = 0.2))')
    ## increase the size of the color key labels
    r('grid.edit("Key", gp = gpar(cex = 8))')
    ## increase the size of the title
    #r('grid.edit(gPath("heatMap", "title"), gp = gpar(cex=0))')
    r('dev.off()')
def test_anova_r_sleep():
    "Test ANOVA accuracy by comparing with R (sleep dataset)"
    from rpy2.robjects import r

    # "sleep" dataset
    print r('data(sleep)')
    ds = Dataset.from_r('sleep')
    ds['ID'].random = True

    # independent measures
    aov = test.ANOVA('extra', 'group', ds=ds)
    fs = run_on_lm_fitter('extra', 'group', ds)
    print r('sleep.aov <- aov(extra ~ group, sleep)')
    print r('sleep.summary <- summary(sleep.aov)')
    r_res = r['sleep.summary'][0]
    assert_f_test_equal(aov.f_tests[0], r_res, 0, fs[0])

    # repeated measures
    aov = test.ANOVA('extra', 'group * ID', ds=ds)
    fs = run_on_lm_fitter('extra', 'group * ID', ds)
    print r('sleep.aov <- aov(extra ~ group + Error(ID / group), sleep)')
    print r('sleep.summary <- summary(sleep.aov)')
    r_res = r['sleep.summary'][1][0]
    assert_f_test_equal(aov.f_tests[0], r_res, 0, fs[0])

    # unbalanced (independent measures)
    ds2 = ds[1:]
    print r('sleep2 <- subset(sleep, (group == 2) | (ID != 1))')
    aov = test.ANOVA('extra', 'group', ds=ds2)
    fs = run_on_lm_fitter('extra', 'group', ds2)
    print r('sleep2.aov <- aov(extra ~ group, sleep2)')
    print r('sleep2.summary <- summary(sleep2.aov)')
    r_res = r['sleep2.summary'][0]
    assert_f_test_equal(aov.f_tests[0], r_res, 0, fs[0])
import os
import sys
import getopt
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robjects
from itertools import combinations

## GENERATE R markdown and md according to the config file:
# 1. dego 1v1
# 2. dego stages vs
# 3. index.md

DATADIR = "../"

robjects.r['source'](os.path.join(DATADIR, "conf/config.R"))  ### load config

names = robjects.r("names(data_src)")
lst_1v1 = list(combinations(names, 2))
stages = robjects.r("stage_lst")
project_name = robjects.r("PROJECT")

seen = set()
u_stages = [x for x in stages if x not in seen and not seen.add(x)]  ## remove duplicates, preserving order
lst_stages = list(combinations(u_stages, 2))

cluster_use = "seurat_clusters"

try:
    options, args = getopt.getopt(sys.argv[1:], "c:")
except getopt.GetoptError:
    print("Error: invalid parameters")
def __call__(self, *args, **kwargs):
    return robjects.r(*args, **kwargs)
def analyze2(lo3):
    # Map assessment categories (Korean) to HIRA assessment-grade column
    # numbers, e.g. "위암" = gastric cancer, "간암" = liver cancer.
    value = {
        "중이염": 24, "급성심근경색": 1, "골수이식": 10, "위암": 11,
        "간암": 12, "제왕절개": 13, "관상동맥우회술": 14, "뇌졸중": 2,
        "요양병원": 20, "당뇨병": 22, "대장암": 23, "유방암": 25,
        "폐암": 26, "천식": 27, "폐질환": 28, "폐렴": 29,
        "고혈압": 3, "중환자실": 30, "혈액투석": 4, "정신과": 5,
        "고관절치환술": 7, "췌장암": 8, "식도암": 9
    }
    if lo3 not in value:
        print("No assessment results for this category")  # was: "평가결과가 없음"
        return
    num = value[lo3]

    def r_filter_code(grd, extra_grd=None):
        """Build the R code that skips rows graded '등급제외' (excluded from
        grading) or '평가제외' (excluded from assessment), sorts the rest by
        the requested grade column, and writes them to result.csv."""
        excluded = ('(as.character(table_region$item.asmGrd{g}[i])=="등급제외" || '
                    'as.character(table_region$item.asmGrd{g}[i])=="평가제외")')
        cond = excluded.format(g=grd)
        if extra_grd is not None:
            # Only skip a row if it is excluded under BOTH grade columns.
            cond = '({0} && {1})'.format(cond, excluded.format(g=extra_grd))
        return (
            'table_result<<-data.frame()\n'
            'table_region<<-table_region[order(table_region$item.asmGrd{g}),]\n'
            'for(i in 1:nrow(table_region)) {{\n'
            'if({cond})\n'
            'next\n'
            'table_result<<-rbind.fill(table_result, table_region[i,])}}\n'
            'table_result<<-table_result[order(table_result$item.asmGrd{g}),]\n'
            'print(head(table_result$item.yadmNm.x, 10))\n'
            'write.csv(table_result, file="result.csv", row.names=FALSE)'
        ).format(g=grd, cond=cond)

    # Gastric cancer (11) is cross-checked against grade column 16 and liver
    # cancer (12) against column 15; other categories use a single column.
    # (The original compared the string num to the ints 11 and 12, so these
    # branches could never fire.)
    if num == 11:
        robj.r(r_filter_code(11, 16))
    elif num == 12:
        robj.r(r_filter_code(12, 15))
    else:
        robj.r(r_filter_code(num))
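# Hypothetical call (not in the original source): rank gastric-cancer ("위암")
# assessment grades for the previously selected table_region and write the
# top facilities to result.csv.
analyze2("위암")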
import pandas as pd  # required by pd.read_csv below; missing in the original
from libmetgem import msp
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt, CalcMolFormula
from rdkit.Chem import AllChem

from DeepFrag.utils import load_model, ms_correlation
from DeepFrag.utils import read_ms, morgan_fp, ms2vec, model_predict, plot_compare_ms
from DeepFrag.loss import pearson, loss
from DeepFrag.annotate import annotate_ms
from pycdk.pycdk import add_formula, subtract_formula, check_formula, getFormulaExactMass

import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri as numpy2ri
numpy2ri.activate()
robjects.r('''source('DeepFrag/metfrag.R')''')
generateFragments = robjects.globalenv['generateFragments']

msp_file = 'RIKEN_PlaSMA/RIKEN_PlaSMA_Pos.msp'
model = load_model('RIKEN_PlaSMA_Pos_10')
pretrain = load_model('simulated_Pos_10V')
result = pd.read_csv('Result/RIKEN_PlaSMA_Pos_10.csv')

# parse dataset
ms = []
smiles = []
energies = []
modes = []
parser = msp.read(msp_file)
for i, (params, data) in enumerate(parser):
    if 'collisionenergy' in params:
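# The source()-then-globalenv lookup above is the generic way to call a
# custom R function from Python. Since the contents of metfrag.R are not
# shown here, this is a self-contained sketch of the same pattern with an
# inline R function standing in for generateFragments:
import numpy as np
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri as numpy2ri

numpy2ri.activate()  # let numpy arrays cross into R transparently
robjects.r('colMax <- function(m) apply(m, 2, max)')
col_max = robjects.globalenv['colMax']

m = np.arange(6.0).reshape(2, 3)
print(np.asarray(col_max(m)))  # [3. 4. 5.]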
def instance_methods(self):
    rstring = '''
        function(X, ran_gf) {
            for (gf in ran_gf) {
                X[[gf]] <- as.factor(X[[gf]])
            }
            return(X)
        }
    '''
    process_ran_gf = robjects.r(rstring)

    rstring = '''
        function(X) {
            for (c in names(X)) {
                if (is.numeric(X[[c]])) {
                    X[[paste0('z_',c)]] <- scale(X[[c]])
                }
            }
            return(X)
        }
    '''
    add_z = robjects.r(rstring)

    rstring = '''
        function(bform, df) {
            return(bam(as.formula(bform), data=df, drop.unused.levels=FALSE, nthreads=10))
        }
    '''
    fit = robjects.r(rstring)

    rstring = '''
        function(model) {
            return(summary(model))
        }
    '''
    summary = robjects.r(rstring)

    rstring = '''
        function(model, bform, df, subjects=NULL, words=NULL) {
            for (c in names(df)) {
                if (is.numeric(df[[c]])) {
                    df[[paste0('z_',c)]] <- scale(df[[c]])
                }
            }
            select = !logical(nrow(df))  # all-TRUE selection mask
            if (grepl('subject', bform) & !is.null(subjects)) {
                select = select & df$subject %in% subjects
            }
            if (grepl('word', bform) & !is.null(words)) {
                select = select & df$word %in% words
            }
            preds = predict(model, df[select,])
            df$preds = NA
            df[select,]$preds = preds
            return(df$preds)
        }
    '''
    predict = robjects.r(rstring)

    rstring = '''
        function(df, col) {
            return(unique(df[[col]]))
        }
    '''
    unique = robjects.r(rstring)

    return process_ran_gf, add_z, fit, summary, predict, unique
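# A minimal usage sketch for the closures returned above (hypothetical data
# and formula; assumes mgcv, which provides bam(), is installed, and that
# the returned tuple has been unpacked into these names):
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

robjects.r('library(mgcv)')
df = pd.DataFrame({'subject': ['s1', 's2'] * 50,
                   'x': list(range(100)),
                   'y': [0.5 * v for v in range(100)]})
with localconverter(robjects.default_converter + pandas2ri.converter):
    rdf = robjects.conversion.py2rpy(df)

rdf = process_ran_gf(rdf, robjects.StrVector(['subject']))  # subject -> factor
rdf = add_z(rdf)                                            # adds z_x, z_y
model = fit('y ~ s(x)', rdf)
print(summary(model))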
def deseq2(
    pseudobulk: UnimodalData,
    design: str,
    contrast: Tuple[str, str, str],
    de_key: str = "deseq2",
    replaceOutliers: bool = True,
) -> None:
    """Perform Differential Expression (DE) analysis using DESeq2 on pseudobulk data. This function calls the R package DESeq2, which must be installed in R. DE analysis will be performed on all pseudo-bulk matrices in pseudobulk.

    Parameters
    ----------
    pseudobulk: ``UnimodalData``
        Pseudobulk data with rows for samples and columns for genes. If pseudobulk contains multiple matrices, DESeq2 will be applied to all of them.

    design: ``str``
        Design formula that will be passed to DESeq2.

    contrast: ``Tuple[str, str, str]``
        A tuple of three elements passed to DESeq2: a factor in the design formula, a level of that factor used as the numerator of the fold change, and a level used as the denominator of the fold change.

    de_key: ``str``, optional, default: ``"deseq2"``
        Key name under which DE analysis results are stored. For cluster.X, the stored key will be cluster.de_key.

    replaceOutliers: ``bool``, optional, default: ``True``
        Whether to execute DESeq2's replaceOutliers step. If ``False``, set minReplicatesForReplace=Inf in the ``DESeq`` function and cooksCutoff=False in the ``results`` function.

    Returns
    -------
    ``None``

    Update ``pseudobulk.varm``:
        ``pseudobulk.varm[de_key]``: DE analysis result for the pseudo-bulk count matrix.
        ``pseudobulk.varm[cluster.de_key]``: DE results for cluster-specific pseudo-bulk count matrices.

    Examples
    --------
    >>> pg.deseq2(pseudobulk, '~gender', ('gender', 'female', 'male'))
    """
    try:
        import rpy2.robjects as ro
        from rpy2.robjects import pandas2ri, numpy2ri, Formula
        from rpy2.robjects.packages import importr
        from rpy2.robjects.conversion import localconverter
    except ModuleNotFoundError as e:
        import sys
        logger.error(f"{e}\nNeed rpy2! Try 'pip install rpy2'.")
        sys.exit(-1)

    try:
        deseq2 = importr('DESeq2')
    except Exception:  # importr raises PackageNotInstalledError, not ModuleNotFoundError
        import sys
        text = """Please install DESeq2 in order to run this function.
To install this package, start R and enter:

    if (!require("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    BiocManager::install("DESeq2")"""
        logger.error(text)
        sys.exit(-1)

    import math

    to_dataframe = ro.r('function(x) data.frame(x)')

    for mat_key in pseudobulk.list_keys():
        with localconverter(ro.default_converter + numpy2ri.converter + pandas2ri.converter):
            dds = deseq2.DESeqDataSetFromMatrix(
                countData=pseudobulk.get_matrix(mat_key).T,
                colData=pseudobulk.obs,
                design=Formula(design),
            )

        if replaceOutliers:
            dds = deseq2.DESeq(dds)
            res = deseq2.results(dds, contrast=ro.StrVector(contrast))
        else:
            dds = deseq2.DESeq(dds, minReplicatesForReplace=math.inf)
            res = deseq2.results(dds, contrast=ro.StrVector(contrast), cooksCutoff=False)

        with localconverter(ro.default_converter + pandas2ri.converter):
            res_df = ro.conversion.rpy2py(to_dataframe(res))
        res_df.fillna(
            {'log2FoldChange': 0.0, 'lfcSE': 0.0, 'stat': 0.0, 'pvalue': 1.0, 'padj': 1.0},
            inplace=True,
        )

        de_res_key = de_key if mat_key.find('.') < 0 else f"{mat_key.partition('.')[0]}.{de_key}"
        pseudobulk.varm[de_res_key] = res_df.to_records(index=False)
def testGetclassdef(self):
    robjects.r('library(stats4)')
    cr = methods.getclassdef('mle', 'stats4')
    self.assertFalse(cr.virtual)
from rpy2.robjects import r
import os
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robj

utils = rpackages.importr('utils')  # import the base R utils package

robj.r('setwd("~/Desktop")')
r.library('plyr')
robj.r('load("~/Desktop/MediWeb-master/R_file/main.RData")')
robj.r('table_region<-data.frame()')

##### Category by Region #####
# start: index of the first row in table_united matching the given sido/sggu
robj.r('start<-function(sido_s, sggu_s){\n'
       'for(i in 1:nrow(table_united)){\n'
       'if((sido_s==as.character(table_united$item.sidoCdNm[i])) && '
       '(sggu_s==as.character(table_united$item.sgguCdNm[i]))) {\n'
       'start_num=i\n'
       'return(start_num)}}}')

# end: index of the last consecutive row still matching the given sido/sggu
robj.r('end<-function(sido_e, sggu_e, start_num){\n'
       'for(i in start_num:nrow(table_united)){\n'
       'if(!((sido_e==as.character(table_united$item.sidoCdNm[i])) && '
       '(sggu_e==as.character(table_united$item.sgguCdNm[i])))) {\n'
       'end_num=i-1\n'
       'return(end_num)}}}')
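# Hypothetical usage of the helpers above (region names are examples): find
# the row range of one district, then slice table_united into the
# table_region frame that the analysis functions expect.
start_num = int(robj.r['start']('서울', '강남구')[0])
end_num = int(robj.r['end']('서울', '강남구', start_num)[0])
robj.r('table_region<<-table_united[%d:%d,]' % (start_num, end_num))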
def testRS4Auto_Type_nopackname(self):
    robjects.r('library(stats4)')

    class MLE(robjects.methods.RS4):
        __metaclass__ = robjects.methods.RS4Auto_Type
        __rname__ = 'mle'
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

rstring = '''
    function(x) {
        return(scale(x, scale=FALSE))
    }
'''
center = robjects.r(rstring)
robjects.globalenv["c."] = center

rstring = '''
    function(x) {
        return(scale(x, scale=TRUE))
    }
'''
z_score = robjects.r(rstring)
robjects.globalenv["z."] = z_score

rstring = '''
    function(x) {
        return(x/sd(x))
    }
'''
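# Registering the closures under the R names c. and z. makes them callable
# inside R formulas. An illustrative fit (data and formula are hypothetical):
robjects.r('d <- data.frame(x = 1:10, y = rnorm(10))')
fit = robjects.r('lm(y ~ c.(x), data = d)')  # center x inline via c.()
print(robjects.r['summary'](fit))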