def generate_images(data, template):
    """Run Sunniva's R script to generate the 5 plots she likes.

    :param data: list of tuples (dose,response,experiment)
    :param template: output name prefix, exposed to R as ``outfilename``
    """
    ro.r("""
        source("graph/R/4in1skript_ankervariation_backup.R")
        library("gplots")
        library("bmd")
        library("splines")
        library("plyr")
        library("emdbook")
    """)
    # zip() is lazy on Python 3; materialise it so asarray() builds a real
    # 2-D array instead of a 0-d object array wrapping the iterator.
    data_array = asarray(list(zip(*data)))
    dose = data_array[0]
    response = data_array[1]
    experiment = data_array[2]
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    ro.r.assign('experiment', numpy2ri(experiment))
    ro.r.assign('outfilename', template)  # sunny/media/images/fit_images_1
    # One processData() call per run; each run gets its own output/plot name.
    ro.r("""
        mydata = data.frame(dose=dose,response=response,experiment=experiment)
        for (run in c(1,2,3,4)){
            outname = paste(outfilename,'_',run, sep="")
            plotname = paste(outfilename,'_',run,'.png', sep="")
            processData(mydata, outname=outname, xlab="Concentration [AU]", plotname=plotname, run=run)
        }
    """)
def _plot_stats(self, bam_name):
    """Plot fragment redundancy and fragment size distribution with R.

    Pushes ``self.frag_rep``, ``self.frag_size`` and ``self.nb_frag`` into
    the R session, then draws two stacked step plots on the currently open
    R graphics device, with *bam_name* as the outer title.
    """
    to_r = numpy2ri.numpy2ri
    assign = robjects.r.assign
    assign('rep_cnt', to_r(self.frag_rep.keys()))
    assign('rep_freq', to_r(self.frag_rep.values()))
    assign('size_distr', to_r(self.frag_size.keys()))
    assign('size_freq', to_r(self.frag_size.values()))
    assign('nb_frag', self.nb_frag)
    assign('main', bam_name)
    # Copy numbers >= 100 are pooled into a single bin at x = 100.
    r_script = """
        rep_cnt = as.integer(rep_cnt)
        Od = order(rep_cnt)
        rep_freq = as.integer(rep_freq)[Od]*1e-6
        rep_cnt = rep_cnt[Od]
        I100 = rep_cnt<100
        rep_cnt = c(rep_cnt[I100],100)
        rep_freq = c(rep_freq[I100],sum(rep_freq[!I100]))
        size_distr = as.integer(size_distr)
        Od = order(size_distr)
        size_freq = as.integer(size_freq)[Od]/nb_frag
        size_distr = size_distr[Od]
        par(mfrow=c(2,1),lwd=2,cex=1.1,cex.main=1.3,cex.lab=1.1,cex.axis=.8,oma=c(0,0,3,0),mar=c(5,5,1,1),las=1,pch=20)
        plot(rep_cnt,rep_freq,type='s',main='Fragment redundancy',xlab='Nb of copies',ylab='Frequency (millions)',
             log='y',xlim=c(1,100),xaxt='n',ylim=c(1e-6,nb_frag*1e-6))
        abline(h=nb_frag*1e-6,col='red')
        text(50,nb_frag*1e-6,nb_frag,col='red',pos=1)
        axis(side=1,at=seq(10,100,by=10),labels=c(seq(10,90,by=10),">100"))
        plot(size_distr,size_freq,type='s',main='Fragment size distribution',xlab='Size',ylab='Density')
        title(main=main,outer=T)
    """
    robjects.r(r_script)
def fit(self, X, y):
    """Fit an R randomForest on *X* / *y* and return ``self``.

    The distinct labels are kept in ``self.classes_``; the fitted R model
    is stored in ``self.model_``.
    """
    self.classes_ = np.unique(y)
    # Encode labels as 1-based positions within the sorted unique classes.
    codes = np.searchsorted(self.classes_, y) + 1
    r_features = numpy2ri(X)
    r_labels = ro.FactorVector(numpy2ri(codes))
    self.model_ = rf.randomForest(r_features, r_labels, **self.params)
    return self
def wilcox_test_R(x, y, alternative='less'):
    """Call R's ``wilcox.test`` on *x* and *y* and return the p-value.

    The call uses ``paired=True, exact=False, correct=False``, so this is
    the paired (signed-rank) variant of the test.

    NOTE: Calling R many times is slow! rather use python function if possible

    :param x: first sample
    :param y: second sample, paired with *x*
    :param alternative: 'two.sided', 'less' or 'greater'
    :return: the p-value reported by R
    :raises ValueError: if *alternative* is not one of the allowed values
    :raises KeyError: if the R result carries no ``p.value`` entry
    """
    if alternative not in ('two.sided', 'less', 'greater'):
        raise ValueError("Alternative hypothesis should be either 'two.sided', 'less' or 'greater'")

    # Imported lazily so argument validation works without rpy2 installed.
    from rpy2.robjects.numpy2ri import numpy2ri
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    statspackage = importr('stats', robject_translations={'format_perc': '_format_perc'})
    result = statspackage.wilcox_test(numpy2ri(x), numpy2ri(y),
                                      alternative=alternative,
                                      paired=True, exact=False, correct=False)
    pyresultdict = pandas2ri.ri2py(result)
    for k, v in pyresultdict.items():
        if k == 'p.value':
            return v[0]
    # Previously this fell through to an UnboundLocalError on ``pval``.
    raise KeyError("R wilcox.test result has no 'p.value' entry")
def _plot_stats(self, bam_name):
    """Draw the fragment redundancy and size distribution plots in R.

    The key/value pairs of ``self.frag_rep`` and ``self.frag_size`` plus
    ``self.nb_frag`` are exported to R; *bam_name* becomes the outer title.
    Plots go to whatever R graphics device is currently active.
    """
    convert = numpy2ri.numpy2ri
    r_variables = (
        ('rep_cnt', convert(self.frag_rep.keys())),
        ('rep_freq', convert(self.frag_rep.values())),
        ('size_distr', convert(self.frag_size.keys())),
        ('size_freq', convert(self.frag_size.values())),
        ('nb_frag', self.nb_frag),
        ('main', bam_name),
    )
    for r_name, r_value in r_variables:
        robjects.r.assign(r_name, r_value)
    # Redundancy counts of 100 or more are merged into the last bin.
    robjects.r("""
        rep_cnt = as.integer(rep_cnt)
        Od = order(rep_cnt)
        rep_freq = as.integer(rep_freq)[Od]*1e-6
        rep_cnt = rep_cnt[Od]
        I100 = rep_cnt<100
        rep_cnt = c(rep_cnt[I100],100)
        rep_freq = c(rep_freq[I100],sum(rep_freq[!I100]))
        size_distr = as.integer(size_distr)
        Od = order(size_distr)
        size_freq = as.integer(size_freq)[Od]/nb_frag
        size_distr = size_distr[Od]
        par(mfrow=c(2,1),lwd=2,cex=1.1,cex.main=1.3,cex.lab=1.1,cex.axis=.8,oma=c(0,0,3,0),mar=c(5,5,1,1),las=1,pch=20)
        plot(rep_cnt,rep_freq,type='s',main='Fragment redundancy',xlab='Nb of copies',ylab='Frequency (millions)',
             log='y',xlim=c(1,100),xaxt='n',ylim=c(1e-6,nb_frag*1e-6))
        abline(h=nb_frag*1e-6,col='red')
        text(50,nb_frag*1e-6,nb_frag,col='red',pos=1)
        axis(side=1,at=seq(10,100,by=10),labels=c(seq(10,90,by=10),">100"))
        plot(size_distr,size_freq,type='s',main='Fragment size distribution',xlab='Size',ylab='Density')
        title(main=main,outer=T)
    """)
def voom(counts, library_size):
    """Run edgeR normalisation followed by limma ``voom`` in R.

    :param counts: object exposing ``.values`` and ``.coords``; the values
        are transposed before being handed to ``DGEList``
    :param library_size: object exposing ``.values``, used as ``lib_size``
    :return: ``VoomResult(gexp, weights)`` built from the ``E`` and
        ``weights`` components of the voom result
    """
    from rpy2.robjects.packages import importr
    from rpy2.robjects.numpy2ri import numpy2ri

    logger.info("Running limma voom in R")
    limma = importr('limma')
    edgeR = importr('edgeR')
    base_r = importr('base')
    r_dollar = getattr(base_r, '$')

    lib_sizes = base_r.c(numpy2ri(library_size.values))
    dge = edgeR.DGEList(counts=numpy2ri(counts.values.T), lib_size=lib_sizes)
    dge = edgeR.calcNormFactors(dge)
    v = limma.voom(dge, plot=False)

    def _component(name, long_name):
        # Transpose back so the array lines up with counts.coords.
        return xr.DataArray(np.array(r_dollar(v, name)).T,
                            coords=counts.coords,
                            attrs={'units': 'lb(re 1)', 'long_name': long_name})

    gexp = _component('E', "Gene expression in log2 range")
    weights = _component('weights', "Limma voom weights")
    return VoomResult(gexp, weights)
def generate_images(data, template):
    """Run Sunniva's R script to generate the 5 plots she likes.

    :param data: list of tuples (dose,response,experiment)
    :param template: output name prefix, exposed to R as ``outfilename``
    """
    ro.r("""
        source("graph/R/4in1skript_ankervariation_backup.R")
        library("gplots")
        library("bmd")
        library("splines")
        library("plyr")
        library("emdbook")
    """)
    # list(...) keeps this working on Python 3, where zip() is an iterator
    # and asarray(zip(...)) would produce a 0-d object array.
    columns = asarray(list(zip(*data)))
    dose, response, experiment = columns[0], columns[1], columns[2]
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    ro.r.assign('experiment', numpy2ri(experiment))
    ro.r.assign('outfilename', template)  # sunny/media/images/fit_images_1
    # processData() is called once per run with run-specific file names.
    ro.r("""
        mydata = data.frame(dose=dose,response=response,experiment=experiment)
        for (run in c(1,2,3,4)){
            outname = paste(outfilename,'_',run, sep="")
            plotname = paste(outfilename,'_',run,'.png', sep="")
            processData(mydata, outname=outname, xlab="Concentration [AU]", plotname=plotname, run=run)
        }
    """)
def cluster_by_grs(target, source, env):
    """Cluster verbs by their normalised grammatical-relation counts.

    Loads instances from ``source[0]``, builds a verb x GR count matrix,
    row-normalises it, picks the number of clusters with ``clusGap`` /
    ``maxSE`` and writes one line of space-separated verbs per k-means
    cluster to ``target[0]``.

    :return: None
    """
    import graphmod as gm
    args = source[-1].read()  # not used below; the read call is kept as in the original
    verb_map = {}
    gr_map = {}
    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)

    # Single pass: assign ids in first-seen order and remember each
    # instance's ids, instead of walking the instances twice.
    occurrences = []
    for ii in range(len(instances)):
        inst = instances.at(ii)
        verb = instances.get_name("verb_lemma", inst["verb_lemma"][0])
        verb_id = verb_map.setdefault(verb, len(verb_map))
        gr_ids = []
        for x in inst["gr"]:
            gr = instances.get_name("gr", x)
            gr_ids.append(gr_map.setdefault(gr, len(gr_map)))
        occurrences.append((verb_id, gr_ids))

    data = numpy.zeros(shape=(len(verb_map), len(gr_map)))
    for verb_id, gr_ids in occurrences:
        for gr_id in gr_ids:
            data[verb_id, gr_id] += 1
    # Normalise each row to sum to 1.
    data = numpy.transpose(data.T / data.sum(1))

    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    verbs = dict((v, k) for k, v in verb_map.items())
    # Fetch the assignment vector once rather than once per cluster.
    assignments = list(res.rx2("cluster"))
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(assignments):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        # assumes meta_open returns a file-like object with close() - TODO confirm
        ofd.close()
    return None
def test1(self):
    """Run ``rkt::rkt`` on synthetic seasonal data and keep the result.

    Stores the raw R result in ``self.res`` and a one-row pandas frame
    with the columns sl, S, B, varS and tau in ``self.df``.
    """
    rkt = rpackages.importr('rkt')
    nyear = 4
    nseas = 5
    year = np.repeat(np.arange(2000, 2000 + nyear), nseas)
    dekad = np.tile(1 + np.arange(nseas), nyear)
    # Random noise on top of a linear trend of 0.1 per step.
    data = np.random.rand(nseas * nyear) + np.arange(nseas * nyear) * 0.1

    # The former ``if 1: ... else: numpy2ri`` switch always took this path.
    year = robjects.IntVector(year)
    dekad = robjects.IntVector(dekad)
    data = robjects.FloatVector(data)
    print(year)
    print(dekad)
    print(data)

    self.res = rkt.rkt(year, data, dekad)
    print(self.res)
    # Was ``rw.res``: ``rw`` is not defined in this method; the result just
    # stored on ``self`` is what gets converted.
    df = pandas2ri.ri2py_dataframe(self.res).transpose()
    df.columns = self.res.names
    df = df[['sl', 'S', 'B', 'varS', 'tau']]
    print(pd.concat([df, df, df]))
    self.df = df
def _from_python(obj): if isinstance(obj, DataFrame): obj = convert_to_r_dataframe(obj) elif isinstance(obj, Series): obj = numpy2ri(obj.values) elif isinstance(obj, np.ndarray): obj = numpy2ri(obj) return obj
def _from_python(obj): if isinstance(obj, DataFrame): obj = py2ri(obj) elif isinstance(obj, Series): obj = numpy2ri(obj.values) elif isinstance(obj, np.ndarray): obj = numpy2ri(obj) return obj
def identity_heatmap_plot(numpy_matrix, labels, header="", xlab="", ylab="", reverse=False,
                          output_path="~/test.svg"):
    """Render *numpy_matrix* as a labelled ``heatmap.2`` SVG via R.

    :param numpy_matrix: matrix exported to R as ``Mdata``
    :param labels: row/column labels (converted to a string array)
    :param header, xlab, ylab: accepted but not used by this function
    :param reverse: if true, plot ``100 - Mdata`` with the palette reversed
    :param output_path: SVG file path handed to R's ``svg()``

    NOTE: the R script also writes ``Mdata`` to the fixed path
    ``/home/trestan/identity.tab``.
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import rpy2.robjects.numpy2ri as numpy2ri
    from pandas import DataFrame
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    robjects.r.assign('Mdata', numpy2ri.numpy2ri(numpy_matrix))
    label_array = np.asarray(labels, dtype='str')
    robjects.r.assign('labels', numpy2ri.numpy2ri(label_array))

    reversed_plot = '''
        cols <- rev(brewer.pal(9, "Blues"))
        heatmap.2(100-as.matrix(Mdata), trace="none", key="True",col=cols, na.rm=TRUE, density.info='none', cellnote=as.matrix(Mdata), notecol="black", labRow=labels, labCol=labels)
    '''
    regular_plot = '''
        cols <- brewer.pal(9, "Blues")
        h <- heatmap.2(as.matrix(Mdata), trace="none", key="True",col=cols, na.rm=TRUE, density.info='none', cellnote=as.matrix(Mdata), notecol="black", labRow=labels, labCol=labels)
    '''
    plot = reversed_plot if reverse else regular_plot

    # Device height/width grow with the number of rows in Mdata.
    script = '''
        library(Cairo)
        library(ggplot2)
        library(gplots)
        library(RColorBrewer)
        rownames(Mdata) <- labels
        colnames(Mdata) <- labels
        h <- length(Mdata[,1])/4+8
        w <- length(Mdata[,1])/2+8
        print(h)
        print(w)
        svg('%s',height=h,width=w)
        par(oma = c(5, 0, 0, 8), xpd=TRUE)
        par(mar = c(5,1,1,8))
        par(cex.main=1,oma=c(22,0,0,20), xpd=TRUE, new=TRUE)
        %s
        #print(h)
        write.table(Mdata, "/home/trestan/identity.tab", sep="\t",col.names = NA)
        dev.off()
    ''' % (output_path, plot)
    robjects.r(script)
def plot_dnds(wt, ga):
    """Export *wt* and *ga* to R as ``wt`` / ``ga`` and source figure_dnds.R."""
    converted = (('wt', numpy2ri(wt)), ('ga', numpy2ri(ga)))
    for r_name, r_value in converted:
        r.assign(r_name, r_value)
    r(' source("src/R/figure_dnds.R") ')
def plot_stabhyddiff(stab, hyd):
    """Export *stab* and *hyd* to R under the same names and source
    figure_sites_stab_nothyd.R."""
    for r_name, values in (('stab', stab), ('hyd', hyd)):
        r.assign(r_name, numpy2ri(values))
    r(' source("src/R/figure_sites_stab_nothyd.R") ')
def plot_entropy(wt, ga):
    """Export *wt* and *ga* to R as ``wt`` / ``ga`` and source figure_entropy.R."""
    r_wt, r_ga = numpy2ri(wt), numpy2ri(ga)
    r.assign('wt', r_wt)
    r.assign('ga', r_ga)
    r(' source("src/R/figure_entropy.R") ')
def plot_diversity(wt, ga):
    """Export *wt* and *ga* to R as ``wt`` / ``ga`` and source figure_diversity.R."""
    r_values = [numpy2ri(wt), numpy2ri(ga)]
    for r_name, r_value in zip(('wt', 'ga'), r_values):
        r.assign(r_name, r_value)
    r(' source("src/R/figure_diversity.R") ')
def boxplot(values,labels,output=None,format='pdf',new=True,last=True,**kwargs):
    """Creates a box-and-whiskers plot of *values* split by *labels*.

    Both inputs are coerced to arrays if they are not already. Returns the
    output value produced by ``_begin``.
    """
    values = values if isinstance(values, ndarray) else asarray(values)
    labels = labels if isinstance(labels, ndarray) else asarray(labels)
    # The plot options returned by _begin are not used for boxplots.
    _plotopt, output = _begin(output=output, format=format, new=new, **kwargs)
    for r_name, array_ in (('values', values), ('labels', labels)):
        robjects.r.assign(r_name, numpy2ri.numpy2ri(array_))
    robjects.r("boxplot(values ~ labels,lty=1,varwidth=T)")
    _end("", last, **kwargs)
    return output
def basic_plot(values_x, values_y=False, header="", xlab="", ylab="", output_path="~/test.svg", type="hist"):
    """Write a quick SVG plot of *values_x* (and optionally *values_y*) via R.

    With *values_y* given, a scatter plot of x against y is drawn;
    otherwise a flipped bar chart of the frequency table of *values_x*.

    :param values_x: values convertible to a float array
    :param values_y: optional second series; falsy or empty selects the
        bar chart
    :param header, xlab, ylab, type: accepted but not used by this function
    :param output_path: SVG file path handed to R's ``svg()``
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import rpy2.robjects.numpy2ri as numpy2ri
    from pandas import DataFrame
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    a = np.asarray(values_x, dtype='float')

    # ``if values_y:`` raises ValueError for a NumPy array with more than
    # one element; fall back to a size check in that case only, so every
    # input that worked before takes the same branch as before.
    try:
        has_y = bool(values_y)
    except ValueError:
        has_y = np.size(values_y) > 0

    if has_y:
        b = np.asarray(values_y, dtype='float')
        robjects.r.assign('values_x', numpy2ri.numpy2ri(a))
        robjects.r.assign('values_y', numpy2ri.numpy2ri(b))
        robjects.r('''
            #library(genoPlotR)
            library(Cairo)
            library(ggplot2)
            library(plyr)
            #mu <- ddply(plot_data, "comp", summarise, identity.mean=median(identity))
            #print (mu)
            #plot_data$identity <- as.numeric(plot_data$identity)
            svg('%s',height=6,width=14)
            plot(values_x, values_y, pch=20) # , ylim=c(0,100)
            dev.off()
        ''' % (output_path))
    else:
        robjects.r.assign('values_x', numpy2ri.numpy2ri(a))
        robjects.r('''
            #library(genoPlotR)
            library(Cairo)
            library(ggplot2)
            svg('%s',height=7,width=7)
            #barplot(table(values_x), main="Conservation of predicted effectors in other genomes")
            library(ggplot2)
            mytable <- as.data.frame(table(values_x))
            #print(mytable)
            p <- ggplot(mytable, aes(x = reorder(values_x, -order(values_x)), y = Freq)) + geom_bar(stat = "identity")
            p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1))+ coord_flip()
            print(p)
            dev.off()
        ''' % (output_path))
def model_drm(fit_name, dose, response, fixed=''):
    """Fit a ``drc::drm`` dose-response model.

    :param fit_name: name of the R model-function constructor to call
    :param dose: dose values, exported to R as ``dose``
    :param response: response values, exported to R as ``response``
    :param fixed: optional iterable passed as the constructor's ``fixed``
        argument
    :return: the fitted model, or the string ``"R: <message>"`` when R
        raises an error during fitting
    """
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    if fixed:
        fixed = 'fixed=' + list2r(list(fixed))
    # Builds and evaluates e.g. ``<fit_name>(fixed=...)`` in R.
    fit_fct = ro.r(fit_name + '(' + fixed + ')')
    try:
        return drc.drm(ro.Formula('response~dose'), fct=fit_fct)
    except RRuntimeError as err:
        # ``except X, e`` is Python-2-only syntax; ``as`` works on 2.6+ and 3
        # and no longer shadows the name of the ``re`` module.
        return "R: " + str(err)
def build_vine(self):
    """After being initialized, the vine copula is created.

    The structure matrix and the permuted family / parameter matrices are
    converted to R and passed to ``RVineMatrix``; the result is stored in
    ``self._rvine`` and the rebuild flag is cleared.
    """
    structure = self._structure
    r_args = [numpy2ri(structure)]
    # Order matters: family, par, par2 - the positional order RVineMatrix receives.
    for matrix in (self._family_changed, self._param1, self._param2):
        r_args.append(numpy2ri(permute_params(matrix, structure)))
    self._rvine = R_VINECOPULA.RVineMatrix(*r_args)
    self._to_rebuild = False
def plot_tediff_supplement(r1_hela, r1_human, r2_hela, r2_human):
    """Export the four series to R (``r1hela``, ``r1human``, ``r2hela``,
    ``r2human``) and source figure_supplement_nopt.R."""
    exports = (
        ('r1hela', r1_hela),
        ('r1human', r1_human),
        ('r2hela', r2_hela),
        ('r2human', r2_human),
    )
    for r_name, values in exports:
        r.assign(r_name, numpy2ri(values))
    r(' source("src/R/figure_supplement_nopt.R") ')
def model_selection(data):
    """Select a model for *data* using the R function ``bestModel``.

    :param data: list of tuples (dose,response,experiment)
    :return: the first element of ``bestModel``'s result, or ``None`` when
        R returns NULL
    """
    # list(...) is needed on Python 3, where zip() is an iterator and
    # asarray(zip(...)) cannot be unpacked into three rows.
    dose, response, experiment = asarray(list(zip(*data)))
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    ro.r.assign('experiment', numpy2ri(experiment))
    bmdrcdata = ro.r('data.frame(dose=dose,response=response,experiment=experiment)')
    selected_models = ro.r('bestModel')(bmdrcdata)
    if selected_models == ro.rinterface.NULL:
        # No model found
        return None
    return selected_models[0]
def scatterplot(X,Y,output=None,format='pdf',new=True,last=True,ratio=1.375,**kwargs):
    """Creates a scatter plot of X vs Y.

    If Y is a list of arrays, a different color will be used for each of them.
    """
    plotopt, output = _begin(output=output, format=format, new=new, ratio=ratio, **kwargs)
    to_r = numpy2ri.numpy2ri
    robjects.r.assign('xdata', to_r(X))
    series = Y if isinstance(Y, (list, tuple)) else [Y]
    # The first series creates the plot; the others are overlaid as points.
    robjects.r.assign('ydata', to_r(series[0]))
    robjects.r("plot(xdata,ydata%s)" % plotopt)
    for color, extra in enumerate(series[1:], 2):
        robjects.r.assign('ydata', to_r(extra))
        robjects.r("points(xdata,ydata,col=%i)" % color)
    _end(",pch=20", last, **kwargs)
    return output
def model_drm(fit_name, _data, fixed=''):
    """Fit a ``drc::drm`` dose-response model.

    :param fit_name: name of the R model-function constructor to call
    :param _data: iterable of tuples whose first two fields are dose and
        response
    :param fixed: optional iterable passed as the constructor's ``fixed``
        argument
    :return: the fitted model, or the string ``"R: <message>"`` when R
        raises an error during fitting
    """
    # list(...) keeps asarray() working on Python 3, where zip() is lazy.
    data_array = asarray(list(zip(*_data)))
    dose = data_array[0]
    response = data_array[1]
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    if fixed:
        fixed = 'fixed=' + list2r(list(fixed))
    fit_fct = ro.r(fit_name + '(' + fixed + ')')
    try:
        return drc.drm(ro.Formula('response~dose'), fct=fit_fct)
    except RRuntimeError as err:
        # ``except X, e`` is Python-2-only syntax and shadowed ``re``.
        return "R: " + str(err)
def plot_sites_supplement(data_stab, data_agg, data_hyd):
    """Export the three data sets to R as ``stab``, ``agg`` and ``hyd`` and
    source the four supplement figure scripts in order."""
    for r_name, values in (('stab', data_stab), ('agg', data_agg), ('hyd', data_hyd)):
        r.assign(r_name, numpy2ri(values))
    r(' source("src/R/figure_supplement_sites.R") ')
    r(' source("src/R/figure_supplement_P1_stab.R") ')
    r(' source("src/R/figure_supplement_P1_agg.R") ')
    r(' source("src/R/figure_supplement_P1_hyd.R") ')
def plot_entropy_supplement(ent_wt, ent_ga, div_wt, div_ga):
    """Export the entropy and diversity series to R (``entwt``, ``entga``,
    ``divwt``, ``divga``) and source figure_supplement_entropy.R."""
    # Convert everything first, then assign, as separate phases.
    r_values = [numpy2ri(values) for values in (ent_wt, ent_ga, div_wt, div_ga)]
    for r_name, r_value in zip(('entwt', 'entga', 'divwt', 'divga'), r_values):
        r.assign(r_name, r_value)
    r(' source("src/R/figure_supplement_entropy.R") ')
def __call__(self, casecon, genotype, **kwargs):
    """Run the wrapped R test and return its key statistics.

    :param casecon: values converted to an R integer vector
    :param genotype: array converted with ``numpy2ri``
    :param kwargs: forwarded to the R function ``self.fn``
    :return: dict with ``stat`` (first result entry whose name ends in
        ``.stat``), ``perm_p``, ``asym_p`` (``None`` if absent) and
        ``function``
    """
    # Convert each argument exactly once; both used to be converted a
    # second time inside the call.
    r_casecon = robjects.IntVector(casecon)
    r_genotype = nri.numpy2ri(genotype)
    res = self.fn(r_casecon, r_genotype, **kwargs)
    # convert from rpy2 stuffs to python dict.
    res = dict(res.iteritems())
    skey = [k for k in res if k.endswith('.stat')][0]
    perm_p = res['perm.pval'][0]
    asym_p = res.get('asym.pval', [None])[0]
    return dict(stat=res[skey][0], perm_p=perm_p, asym_p=asym_p,
                function=self.function_name)
def _plot_pdf(self, filename, stats, title=""):
    """Plot feature-length and score histograms per chromosome into a PDF.

    :param filename: path of the PDF opened with R's ``pdf()``
    :param stats: mapping chromosome -> dict that may contain
        ``'feat_stats'`` and/or ``'score_stats'``
    :param title: plot title; ``":<chrom>"`` is appended for non-empty
        chromosome keys
    :return: None
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri as numpy2ri

    robjects.r('pdf("%s",paper="a4",height=8,width=8)' % filename)
    try:
        # .items() works on Python 2 and 3 (iteritems is Python 2 only).
        for chrom, st in stats.items():
            if chrom:
                _title = title + ":" + chrom
            else:
                _title = title
            if 'feat_stats' in st:
                fst = st['feat_stats']
                robjects.r.assign('len', numpy2ri.numpy2ri(list(fst[1].keys())))
                robjects.r.assign('num', numpy2ri.numpy2ri(list(fst[1].values())))
                robjects.r.assign('ylim', max(10, fst[0]))
                robjects.r.assign('med', fst[2][5])
                robjects.r.assign('men', fst[2][3])
                robjects.r.assign('sdv', fst[2][4])
                robjects.r("""
                    ypos=1
                    len=as.numeric(len)
                    num=as.numeric(num)
                    par(lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=.8,mar=c(5,5,1,1),las=1,pch=20)
                    plot(len,num,type='h',main='%s',xlab='Feature Length',ylab='Frequency',ylim=c(1,ylim),log='y')
                    abline(v=med,col='red')
                    text(med,ylim,paste("median",med,sep="="),col='red',pos=4)
                    abline(h=ylim[1],col='green')
                    mtext(paste(ylim[1],"features"),side=2,at=10,col='green',las=1)
                    arrows(men-sdv,ypos,men+sdv,ypos,angle=90,code=3,length=.15,col='blue')
                    points(men,ypos,pch=19,col='blue')
                """ % _title)
            if 'score_stats' in st:
                sst = st['score_stats']
                robjects.r.assign('score', numpy2ri.numpy2ri(list(sst[0].keys())))
                robjects.r.assign('num', numpy2ri.numpy2ri(list(sst[0].values())))
                robjects.r.assign('med', sst[1][5])
                robjects.r.assign('men', sst[1][3])
                robjects.r.assign('sdv', sst[1][4])
                # NOTE(review): this script reads the R variable ``ylim``,
                # which is only assigned in the 'feat_stats' branch above.
                robjects.r("""
                    ypos=1
                    score=as.numeric(score)
                    num=as.numeric(num)
                    par(lwd=2,cex=1.1,cex.main=1.5,cex.lab=1.3,cex.axis=0.8,mar=c(5,5,1,1),las=1,pch=20)
                    plot(score,num,type='h',main='%s',xlab='Score',ylab='Frequency',log='y')
                    abline(v=med,col='red')
                    text(med,ylim[1],paste("median",med,sep="="),col='red',pos=4)
                    arrows(men-sdv,ypos,men+sdv,ypos,angle=90,code=3,length=.15,col='blue')
                    points(men,ypos,pch=19,col='blue')
                """ % _title)
    finally:
        # Always close the PDF device, even if one of the R calls raised.
        robjects.r("dev.off()")
    return None
def model_selection(data):
    """Pick a model for *data* via the R function ``bestModel``.

    :param data: list of tuples (dose,response,experiment)
    :return: first element of the ``bestModel`` result, or ``None`` if R
        returned NULL
    """
    # Materialise the zip so asarray() yields three rows on Python 3 too.
    columns = asarray(list(zip(*data)))
    dose, response, experiment = columns
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    ro.r.assign('experiment', numpy2ri(experiment))
    bmdrcdata = ro.r(
        'data.frame(dose=dose,response=response,experiment=experiment)')
    selected_models = ro.r('bestModel')(bmdrcdata)
    if selected_models == ro.rinterface.NULL:
        # No model found
        selected_model = None
    else:
        selected_model = selected_models[0]
    return selected_model
def model_drm(fit_name, _data, fixed=''):
    """Fit a ``drc::drm`` dose-response model.

    :param fit_name: name of the R model-function constructor to call
    :param _data: iterable of tuples; fields 0 and 1 are dose and response
    :param fixed: optional iterable passed as the constructor's ``fixed``
        argument
    :return: the fitted model, or ``"R: <message>"`` if R raised an error
        during fitting
    """
    # zip() is lazy on Python 3; list() restores the 2-D array layout.
    columns = asarray(list(zip(*_data)))
    dose, response = columns[0], columns[1]
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    if fixed:
        fixed = 'fixed=' + list2r(list(fixed))
    fit_fct = ro.r(fit_name + '(' + fixed + ')')
    try:
        model = drc.drm(ro.Formula('response~dose'), fct=fit_fct)
    except RRuntimeError as err:
        # Python 2/3 compatible spelling; the old form also shadowed ``re``.
        return "R: " + str(err)
    return model
def plot_average_coverage(wt, ga):
    """Sum *wt* and *ga* along axis 1, export the sums to R as ``wt`` /
    ``ga`` (coerced to matrices) and source figure_coverage.R."""
    wt_total, ga_total = np.sum(wt, 1), np.sum(ga, 1)
    r_wt, r_ga = numpy2ri(wt_total), numpy2ri(ga_total)
    r.assign('wt', r_wt)
    r.assign('ga', r_ga)
    for statement in (' wt <- as.matrix(wt) ', ' ga <- as.matrix(ga) '):
        r(statement)
    r(' source("src/R/figure_coverage.R") ')
def RCopula():
    """Demo: evaluate R's empirical copula ``C_n`` on random data and print
    it next to the result of ``buildEmpiricalCopula``."""
    from rpy2.robjects.numpy2ri import numpy2ri
    from rpy2.robjects.packages import importr

    copula = importr('copula')
    n_rv, n_dim = 6, 2
    data = np.random.rand(n_rv, n_dim)
    # Floor division: ``n_rv / 2`` is a float on Python 3 and rand()
    # rejects non-integer dimensions.
    data2 = np.random.rand(n_rv // 2, n_dim)
    # %-formatting gives the same output under Python 2 and 3.
    print("data:\n%s" % (data,))
    print("data2:\n%s" % (data2,))
    print(copula.C_n(numpy2ri(data), numpy2ri(data2)))
    mycopula = buildEmpiricalCopula(data)
    print(mycopula)
def RCopula():
    """Demo: print random data, R's empirical copula ``C_n`` evaluated on
    it, and the object returned by ``buildEmpiricalCopula``."""
    from rpy2.robjects.numpy2ri import numpy2ri
    from rpy2.robjects.packages import importr

    copula = importr('copula')
    n_rv, n_dim = 6, 2
    data = np.random.rand(n_rv, n_dim)
    # ``//`` keeps the row count an integer on Python 3 as well.
    data2 = np.random.rand(n_rv // 2, n_dim)
    # Single-argument print calls behave identically on Python 2 and 3.
    print("data:\n%s" % (data,))
    print("data2:\n%s" % (data2,))
    print(copula.C_n(numpy2ri(data), numpy2ri(data2)))
    mycopula = buildEmpiricalCopula(data)
    print(mycopula)
def py2ri_pandasseries(obj):
    """Convert a pandas Series to an R vector carrying its index.

    ``'<M8[ns]'`` series become POSIXct via ``ISOdatetime``; everything
    else goes through ``numpy2ri`` on ``obj.values``. The index is stored
    as ``names`` (1-D) or ``dimnames`` (otherwise).
    """
    if obj.dtype == '<M8[ns]':
        # time series
        components = ('year', 'month', 'day', 'hour', 'minute', 'second')
        d = [IntVector([getattr(x, part) for x in obj]) for part in components]
        res = ISOdatetime(*d)
        #FIXME: can the POSIXct be created from the POSIXct constructor ?
        # (is '<M8[ns]' mapping to Python datetime.datetime ?)
        res = POSIXct(res)
    else:
        # converted as a numpy array
        res = numpy2ri.numpy2ri(obj.values)

    # "index" is equivalent to "names" in R
    if obj.ndim == 1:
        index_names = StrVector(tuple(str(x) for x in obj.index))
        res.do_slot_assign('names', index_names)
    else:
        res.do_slot_assign('dimnames', SexpVector(conversion.py2ri(obj.index)))
    return res
def fuzzyCMeans(data, k):
    """Run ``e1071::cmeans`` on *data* with *k* centers.

    :param data: array converted with ``numpy2ri``
    :param k: passed to ``cmeans`` as its second positional argument
    :return: the ``withinerror`` component of the result as a NumPy array
    """
    results = e1071.cmeans(numpy2ri(data), k)
    # The 'centers' and 'membership' components used to be converted here
    # too but were never used; only 'withinerror' is returned.
    return np.array(results.rx2('withinerror'))
def R_reconstruction(series, tau, m):
    '''
    Embed *series* with ``tseriesChaos::embedd`` and print the result.

    http://cran.r-project.org/web/packages/tseriesChaos/tseriesChaos.pdf
    embedd(x, m, d, lags)

    :param series: time series, converted with numpy2ri
    :param tau: passed as the third positional argument (d)
    :param m: passed as the second positional argument (m)
    :return: the embedding as a NumPy matrix (previously only printed)
    '''
    res = R_tseriesChaos.embedd(numpy2ri(series), m, tau)
    embedded = np.asmatrix(res)
    # %-formatting prints the same text under Python 2 and 3.
    print("%s %s" % (type(res), embedded))
    return embedded
def plot_hyddiff(data):
    """Export *data* to R as ``data`` and source the two hyd figure scripts."""
    r.assign('data', numpy2ri(data))
    for script in (' source("src/R/figure_sites_hyd.R") ',
                   ' source("src/R/figure_P1_hyd.R") '):
        r(script)
def dv2_manova(DV1, DV2, IV):
    '''
    Fit ``cbind(factor0, factor1) ~ IV`` with R's lm/manova and print the
    Wilks statistics and the aov summary.

    DV1 and DV2 are bound to factor0 and factor1, IV to IV, in the
    formula's environment. Nothing is returned.
    '''
    stats = importr('stats')
    formula = R.formula("cbind(factor0, factor1) ~ IV")
    env = formula.environment
    for r_name, values in (("factor0", DV1), ("factor1", DV2), ("IV", IV)):
        env[r_name] = numpy2ri(values)
    ols_str = stats.lm(formula)
    results = stats.manova(ols_str)
    #report manova test
    wilks_summary = R.summary(results, test='Wilks')
    print(wilks_summary.rx('stats'))
    print(R.summary(R.aov(ols_str)))
def generate_images(data, template):
    """Run ``processData`` (run 3) on the given dose-response data.

    :param data: list of tuples (dose,response,experiment)
    :param template: exposed to R as ``outfilename``
    """
    ro.r("""
        source("graph/R/machPlots.R")
        source("graph/R/processData.R")
    """)
    # zip() is lazy on Python 3; list() lets asarray() build the 2-D array.
    data_array = asarray(list(zip(*data)))
    dose = data_array[0]
    response = data_array[1]
    experiment = data_array[2]
    ro.r.assign('dose', numpy2ri(dose))
    ro.r.assign('response', numpy2ri(response))
    ro.r.assign('experiment', numpy2ri(experiment))
    ro.r.assign('outfilename', template)
    ro.r("""
        mydata = data.frame(dose=dose,response=response,experiment=experiment)
        processData(mydata, title="DRM", xlab="Dose", outfilename=outfilename, cooksfilename='', run=3)
    """)
def multiple_aa_composition_pca(numpy_matrix, path):
    '''
    PCA of multiple datasets, written as a PNG by R (FactoMineR/factoextra).

    The first column of the matrix is used as the color factor; the PCA is
    computed on the remaining columns. NA values are replaced by 0.

    :param numpy_matrix: matrix exported to R as ``Mdata``
    :param path: PNG file path handed to R's ``png()``
    :return: None
    '''
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri as numpy2ri
    # ``import a.b as c`` binds only ``c``; the former
    # ``rpy2.robjects.numpy2ri.activate()`` relied on ``rpy2`` being
    # imported somewhere else. Use the alias that is actually in scope.
    numpy2ri.activate()

    robjects.r.assign('Mdata', numpy2ri.numpy2ri(numpy_matrix))
    robjects.r('''
        library("FactoMineR")
        library("factoextra")
        print(class(Mdata))
        Mdata[is.na(Mdata)] <- 0
        mat <- as.data.frame(Mdata[,2:length(Mdata[1,])])
        print(head(data.matrix(mat)))
        aa.pca <- PCA(data.matrix(mat), graph = FALSE)
        png("%s", height=600, width=600)
        print(fviz_pca_ind(aa.pca, label="none", habillage=as.factor(Mdata[,1]))) # , label="none", habillage=Mdata[,1]
        dev.off()
    ''' % (path))
def save_matrix_R(filename, matrix):
    """Save *matrix* to *filename* with R's ``save()``, under the R variable
    name ``data``."""
    r.assign('data', npr.numpy2ri(matrix))
    r.save('data', file=filename)
def cube_to_r(incube, reverse_dims=True):
    """Convert a cube or numpy array to a data struct recognised by R.

    Arguments:
        incube: single data cube (``iris.cube.Cube``) or a numpy array
        reverse_dims: reverse dimensions (transpose)? Default True

    Returns:
        Data structure that can be passed to a R function
    """
    from rpy2.robjects.numpy2ri import numpy2ri

    # A cube contributes its ``.data``; anything else is used as-is.
    in_data = incube.data if isinstance(incube, iris.cube.Cube) else incube
    if reverse_dims:
        in_data = in_data.transpose()  # numpy method
    # Convert to R structure -- recognises only numpy array
    return numpy2ri(in_data)
def process_ccs(cc_in):
    """Bicluster the top-left 300x300 block of *cc_in* with R's biclust.

    The diagonal of the block is zeroed before clustering.

    :param cc_in: 2-D array; assumed to be a NumPy array - TODO confirm
    :return: tuple ``(rows, cols, m, mb)``: the ``RowxNumber`` slot, the
        transposed ``NumberxCol`` slot, the zero-diagonal block, and the
        block binarised at 0.90, all as arrays
    """
    from rpy2.robjects.packages import importr
    from rpy2.robjects.numpy2ri import numpy2ri

    # Copy the block: if cc_in is a NumPy array, slicing returns a view and
    # zeroing the diagonal would silently modify the caller's data.
    cc = cc_in[:300, :300].copy()
    for i, c in enumerate(cc):
        c[i] = 0

    m = numpy2ri(cc)
    biclust = importr("biclust")
    mb = biclust.binarize(m, 0.90)
    out = biclust.biclust(m, method=biclust.BCPlaid())
    rows = array(out.do_slot("RowxNumber"))
    cols = array(out.do_slot("NumberxCol")).T
    return rows, cols, array(m), array(mb)
def aa_composition_pca(numpy_matrix, target_psoition, path):
    """Plot the first three principal components of *numpy_matrix* as PNG.

    Three score plots (1 vs 2, 1 vs 3, 2 vs 3) are drawn side by side with
    the target row(s) highlighted in red.

    :param numpy_matrix: matrix exported to R as ``Mdata``
    :param target_psoition: spliced verbatim into the R call as the
        ``target`` row selector
    :param path: PNG file path handed to R's ``png()``
    """
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri as numpy2ri
    # ``import a.b as c`` binds only ``c``; the former
    # ``rpy2.robjects.numpy2ri.activate()`` relied on ``rpy2`` being
    # imported somewhere else. Use the alias that is actually in scope.
    numpy2ri.activate()

    robjects.r.assign('Mdata', numpy2ri.numpy2ri(numpy_matrix))
    robjects.r('''
        print(head(Mdata))
        plot_scores<-function(scores,x,y, target){
            plot(scores[,x],scores[,y],xlab=paste("comp.",as.character(x)),ylab=paste("comp.",as.character(y)),
                 xlim=range(scores[,c(x,y)]),ylim=range(scores[,c(x,y)]),cex=1.5,pch=20)
            points(scores[target,x],scores[target,y],pch=18, col="red")
            #text(scores[25,x],scores[25,y],labels="test",col="red",cex=0.9)
            abline(h=0,col=2)
            abline(v=0,col=2)
        }
        visual<-function(groups,clustertable, target){
            pca2 <- princomp(clustertable)
            par(mfrow=c(1,3),pty="s")
            plot_scores(pca2$scores,1,2, target)
            plot_scores(pca2$scores,1,3,target)
            plot_scores(pca2$scores,2,3, target)
        }
        png("%s", height=500, width=1300)
        visual(c(1,2,3), Mdata, %s)
        dev.off()
    ''' % (path, target_psoition))
def dataframe(d):
    """Convert a dict of numbers to an R data.frame.

    :param d: mapping column name -> sequence of values, or ``None``
    :return: an R ``data.frame`` with one column per key, or the result of
        R's ``as.null()`` when *d* is ``None``
    """
    if d is None:
        return robjects.r('as.null()')
    # .items() works on Python 2 and 3 (iteritems is Python 2 only).
    columns = dict((k, numpy2ri(numpy.array(v))) for k, v in d.items())
    return robjects.r['data.frame'](**columns)
def py2ri_pandasindex(obj):
    """Convert a pandas Index: object dtype becomes a ``StrVector``, any
    other dtype is delegated to ``numpy2ri``."""
    if obj.dtype.kind != 'O':
        # pandas2ri should definitely not have to know which paths remain to be
        # converted by numpy2ri
        # Answer: the thing is that pandas2ri builds on the conversion
        # rules defined by numpy2ri - deferring to numpy2ri is allowing
        # us to reuse that code.
        return numpy2ri.numpy2ri(obj)
    return StrVector(obj)
def cluster_verbs(target, source, env):
    # Builder-style entry point (target, source, env).
    # NOTE(review): the unconditional ``return None`` below makes everything
    # after it unreachable, so this function currently only reads the
    # arguments node and returns. Whether that is a deliberate way of
    # disabling the step cannot be told from here - confirm before removing
    # either the return or the dead code.
    args = source[-1].read()
    return None
    # --- unreachable from here on ---
    # Load (datas, verbs) from the pickled first source and pick the
    # ("verb", "verb_class") matrix.
    datas, verbs = pickle.load(open(source[0].rstr(), "rb"))
    data = datas[("verb", "verb_class")]
    # Normalise each row to sum to 1.
    data = numpy.transpose(data.T / data.sum(1))
    if "clusters" in args:
        res = stats.kmeans(numpy2ri(data), centers=args["clusters"])
    else:
        # No cluster count given: choose it with clusGap / maxSE.
        tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
        gaps = tres[:, 2]
        err = tres[:, 3]
        best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
        res = stats.kmeans(numpy2ri(data), centers=best)
    # One output line per cluster: its verbs, space-separated.
    ofd = meta_open(target[0].rstr(), "w")
    for c in set(res.rx2("cluster")):
        ofd.write(" ".join([verbs[i] for i, a in enumerate(res.rx2("cluster")) if a == c]) + "\n")
    return None
def smoothScatter(X,Y,output=None,format='png',new=True,last=True,**kwargs):
    """Draw an R ``smoothScatter`` plot of Y values versus X values.

    Recognised keyword options: ``nbin`` and ``bandwidth`` (each a pair of
    numbers) and ``color`` (list of colors for the color ramp; defaults to
    lightgrey, blue, red).
    """
    plotopt, output = _begin(output=output, format=format, new=new, **kwargs)
    # Optional two-value arguments appended to the smoothScatter call.
    pair_options = (('nbin', ',nbin=c(%i,%i)'),
                    ('bandwidth', ',bandwidth=c(%f,%f)'))
    for key, template in pair_options:
        if key in kwargs:
            plotopt += template % tuple(kwargs[key])
    robjects.r.assign('xdata', numpy2ri.numpy2ri(X))
    robjects.r.assign('ydata', numpy2ri.numpy2ri(Y))
    ramp_colors = kwargs.get("color", ["lightgrey", "blue", "red"])
    robjects.r.assign('colrs', robjects.StrVector(ramp_colors))
    robjects.r("""
        library(graphics)
        colramp = colorRampPalette(colrs,interpolate="spline")
        smoothScatter(xdata,ydata,colramp=colramp%s)
    """ % plotopt)
    _end("", last, **kwargs)
    return output
def cluster_verbs(target, source, env):
    """Cluster verbs with k-means on their normalised sample counts.

    Loads ``(verbs, samples)`` from the pickled first source, sums the
    samples over axis 2, row-normalises, runs R's ``kmeans`` and writes one
    line of space-separated verbs per cluster to ``target[0]``.

    The number of clusters comes from ``args["clusters"]`` (default 20).

    :return: None
    """
    args = source[-1].read()

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # safe if the source file is produced by a trusted build step.
    ifd = meta_open(source[0].rstr())
    try:
        verbs, samples = pickle.load(ifd)
    finally:
        # assumes meta_open returns a file-like object with close() - TODO confirm
        ifd.close()

    samples = samples.sum(2)
    # Normalise each row to sum to 1.
    data = numpy.transpose(samples.T / samples.sum(1))
    res = stats.kmeans(numpy2ri(data), centers=args.get("clusters", 20))

    # Fetch the assignment vector once rather than once per cluster.
    assignments = list(res.rx2("cluster"))
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(assignments):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        ofd.close()
    return None
def hist(X,options=None,output=None,format='pdf',new=True,last=True,**kwargs):
    """Create a histogram of the values in vector *X*.

    :param X: values exported to R as ``X``
    :param options: optional mapping of extra ``hist()`` argument names to
        values; each value is rendered with ``list2r``
    :return: the output value produced by ``_begin``
    """
    # ``None`` replaces the former mutable ``{}`` default; callers that
    # omit the argument see the same behaviour.
    if options is None:
        options = {}
    plotopt,output = _begin(output=output,format=format,new=new,**kwargs)
    # .items() works on Python 2 and 3 (iteritems is Python 2 only).
    rargs = "".join(", %s=%s" % (opt, list2r(val)) for opt, val in options.items())
    robjects.r.assign('X',numpy2ri.numpy2ri(X))
    robjects.r("hist(X %s)" % rargs)
    _end("",last,**kwargs)
    return output
def screw_around():
    """Scratch experiments with rpy2: R constants, data.frame creation from
    a NumPy array and calling functions from an anonymous R package."""
    pi = robj.r['pi']
    # Single-argument print calls behave identically on Python 2 and 3.
    print(pi)
    print(pi+2)
    print(pi[0])
    print(pi[0]+2)

    #create fake binned array
    nrow = 5
    ncol = 10
    # Same values as the former nested xrange loops: 0..49 in row-major order.
    binned = np.arange(nrow * ncol, dtype="float64").reshape(nrow, ncol)

    #get binned array into R data.frame
    print(numpy2ri(binned))
    rdf = robj.r['data.frame'](numpy2ri(binned), code="ID1000")

    # now see if we can get R to use this dataframe
    myRcode = """
        square <- function(rdf) {
            myv = rdf$X2 + rdf$X3
            return(myv)
        }
        doit <- function() {
            source("/srv/scratch/carolyn/Dengue_code/Rtest_rpy.R")
            run_test_wrap(3)
        }
    """
    print("wwwwah")
    powerpack = SignatureTranslatedAnonymousPackage(myRcode, "powerpack")
    print(powerpack._rpy2r.keys())  #to reveal the functions within powerpack
    print(powerpack.square(rdf))  #to run the function "square" found in powerpack
    print(powerpack.doit())
def save_simmat_R(filename, simmat):
    """Save a similarity matrix with row/column labels via R's ``save()``.

    :param filename: path of the R data file to write
    :param simmat: object exposing ``.matrix`` and ``.labels``; the labels
        are used for both row and column names
    """
    rmatrix = npr.numpy2ri(simmat.matrix)
    # Pass the labels to R as data. The old code spliced ``tuple(labels)``
    # into R source text, which produced invalid R for a single label
    # (``c('a',)``), for unicode labels on Python 2 (``u'a'``) and for
    # labels containing quotes.
    labels = r.c(*simmat.labels)
    rmatrix = r['rownames<-'](rmatrix, labels)
    rmatrix = r['colnames<-'](rmatrix, labels)
    r.assign('data', rmatrix)
    r.save('data', file=filename)
def pandas2ri(obj):
    """Convert pandas DataFrame / Index / Series objects to R objects.

    Anything that is not one of these pandas types is handed to
    ``original_py2ri``.
    """
    if isinstance(obj, PandasDataFrame):
        od = OrderedDict()
        for name, values in obj.iteritems():
            # Object columns become string vectors; others recurse.
            if values.dtype.kind == 'O':
                od[name] = StrVector(values)
            else:
                od[name] = pandas2ri(values)
        return DataFrame(od)

    if isinstance(obj, PandasIndex):
        if obj.dtype.kind == 'O':
            return StrVector(obj)
        # only other alternative to 'O' is integer, I think,
        # which goes straight to the numpy converter.
        return numpy2ri.numpy2ri(obj)

    if isinstance(obj, PandasSeries):
        if obj.dtype == '<M8[ns]':
            # time series
            parts = ('year', 'month', 'day', 'hour', 'minute', 'second')
            d = [IntVector([getattr(x, part) for x in obj]) for part in parts]
            res = ISOdatetime(*d)
            #FIXME: can the POSIXct be created from the POSIXct constructor ?
            # (is '<M8[ns]' mapping to Python datetime.datetime ?)
            res = POSIXct(res)
        else:
            # converted as a numpy array
            res = numpy2ri.numpy2ri(obj.values)
        # "index" is equivalent to "names" in R
        if obj.ndim == 1:
            res.do_slot_assign('names', ListVector({'x': pandas2ri(obj.index)}))
        else:
            res.do_slot_assign('dimnames', ListVector(pandas2ri(obj.index)))
        return res

    return original_py2ri(obj)
def old_cluster_verbs(target, source, env):
    """Cluster verbs from a whitespace-separated text dump with k-means.

    Each usable input line is ``<verb> <key> <v1> <v2> ...``; lines whose
    first token starts with ``_`` and rows summing to 0 are skipped. The
    vector stored under the key ``"_<feat>"`` is used per verb
    (``feat`` = ``args["feat"]``, default ``"class"``). The cluster count
    is ``args["clusters"]`` if given, otherwise chosen with clusGap/maxSE.
    One line of space-separated verbs per cluster is written to
    ``target[0]``.

    :return: None
    """
    args = source[-1].read()
    feat = args.get("feat", "class")
    feat_key = "_%s" % feat

    all_data = {}
    ifd = open(source[0].rstr())
    try:
        for line in ifd:
            toks = line.strip().split()
            # Blank lines used to raise IndexError on toks[0].
            if not toks or toks[0].startswith("_"):
                continue
            verb = toks[0]
            other = toks[1]
            vals = [float(x.strip("[],")) for x in toks[2:]]
            if sum(vals) > 0:
                all_data.setdefault(verb, {})[other] = vals
    finally:
        ifd.close()

    # list(...) because dict.values() is not indexable on Python 3.
    first_entry = list(all_data.values())[0]
    data = numpy.zeros(shape=(len(all_data), len(first_entry[feat_key])))
    verbs = sorted(all_data.keys())
    for i, verb in enumerate(verbs):
        data[i, :] = all_data[verb][feat_key]
    # Normalise each row to sum to 1.
    data = numpy.transpose(data.T / data.sum(1))

    if "clusters" in args:
        res = stats.kmeans(numpy2ri(data), centers=args["clusters"])
    else:
        tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
        gaps = tres[:, 2]
        err = tres[:, 3]
        best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
        res = stats.kmeans(numpy2ri(data), centers=best)

    # Fetch the assignment vector once rather than once per cluster.
    assignments = list(res.rx2("cluster"))
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(assignments):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        # assumes meta_open returns a file-like object with close() - TODO confirm
        ofd.close()
    return None
def cluster_by_valex(target, source, env):
    """Cluster verbs by their VALEX frame counts and write one cluster per line.

    :param target: sequence whose first element's ``rstr()`` gives the output path.
    :param source: sequence whose first element's ``rstr()`` is an instances
        file loaded through ``graphmod`` and whose last element's ``read()``
        returns the options mapping (``"lexicon"`` is required).
    :param env: mapping providing ``"VALEX_LEXICON"``, the lexicon root directory.
    :returns: ``None``.

    Only verbs that appear as ``verb_lemma`` names in the instances file are
    considered.  For each matching file under
    ``<VALEX_LEXICON>/lex-<lexicon>/`` the ``:CLASSES`` / ``FREQCNT`` pairs are
    extracted; the first number inside ``:CLASSES (...)`` is used as the frame
    id and ``FREQCNT`` as its count.  Rows are divided by their sum, the
    number of centers is chosen with ``rcluster.clusGap`` / ``rcluster.maxSE``
    and the verbs are clustered with ``stats.kmeans``.
    """
    import graphmod as gm

    args = source[-1].read()

    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    target_verbs = set()
    for vid in range(instances.get_size("verb_lemma")):
        target_verbs.add(instances.get_name("verb_lemma", vid))

    data = {}
    scfs = {}
    verbs = {}
    lexicon_dir = "%s/lex-%s" % (env["VALEX_LEXICON"], args["lexicon"])
    for fname in sorted(glob(os.path.join(lexicon_dir, "*"))):
        verb = os.path.basename(fname).split(".")[0]
        if verb not in target_verbs:
            continue
        data[verb] = {}
        # Read and close each lexicon file explicitly; the handle used to leak.
        ifd = meta_open(fname)
        try:
            text = ifd.read()
        finally:
            ifd.close()
        for m in re.finditer(r":CLASSES \((.*?)\).*\n.*FREQCNT (\d+)", text):
            scf = int(m.group(1).split()[0])
            count = int(m.group(2))
            scfs[scf] = scfs.get(scf, 0) + count
            verbs[verb] = verbs.get(verb, 0) + count
            data[verb][scf] = count

    # Only verbs/frames that produced at least one match become rows/columns.
    ddata = numpy.zeros(shape=(len(verbs), len(scfs)))
    verbs = sorted(verbs)
    scfs = sorted(scfs)
    for row, verb in enumerate(verbs):
        for col, scf in enumerate(scfs):
            ddata[row, col] = data[verb].get(scf, 0)
    # Divide every row by its own sum.
    data = numpy.transpose(ddata.T / ddata.sum(1))

    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)

    # Fetch the assignment vector once rather than once per cluster.
    assignments = list(res.rx2("cluster"))
    ofd = meta_open(target[0].rstr(), "w")
    try:
        # sorted() makes the line order explicit instead of set-iteration order.
        for c in sorted(set(assignments)):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        # Guarantees the buffered output is flushed even if a write fails.
        ofd.close()
    return None
def R_correlationIntegral(series, tau, m, t, r):
    '''
    Call tseriesChaos ``C2`` on ``series`` and print the first element of
    the result.  Nothing is returned.

    http://cran.r-project.org/web/packages/tseriesChaos/tseriesChaos.pdf
    C2(series, m, d, t, eps)

    series: time series (converted with numpy2ri before the call)
    tau: time delay (passed as C2's ``d``)
    m: embedding dimension
    t: Theiler window
    r: length scale (passed as C2's ``eps``)
    '''
    res = R_tseriesChaos.C2(numpy2ri(series), m, tau, t, r)
    # Function-call form prints the same on Python 2 and is valid on Python 3.
    print(res[0])
def fit(self, X, y):
    """Fit an R ``randomForest`` model on ``X`` / ``y``.

    ``self.max_features`` is resolved to a feature count (``mtry``):

    * ``"auto"`` or ``"sqrt"``: ``max(1, int(sqrt(n_features)))``
    * ``"log2"``: ``max(1, int(log2(n_features)))``
    * ``None``: all features
    * an integer: used as is
    * anything else is treated as a fraction of ``n_features``

    :param X: 2-D array; ``X.shape[1]`` is stored as ``self.n_features_``.
    :param y: labels; the sorted unique values are stored as
        ``self.classes_`` and ``y`` is re-encoded as 1-based positions in it
        before being wrapped in an R factor.
    :returns: ``self``, with the fitted R object in ``self.model_``.
    :raises ValueError: if ``self.max_features`` is an unrecognised string.
    """
    # Check params
    self.n_features_ = X.shape[1]

    if isinstance(self.max_features, str):
        if self.max_features in ("auto", "sqrt"):
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                'Invalid value for max_features. Allowed string '
                'values are "auto", "sqrt" or "log2".')
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, (numbers.Integral, np.integer)):
        max_features = self.max_features
    else:  # float
        max_features = int(self.max_features * self.n_features_)

    params = {
        "mtry": max_features,
        # R's randomForest names this argument `ntree`; the former `ntrees`
        # key matched no formal argument, so n_estimators had no effect.
        "ntree": self.n_estimators,
        "nodesize": self.min_samples_leaf,
    }

    # Convert data
    self.classes_ = np.unique(y)
    # +1 turns the 0-based positions from searchsorted into 1-based codes.
    y = np.searchsorted(self.classes_, y) + 1
    X = numpy2ri(X)
    y = ro.FactorVector(numpy2ri(y))

    # Run
    self.model_ = rf.randomForest(X, y, **params)

    return self
def numpy2ri_avoiding_zerodim(x):
    """Run ``numpy2ri`` on ``x``, first unwrapping zero-dimensional values.

    When ``x`` has a ``shape`` attribute equal to ``()``, it is cast to the
    plain Python scalar type matching its ``dtype.kind`` (bool, int, float
    or complex).  Values of any other dtype kind, and anything that is not
    zero-dimensional, are handed to ``numpy2ri`` untouched.
    """
    # dtype.kind code -> Python scalar constructor
    scalar_casts = {
        'b': bool,
        'u': int,
        'i': int,
        'f': float,
        'c': complex,
    }
    is_zero_dim = hasattr(x, 'shape') and x.shape == ()
    if is_zero_dim:
        # cast into normal python scalar...sigh
        cast = scalar_casts.get(x.dtype.kind)
        if cast is not None:
            x = cast(x)
        # unknown kinds: just pass it along
    return numpy2ri(x)