def bootdensity(data, min, max, nboot, ci):
    """
    Calculate density and confidence intervals on density
    for a 1D array of points. Bandwidth is selected automatically.
    """
    r("""
    limdensity <- function(data, weights=NULL, bw="nrd0")
    {
        density(data, from=%f, to=%f, weights=weights, bw=bw)
    }
    """ % (min, max))
    density = r.limdensity(data)
    xdens = N.array(density['x'])
    ydens = N.array(density['y'])
    bw = density['bw']
    #print 'bandwidth:', bw
    ydensboot = N.zeros((nboot, len(xdens)), N.float)
    ndata = len(data)
    ran = N.random.uniform(0, ndata, (nboot, ndata)).astype(N.int)
    # bootstrap: re-estimate the density on nboot resampled datasets
    for i in range(nboot):
        den = r.limdensity(data[ran[i]])
        y = N.array(den['y'])
        ydensboot[i] = y
    ydensbootsort = N.sort(ydensboot, axis=0)
    ydensbootsort = interp1d(N.arange(0, 1.000001, 1.0/(nboot-1)),
                             ydensbootsort, axis=0)
    ilow = 0.5 - ci/2.0
    ihigh = 0.5 + ci/2.0
    ydenslow, ydenshigh = ydensbootsort((ilow, ihigh))
    ydenslow = gaussian_filter1d(ydenslow, bw*512/10.0)
    ydenshigh = gaussian_filter1d(ydenshigh, bw*512/10.0)
    return xdens, ydens, ydenslow, ydenshigh, bw

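# Usage sketch (illustrative, not from the original source): `pts` stands in
# for a 1-D numpy array of sample points in [0, 1]; a 68% bootstrap band is
# requested around the kernel density estimate.
#   xdens, ydens, ylow, yhigh, bw = bootdensity(pts, 0.0, 1.0, nboot=1000, ci=0.68)
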
def testHMM_paral(self):
    server_url = self.server_url
    self.get(server_url + "/", description="Get /")
    self.post(server_url + "/cgi-bin/adacghR.cgi",
              params=[['acghData', Upload("empty.txt")],
                      ['positionInfo', Upload("empty.txt")],
                      ['twofiles', 'One.file'],
                      ['acghAndPosition', Upload("two.sample.shuffled.num.test")],
                      ['centering', 'None'],
                      ['methodaCGH', 'HMM'],
                      ['Wave.minDiff', '0.25'],
                      ['Wave.merge', 'Yes'],
                      ['PSW.nIter', '1000'],
                      ['PSW.p.crit', '0.15'],
                      ['ACE.fdr', '0.15'],
                      ['MCR.gapAllowed', '500'],
                      ['MCR.alteredLow', '0.03'],
                      ['MCR.alteredHigh', '0.97'],
                      ['MCR.recurrence', '75'],
                      ['organism', 'None'],
                      ['idtype', 'None']],
              description="HMM; numerical test of parallel run")
    final_output = 'Segmented data plots'
    common_part(self, final_output)
    url_before_get = self.getLastUrl()
    urlretrieve(server_url + url_before_get.replace('results.html', 'HMM.output.txt'),
                filename='HMM.web.output.txt')
    import rpy
    ## to verify numerical output
    print '########## @@@@@@@@@@@@@@ Testing HMM'
    tmp = rpy.r('source("test-num.R")')
    tmp = rpy.r('test.hmm()')
    print tmp

def plotNumLegend(self, colVecL, breakL, nb_breaks, filename=None, legendDir=None,
                  type='png', int_labels=False):
    if filename is None:
        filename = 'legend_%i_%i_%i' % (len(colVecL), min(breakL), max(breakL))
    if legendDir is None:
        legendDir = self.legendDir
    full_filename = os.path.join(legendDir, filename)
    max_break = max(breakL)
    min_break = min(breakL)
    # evenly spaced tick positions spanning [min_break, max_break]
    tickBreakL = [float(x) / (nb_breaks - 1) * (max_break - min_break) + min_break
                  for x in range(nb_breaks)]
    if int_labels:
        labels = ['%i' % int(x) for x in tickBreakL]
    else:
        labels = ['%.3f' % x for x in tickBreakL]
    rdev = plot_utilities.RDevice(name=full_filename, title='', plotType=type,
                                  width=640, height=120)
    r("par(mar=c(3,1,0,1), las=2)")
    #legendA = rpy.reshape(breakL, (len(breakL), 1))
    legendA = r.matrix(breakL, byrow=False, ncol=1)
    r.image(legendA, 1, legendA, col=colVecL, axes=False, ann=False)
    r.box()
    r.axis(1, at=tickBreakL, labels=labels, tick=True, line=0, cex=0.8, cex_axis=0.8)
    rdev.close()
    return

def Monocle(self):
    print 'Loading monocle package in R'
    print_out = r('library("monocle")')
    if "Error" in print_out:
        print 'Installing the R package "monocle" in Config/R'
        print_out = r('source("http://bioconductor.org/biocLite.R"); biocLite("monocle")')
        print print_out
        print_out = r('library("monocle")')

def make_L(data, direction='S', z=None):
    """
    Define the along-track distance from one reference point.

    direction defines the cardinal direction priority (N, S, W or E).
    'S' means that the reference will be the southernmost point.

    z defines the bathymetry; if given, the closest point to that bathymetry
    will be the reference. In case this bathymetry is crossed more than once,
    the direction criterion is used to distinguish.
    """
    from fluid.common.distance import distance
    all_cycles_data = join_cycles(data)

    if z is None:
        import rpy
        for t in all_cycles_data:
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"),
                                    data=rpy.r.data_frame(x=all_cycles_data[t]['Longitude'],
                                                          y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef = rpy.r.coef(linear_model)
            # only the 'S' (southernmost) reference is implemented here
            if direction == 'S':
                lat0 = all_cycles_data[t]['Latitude'].min() - 1
                lon0 = (lat0 - coef['(Intercept)']) / coef['x']
            L_correction = distance(all_cycles_data[t]['Latitude'],
                                    all_cycles_data[t]['Longitude'],
                                    lat0, lon0).min()
            for c in invert_keys(data)[t]:
                data[c][t]['L'] = distance(data[c][t]['Latitude'],
                                           data[c][t]['Longitude'],
                                           lat0, lon0) - L_correction
    # This bathymetric method was only copied from an old code. It should
    # at least be changed, if not removed.
    else:
        import rpy
        for t in all_cycles_data:
            # First define the near-coast values.
            idSouth = numpy.argmin(all_cycles_data[t]['Latitude'])
            L_tmp = distance(all_cycles_data[t]['Latitude'],
                             all_cycles_data[t]['Longitude'],
                             all_cycles_data[t]['Latitude'][idSouth],
                             all_cycles_data[t]['Longitude'][idSouth])
            idNearCoast = L_tmp.data < 400e3
            if min(all_cycles_data[t]['Bathy'][idNearCoast]) > -z:
                idNearCoast = L_tmp.data < 600e3
            # Then calculate the distance to a reference.
            rpy.set_default_mode(rpy.NO_CONVERSION)
            linear_model = rpy.r.lm(rpy.r("y ~ x"),
                                    data=rpy.r.data_frame(x=all_cycles_data[t]['Longitude'],
                                                          y=all_cycles_data[t]['Latitude']))
            rpy.set_default_mode(rpy.BASIC_CONVERSION)
            coef = rpy.r.coef(linear_model)
            lat0 = all_cycles_data[t]['Latitude'].min() - 1
            lon0 = (lat0 - coef['(Intercept)']) / coef['x']
            # reference: the near-coast point closest to bathymetry -z
            idref = numpy.argmin(numpy.absolute(all_cycles_data[t]['Bathy'][idNearCoast] + z))
            L_correction = distance(all_cycles_data[t]['Latitude'][idNearCoast][idref],
                                    all_cycles_data[t]['Longitude'][idNearCoast][idref],
                                    lat0, lon0)
            for c in invert_keys(data)[t]:
                data[c][t]['L'] = distance(data[c][t]['Latitude'],
                                           data[c][t]['Longitude'],
                                           lat0, lon0) - L_correction
    return

def get_by_index(spss_data_file, gene_index_1, gene_index_2, trait_index_1, trait_index_2):
    r("data = read.spss(file='" + spss_data_file + "',to.data.frame=TRUE)")
    gene_index = str(gene_index_1) + ":" + str(gene_index_2)
    genes = r("names(data[" + gene_index + "])")
    trait_index = str(trait_index_1) + ":" + str(trait_index_2)
    traits = r("names(data[" + trait_index + "])")
    return genes, traits

def __init__(self, x, y, **kwargs):
    self.nvar = 1 if x.ndim == 1 else x.shape[1]
    assert (x.ndim == 1 and x.size == y.size) or \
           (x.ndim == 2 and x.shape[0] == y.size), \
           "X and Y inputs must have same number of rows"
    assert self.nvar < 5, "Maximum number of predictors is 4"
    df = with_mode(NO_CONVERSION, r.data_frame)(x=x, y=y.flatten())
    if x.ndim == 1:
        model = r("y ~ x")
    else:
        # build the formula from the actual number of predictors
        model = r("y ~ " + ' + '.join('x.%d' % (i + 1) for i in range(self.nvar)))
    self.smoother = with_mode(NO_CONVERSION, r.loess)(model, data=df, **kwargs)

def read_directory(sub_dir):
    dir = os.path.dirname(__file__)
    #print "Working Directory:", r('getwd()')
    working_dir = dir + '/' + sub_dir[1:]
    setwd = 'setwd("%s")' % working_dir
    r(setwd)
    #print "Working Directory:", r('getwd()')
    dir_list = os.listdir(dir + '/' + sub_dir[1:])
    dir_list2 = []
    # only keep data files, so that folder names are not included
    for entry in dir_list:
        if entry[-4:] == ".txt" or entry[-4:] == ".csv":
            dir_list2.append(entry)
    return dir_list2

def ALS(s, thresh=.001, nonnegS=True, nonnegC=True):
    """Alternate least squares

    Wrapper around R's ALS package.

    Parameters
    ----------
    s : Spectrum instance
    thresh : float
        convergence criterion
    nonnegS : bool
        if True, impose a non-negativity constraint on the components
    nonnegC : bool
        if True, impose a non-negativity constraint on the maps

    Returns
    -------
    Dictionary
    """
    import_rpy()
    # Formats:
    #   ic : (channels, components)
    #   W  : (experiment, components)
    #   s  : (experiment, channels)
    nonnegS = 'TRUE' if nonnegS is True else 'FALSE'
    nonnegC = 'TRUE' if nonnegC is True else 'FALSE'
    print "Non negative constraint in the sources: ", nonnegS
    print "Non negative constraint in the mixing matrix: ", nonnegC
    refold = unfold_if_2D(s)
    W = s._calculate_recmatrix().T
    ic = np.ones(s.ic.shape)
    rpy.r.library('ALS')
    rpy.r('W = NULL')
    rpy.r('ic = NULL')
    rpy.r('d1 = NULL')
    rpy.r['<-']('d1', s.data_cube.squeeze().T)
    rpy.r['<-']('W', W)
    rpy.r['<-']('ic', ic)
    # Workaround for a bug in python rpy version 1: find an unused result name
    i = 0
    while hasattr(rpy.r, 'test' + str(i)):
        rpy.r('test%s = NULL' % i)
        i += 1
    rpy.r('test%s = als(CList = list(W), thresh = %s, S = ic, '
          'PsiList = list(d1), nonnegS = %s, nonnegC = %s)'
          % (i, thresh, nonnegS, nonnegC))
    if refold:
        s.fold()
    exec('als_result = rpy.r.test%s' % i)
    return als_result

def r_to_str(robj):
    "Returns an R object serialized to its ASCII save() representation."
    from rpy import r
    from tempfile import mktemp
    tmpfile = mktemp()
    #logging.info('Tmpfile: %s' % tmpfile)
    try:
        r.assign('tmpobj', robj)
        r('save(tmpobj, file="%s", ascii=TRUE)' % tmpfile)
        return open(tmpfile).read()
    finally:
        if os.access(tmpfile, os.R_OK):
            os.remove(tmpfile)

def plot_entry_list(log_entry_list, binning=None, binning_dict=None, bin_widths=None,
                    bin_widths_dict=None, support_dict=None, xlab=None, ylab="",
                    normalize_log_space=True, xmin=None, xmax=None, main=None,
                    bin_numbers=True):
    entries = convert_log_entries(log_entry_list)
    points = []
    for entry in entries:
        number, fullname, tarray = entry
        this_ylab = ylab
        # Decide on which binning to use
        if binning_dict != None and binning_dict.has_key(number):
            bins = binning_dict[number][2]
        elif binning != None and len(binning) == 1 and len(binning[0][2]) - 1 == len(tarray):
            bins = binning[0][2]
        else:
            bins = None
        # Normalize by bin width, if the bin width is available
        if bin_widths_dict != None and bin_widths_dict.has_key(number):
            widths = bin_widths_dict[number][2]
        elif bin_widths != None and len(bin_widths) == 1 and len(bin_widths[0][2]) == len(tarray):
            widths = bin_widths[0][2]
        else:
            widths = None
        if widths != None:
            if normalize_log_space:
                tarray -= log(widths)
                this_ylab = r("expression(%s - ln(Delta[bin]))" % ylab)
            else:
                # Note: /= won't work, since tarray might be an int array
                tarray = tarray / widths
                this_ylab = r("expression(%s / Delta[bin])" % ylab)
        # Decide on which support to use
        if support_dict != None and support_dict.has_key(number):
            support = support_dict[number][2]
        else:
            support = None
        # Do the plotting
        p = plot_name_array(fullname, tarray, 1, counts=False, bins=bins,
                            support=support, makenewplot=True, xlab=xlab,
                            ylab=this_ylab, xmin=xmin, xmax=xmax, main=main,
                            bin_numbers=bin_numbers)
        points.append((number, fullname, p))
    return points

def chisq_test(*samples):
    """do chi-square test on contingency table
    samples [(n, m),...]
    """
    data = transpose(samples)
    cmd = "chisq.test(data.frame(t=c(%s),f=c(%s)))" % (cjoin(data[0]), cjoin(data[1]))
    return rpy.r(cmd)["p.value"]

def chisq(m1, n1, m2, n2, **kargs):
    """do chi-square test on the 2x2 contingency table built from
    m1 successes out of n1 and m2 successes out of n2
    """
    cmd = "chisq.test(matrix(c(%d, %d, %d, %d), nc=2))" \
          % (m1, n1 - m1, m2, n2 - m2)
    return rpy.r(cmd)['p.value']

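# Usage sketch (illustrative, not from the original source): p-value for
# 30/100 successes vs 50/120 under the chi-square test on the 2x2 table.
#   p = chisq(30, 100, 50, 120)
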
def read_data_old(spss_data_file):
    r("library('foreign')")
    data_sheet = r("read.spss(file='" + spss_data_file + "',to.data.frame=TRUE)")
    # column labels are available as data_sheet.keys()
    return data_sheet

def _generate_simplex_trajectories(r, p, mins, maxs, h):
    x = np.zeros((r * (p + 1), p))
    simpf = rpy.r("sensitivity:::random.simplexes")
    minsr = rpy.r.FloatVector(mins)
    maxsr = rpy.r.FloatVector(maxs)
    x[:, :] = simpf(p, r, minsr, maxsr, h)
    return x.reshape((r, p + 1, p))

def verify_FishersExact(data_name):
    """Verify output from Fisher's exact test for IxJ contingency tables."""
    print "\n\n\n******* Verifying Fisher's test results \n"
    rpy.r('load("' + FILES_DIR + 'fisher.verified.RData")')
    fisherConnect = flstandalone.NumTesting()
    fisherConnect.setUp('http://pomelo2.bioinfo.cnio.es')
    fisherConnect.send_get_pomelo(FILES_DIR + data_name + '.data.txt',
                                  FILES_DIR + data_name + '.labels.txt',
                                  FILES_DIR + 'empty.txt',
                                  'FisherIxJ', '2')
    ## time.sleep(50)
    r_read = 'fisherPomelo <- readPomeloOutput()'
    r_compare = 'comparePomelo(fisherPomelo, fisher.pv, fisher = TRUE)'
    rpy.r(r_read)
    out_comparison = rpy.r(r_compare)
    if out_comparison == 'ERROR: test failed':
        raise AsteriasAssertionError

def fisher(m1, n1, m2, n2, **kargs):
    """do Fisher's exact test on the 2x2 contingency table built from
    m1 successes out of n1 and m2 successes out of n2
    """
    alternative = kargs.get('alternative', 'greater')
    cmd = 'fisher.test(matrix(c(%d, %d, %d, %d), nc=2), alternative="%s")' \
          % (m1, n1 - m1, m2, n2 - m2, alternative)
    return rpy.r(cmd)['p.value']

def pure_linear_model_via_R(cls, non_NA_genotype_ls, non_NA_phenotype_ls,
                            non_NA_phenotype2count=None):
    """
    2010-2-25
        use createDesignMatrix() to generate a design matrix
    2009-8-28
        split out of pure_linear_model(). same functionality as
        pure_linear_model(), but invoke R to run the regression.
    """
    genotype_matrix = cls.createDesignMatrix(non_NA_genotype_ls)
    no_of_rows = genotype_matrix.shape[0]  # number of observations
    # 2008-11-10 do linear regression by R
    genotype_var = numpy.var(genotype_matrix[:, 0])  # var=\sum(x_i-\bar{x})^2/(n-1)
    rpy.set_default_mode(rpy.NO_CONVERSION)  # 04-07-05
    formula_list = []
    data_frame_dict = {"phenotype": non_NA_phenotype_ls}
    for i in range(genotype_matrix.shape[1]):
        var_name = "genotype%s" % i
        formula_list.append(var_name)
        data_frame_dict.update({var_name: genotype_matrix[:, i]})
    data_frame = rpy.r.as_data_frame(data_frame_dict)
    formula = "phenotype~%s" % "+".join(formula_list)
    if non_NA_phenotype2count and len(non_NA_phenotype2count) == 2:
        # binary phenotype, use logistic regression
        lm_result = rpy.r.glm(rpy.r(formula), data=data_frame, family=rpy.r("binomial"))
    else:
        lm_result = rpy.r.glm(rpy.r(formula), data=data_frame)
    # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary_stat = rpy.r.summary(lm_result)
    # 06-30-05 index 0 in summary_stat['coefficients'] is the intercept
    coeff_list = []
    coeff_p_value_list = []
    for i in range(len(summary_stat["coefficients"])):
        coeff_list.append(summary_stat["coefficients"][i][0])           # 0 is the coefficient
        coeff_p_value_list.append(summary_stat["coefficients"][i][-1])  # -1 is the corresponding p-value
    pvalue = coeff_p_value_list[1]
    residuals = summary_stat["deviance"]
    geno_effect_var = genotype_var * coeff_list[1] * coeff_list[1] * (no_of_rows - 1)
    var_perc = geno_effect_var / (residuals + geno_effect_var)
    pdata = PassingData(pvalue=pvalue, var_perc=var_perc,
                        coeff_list=coeff_list,
                        coeff_p_value_list=coeff_p_value_list)
    return pdata

def lm(self, l, h):
    for i in range(l, h + 1):
        data_frame, data_model = self.mount_reg_params(i)
        print data_model
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = r.lm(r(data_model), data=data_frame)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print r.summary(linear_model)['r.squared']

def platt_opts(light, params):
    """
    Adjust `opt` values of PAR levels following the Platt model.

    Parameters
    ----------
    light : arr
        Generally PAR values (Photosynthetically Active Radiation,
        which interferes with primary production).
    params : arr
        Values of (alpha, Beta, etrmax).

    Returns
    -------
    opts : arr
        Values optimized according to `params` and the list of PAR levels.
    """
    opts = []
    r.assign("light", light[~np.isnan(light)])
    r.assign("params", params)
    min_opt = r("""
        min_opt <- function(light, params){
            alpha <- params[1]
            Beta  <- params[2]
            Ps    <- params[3]
            return( Ps*(1-exp(-alpha*light/Ps))*exp(-Beta*light/Ps) )
        }""")
    opts = np.append(opts, r('min_opt(light, params)'))
    return opts

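# Usage sketch (illustrative, not from the original source): `par_levels` is a
# hypothetical 1-D numpy array of PAR values; the (alpha, Beta, etrmax) triple
# would normally come from a previous platt() fit (0.4, 1.5, 1500 are the
# initial values that appear elsewhere in this code base).
#   opts = platt_opts(par_levels, [0.4, 1.5, 1500])
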
def regression(data):
    """Calls R's lm to regress the first column of `data` on the second."""
    reg = r.lm(r('x ~ y'),
               data=r.data_frame(x=data[:, 0], y=data[:, 1]))['coefficients']
    return reg

def plot_bin_widths(entries, log_space=False, main=None):
    for entry in entries:
        number, fullname, tarray = entry
        if log_space:
            tarray = numpy.log(tarray)
            ylab = r("expression(ln(Delta[bin]))")
        else:
            ylab = r("expression(Delta[bin])")
        if main == None:
            this_main = fullname
        else:
            this_main = main
        if len(tarray) > 0:
            r.plot(range(len(tarray)), tarray, xlab="bin number",
                   ylab=ylab, main=this_main)

def anova2(values, factor1, factor2, factor1name="factor1", factor2name="factor2",
           interaction=True):
    """
    python wrapper for a two-way ANOVA in R with an optional
    interaction term (default=True)
    """
    # build a dataframe for R
    dataframe = {}
    dataframe["feature"] = values
    dataframe["factor1"] = factor1
    dataframe["factor2"] = factor2
    r.assign("df", dataframe)
    r("df$factor1 <- factor( df$factor1 )")
    r("df$factor2 <- factor( df$factor2 )")
    # run the model
    results = r("anova( lm( df$feature ~ df$factor1 %s df$factor2 ) )"
                % ("*" if interaction else "+"))
    r("rm( list=ls() )")
    # convert R results to table
    colheads = ["Df", "Sum Sq", "Mean Sq", "F value", "Pr( >F )"]
    rowheads = [factor1name, factor2name]
    rowheads += ["int term", "error"] if interaction else ["error"]
    ndictData = {}
    for rowhead in results.keys():
        for index, name in zip(range(len(rowheads)), rowheads):
            dictName = ndictData.setdefault(name, {})
            dictName[rowhead] = results[rowhead][index]
    # return as zopy table
    return nesteddict2table(ndictData, rowheads, colheads)

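# Usage sketch (illustrative, not from the original source): two crossed
# factors over one response vector; interaction disabled so the tiny example
# keeps a nonzero error df.
#   tbl = anova2([3.1, 2.9, 4.2, 4.0], ["a", "a", "b", "b"],
#                ["x", "y", "x", "y"], "treatment", "batch", interaction=False)
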
def binom(m1, n1, m2, n2, **kargs):
    """do binomial test: are m1 successes out of n1 consistent with
    the success rate m2/n2?
    """
    p = float(m2) / n2
    alternative = kargs.get('alternative', 'greater')
    cmd = 'binom.test(c(%d, %d), p=%f, alternative="%s")' \
          % (m1, n1 - m1, p, alternative)
    return rpy.r(cmd)['p.value']

def model_mean_and_variance(meanvar_ests):
    """Regression models of mean and var as functions of major allele len.

    NOTE: since alleles are already normalized to the major allele (e.g.,
    allele len=0 is the major allele), we're modeling error of the means
    and variances.

    LATER: Use several regression formulae to see how things look and choose
    the best fit? For now, the log regression seems (simply by eye) to be
    the better fit."""
    # Weights are just the number of observed sites for each major allele
    weights = meanvar_ests['count']
    adjmean = [na(mean) for mean in meanvar_ests['mean']]
    adjvar = [na(var) for var in meanvar_ests['var']]
    lmdata = rpy.r.data_frame(major=meanvar_ests['major'], mean=adjmean, var=adjvar)
    meanmodel = rpy.r.lm(rpy.r("mean ~ log(major)"), data=lmdata, weights=weights)
    varmodel = rpy.r.lm(rpy.r("var ~ log(major)"), data=lmdata, weights=weights)
    return meanmodel, varmodel

def rpart_fit_and_predict(self, all_data, known_data, rpart_cp, loss_matrix,
                          prior_prob, bit_string='11111'):
    """
    11-09-05
        1st use known_data to get the fit model
        2nd use the fit model to do prediction on all_data,
            result is the prob for each class
    11-09-05 add rpart_cp
    11-17-05 add loss_matrix, prior_prob
        return two predictions
    """
    sys.stderr.write("rpart fitting and predicting...\n")
    r.library("rpart")
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'gradient']
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            formula_list.append(coeff_name_list[i])
    # 11-17-05 transform into array
    all_data = array(all_data)
    known_data = array(known_data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame({"p_value": known_data[:, 0],
                                  "recurrence": known_data[:, 1],
                                  "connectivity": known_data[:, 2],
                                  "cluster_size": known_data[:, 3],
                                  "gradient": known_data[:, 4],
                                  "is_correct": known_data[:, -1]})
    if prior_prob:
        prior_prob = [prior_prob, 1 - prior_prob]  # get the full list
        fit = r.rpart(r("is_correct~%s" % '+'.join(formula_list)),
                      data=data_frame, method="class",
                      control=r.rpart_control(cp=rpart_cp),
                      parms=r.list(prior=prior_prob, loss=r.matrix(loss_matrix)))
    else:
        fit = r.rpart(r("is_correct~%s" % '+'.join(formula_list)),
                      data=data_frame, method="class",
                      control=r.rpart_control(cp=rpart_cp),
                      parms=r.list(loss=r.matrix(loss_matrix)))
    set_default_mode(BASIC_CONVERSION)
    pred_training = r.predict(fit, data_frame, type=["class"])
    del data_frame
    set_default_mode(NO_CONVERSION)
    all_data_frame = r.as_data_frame({"p_value": all_data[:, 0],
                                      "recurrence": all_data[:, 1],
                                      "connectivity": all_data[:, 2],
                                      "cluster_size": all_data[:, 3],
                                      "gradient": all_data[:, 4],
                                      "is_correct": all_data[:, -1]})
    set_default_mode(BASIC_CONVERSION)
    pred = r.predict(fit, all_data_frame, type=["class"])  # 11-17-05 type=c("class")
    del all_data_frame
    sys.stderr.write("Done rpart fitting and predicting.\n")
    return pred, pred_training

def plotLabelLegend(self, colVecL, labelPhenoD, filename=None, legendDir=None, type='png'):
    if filename is None:
        filename = 'legend_label_%i' % len(labelPhenoD.keys())
    if legendDir is None:
        legendDir = self.legendDir
    full_filename = os.path.join(legendDir, filename)
    labelL = labelPhenoD.keys()
    labelL.sort()
    phenoL = [labelPhenoD[x] for x in labelL]
    rdev = plot_utilities.RDevice(name=full_filename, title='', plotType=type,
                                  width=300, height=200)
    r("par(mar=c(0,0,0,0))")
    r.plot([1, 2, 3], type="n", xlab="", ylab="", main="", ann=False, axes=False)
    r.legend(x="center", legend=phenoL, fill=colVecL, bg="white", bty="n", cex=0.7)
    rdev.close()

def Multtest(self, test_type):
    r('library("multtest")')
    filename = self.File()
    if 'input' in filename:
        output_file = string.replace(filename, 'input', 'output')
    else:
        output_file = filename[0:-4] + '-output.txt'
    print "Beginning to process", filename
    # file paths must be quoted inside the R commands
    parse_line = 'job<-read.table("%s",sep="\t", row.names=1, as.is=T)' % filename
    print_out = r(parse_line)
    print_out = r('matrix_size<-dim(job)')
    print_out = r('label<-job[1,2:matrix_size[2]]')
    print_out = r('jobdata<-job[2:matrix_size[1],2:matrix_size[2]]')
    if test_type == "f":
        print_out = r('ttest<-mt.maxT(jobdata,label, test="f", B=50000)')
    if test_type == "t":
        print_out = r('ttest<-mt.maxT(jobdata,label)')
    print_out = r('ttest2<-ttest[order(ttest[,1]),]')
    write_file = 'write.table(ttest2,"%s",sep="\t")' % output_file
    print_out = r(write_file)
    print "Results written to:", output_file

def randomForest_fit(self, known_data, parameter_list, bit_string="1111111"):
    """
    03-17-06
    2006-10-30 add avg_degree(vertex_gradient) and unknown_cut_off
    """
    if self.debug:
        sys.stderr.write("Fitting randomForest...\n")
    mty = parameter_list[0]
    from rpy import r
    # better than r.library("randomForest", lib_loc=os.path.join(lib_path, "R"))
    # (see plone doc)
    r._libPaths(os.path.join(lib_path, "R"))
    r.library("randomForest")
    coeff_name_list = ["p_value", "recurrence", "connectivity", "cluster_size",
                       "gradient", "avg_degree", "unknown_ratio"]  # 2006-10-30
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == "1":
            formula_list.append(coeff_name_list[i])
    formula = r("is_correct~%s" % "+".join(formula_list))
    known_data = array(known_data)
    set_default_mode(NO_CONVERSION)
    data_frame = r.as_data_frame({"p_value": known_data[:, 0],
                                  "recurrence": known_data[:, 1],
                                  "connectivity": known_data[:, 2],
                                  "cluster_size": known_data[:, 3],
                                  "gradient": known_data[:, 4],
                                  "avg_degree": known_data[:, 5],
                                  "unknown_ratio": known_data[:, 6],
                                  "is_correct": r.factor(known_data[:, -1])})  # 03-17-06 watch r.factor
    # 2006-10-30: pass mty through as randomForest's mtry when positive
    if mty > 0:
        fit = r.randomForest(formula, data=data_frame, mtry=mty)
    else:
        fit = r.randomForest(formula, data=data_frame)
    del data_frame
    if self.debug:
        sys.stderr.write("Done fitting randomForest.\n")
    return fit

def verify_Cox(data_name):
    '''Launch Cox in PomeloII, get results, and verify against R.'''
    print '\n\n\n******* Verifying Cox results for data set ' + data_name + '\n'
    coxConnect = flstandalone.NumTesting()
    coxConnect.setUp('http://pomelo2.bioinfo.cnio.es')
    coxConnect.send_get_pomelo(FILES_DIR + data_name + '.covar.txt',
                               FILES_DIR + data_name + '.surv.txt',
                               FILES_DIR + data_name + '.event.txt',
                               'Cox', '2')
    ## r_read and r_compare are so that we can send to R the following
    ## two types of instructions:
    ##   rpy.r('breastPomelo <- readPomeloOutput()')
    ##   rpy.r('comparePomelo(breastPomelo, breast.results)')
    r_read = data_name + 'Pomelo <- readPomeloOutput()'
    r_compare = 'comparePomelo(' + data_name + 'Pomelo, ' + data_name + '.results)'
    rpy.r(r_read)
    out_comparison = rpy.r(r_compare)
    if out_comparison == 'ERROR: test failed':
        raise AsteriasAssertionError

def interpolazionelineare(x, y):
    rpy.set_default_mode(rpy.NO_CONVERSION)  # needed for the R calls below
    linear_model = rpy.r.lm(rpy.r("y ~ x"), data=rpy.r.data_frame(x=x, y=y))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary = rpy.r.summary(linear_model)
    # intercept, intercept error, slope, slope error
    # (row 0 of R's coefficient matrix is the intercept, row 1 the slope)
    risultati = (summary['coefficients'][0][0],
                 summary['coefficients'][0][1],
                 summary['coefficients'][1][0],
                 summary['coefficients'][1][1])
    return risultati

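# Usage sketch (illustrative, not from the original source): fit y = a + b*x
# to two measurement lists and unpack the coefficients with their errors.
#   a, err_a, b, err_b = interpolazionelineare([1.0, 2.0, 3.0], [2.1, 3.9, 6.2])
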
def wavelet_gaussian_denoising(spectrum):
    """Denoise data with pure Gaussian noise using wavelets

    Wrapper around the R packages EbayesThresh and wavethresh.

    Parameters
    ----------
    spectrum : spectrum instance

    Returns
    -------
    Spectrum instance.
    """
    import_rpy()
    rpy.r.library('EbayesThresh')
    rpy.r.library('wavethresh')
    rpy.r['<-']('X', spectrum)
    rpy.r('Xwd <- wd(X, bc="symmetric")')        # wavelet decomposition
    rpy.r('XwdT <- ebayesthresh.wavelet(Xwd)')   # empirical Bayes thresholding
    Xdn = rpy.r('Xdn <- wr(XwdT)')               # wavelet reconstruction
    return Xdn

def anova(values, factor1, factor1name="factor1"):
    """
    python wrapper for a one-way ANOVA in R
    """
    # build a dataframe for R
    dataframe = {}
    dataframe["feature"] = values
    dataframe["factor1"] = factor1
    r.assign("df", dataframe)
    r("df$factor1 <- factor( df$factor1 )")
    # run the model
    results = r("anova( lm( df$feature ~ df$factor1 ) )")
    r("rm( list=ls() )")
    # convert R results to table
    colheads = ["Df", "Sum Sq", "Mean Sq", "F value", "Pr( >F )"]
    rowheads = [factor1name, "error"]
    ndictData = {}
    for rowhead in results.keys():
        for index, name in zip(range(len(rowheads)), rowheads):
            dictName = ndictData.setdefault(name, {})
            dictName[rowhead] = results[rowhead][index]
    # return as zopy table
    return nesteddict2table(ndictData, rowheads, colheads)

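# Usage sketch (illustrative, not from the original source): one response
# vector, one grouping factor, custom factor name for the output table.
#   tbl = anova([3.1, 2.9, 4.2, 4.0, 3.8], ["a", "a", "b", "b", "b"], "treatment")
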
def calibrate(self):
    """
    Performs a calibration based on the available datapoints.
    """
    from rpy import r
    if len(self.pts) < 2:
        return False
    in_x, in_y, in_z = [], [], []
    out_x, out_y, out_z = [], [], []
    # Index all points so they can be fed into the R multiple linear regression
    for in_pt, out_pt in self.pts:
        in_x.append(in_pt[0])
        in_y.append(in_pt[1])
        in_z.append(in_pt[2])
        out_x.append(out_pt[0])
        out_y.append(out_pt[1])
        out_z.append(out_pt[2])
    # Perform the regression analysis, one output axis at a time
    fx = r.lm(r("x ~ a + b + c"),
              data=r.data_frame(a=in_x, b=in_y, c=in_z, x=out_x))["coefficients"]
    fy = r.lm(r("y ~ a + b + c"),
              data=r.data_frame(a=in_x, b=in_y, c=in_z, y=out_y))["coefficients"]
    fz = r.lm(r("z ~ a + b + c"),
              data=r.data_frame(a=in_x, b=in_y, c=in_z, z=out_z))["coefficients"]
    self.fx = fx["(Intercept)"], fx["a"], fx["b"], fx["c"]
    self.fy = fy["(Intercept)"], fy["a"], fy["b"], fy["c"]
    self.fz = fz["(Intercept)"], fz["a"], fz["b"], fz["c"]
    self.calibrated = True
    return True

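# Usage sketch (illustrative, not from the original source): `cal` stands in
# for an instance of the (unnamed here) class owning calibrate(); pts holds
# (raw, reference) coordinate pairs collected beforehand.
#   cal.pts = [((0, 0, 0), (1, 2, 3)), ((1, 0, 0), (2, 2, 3)),
#              ((0, 1, 0), (1, 3, 3)), ((0, 0, 1), (1, 2, 4))]
#   if cal.calibrate():
#       print cal.fx  # (intercept, a, b, c) for the x axis
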
def spss_to_csv(spss_name, csv_name=None):
    """Convert spss file to csv file"""
    if csv_name is None:
        basename = os.path.splitext(spss_name)[0]
        csv_name = basename + ".csv"
    rpy.r("library(foreign)")
    data = rpy.r("read.spss('{0}')".format(spss_name))
    keys = data.keys()
    for key, values in data.items():
        types = set([type(val) for val in values])
        if len(types) != 1:
            sys.exit("ERROR: multiple types for {0}: {1}".format(key, types))
        first = values[0]
        if isinstance(first, float):
            values = [None if math.isnan(val) else val for val in values]
        # NOTE: the next two conditions repeat the float check, so the integer
        # branch below is unreachable; the date branch likely needs a
        # column-specific test instead.
        if isinstance(first, float):
            values = to_dates(values, key)
        elif isinstance(first, float):
            values = [int(val) if val is not None and val.is_integer() else val
                      for val in values]
        data[key] = values
    row = 0
    with open(csv_name, "w") as fobj:
        writer = csv.writer(fobj)
        writer.writerow(keys)
        while True:
            try:
                line = [data[key][row] for key in keys]
                writer.writerow(line)
            except IndexError:
                break
            row += 1

def interpolazionelineare(self, other):
    """x.interpolazionelineare(y) performs the linear interpolation with x on
    the abscissa and y on the ordinate. x and y must both be objects of the
    DatiSperimentali class."""
    rpy.set_default_mode(rpy.NO_CONVERSION)
    linear_model = rpy.r.lm(rpy.r("y ~ x"),
                            data=rpy.r.data_frame(x=self.valori, y=other.valori))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    summary = rpy.r.summary(linear_model)
    # intercept, intercept error, slope, slope error
    risultati = (summary['coefficients'][0][0],
                 summary['coefficients'][0][1],
                 summary['coefficients'][1][0],
                 summary['coefficients'][1][1])
    return risultati

def LinearRegression_lm(ls1, ls2, return_rsqrd):
    intercept = 0  # forced through the origin
    from rpy import r
    d = r.data_frame(x=ls1, y=ls2)
    model = r("y ~ x - 1")  # when not forced through the origin it is r("y ~ x")
    fitted_model = r.lm(model, data=d)
    slope = fitted_model['coefficients']['x']
    #intercept = fitted_model['coefficients']['(Intercept)']
    if return_rsqrd == 'yes':
        from scipy import stats
        rsqrd = math.pow(stats.linregress(ls1, ls2)[2], 2)
        return slope, rsqrd
    else:
        return slope

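# Usage sketch (illustrative, not from the original source): slope of a fit
# forced through the origin, optionally with the r-squared from scipy.
#   slope, rsqrd = LinearRegression_lm([1, 2, 3, 4], [2.1, 3.9, 6.2, 8.1], 'yes')
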
def loess(y, x=None, span=0.2):
    """locally weighted scatterplot smoothing

    Wrapper around the R function loess.

    Parameters
    ----------
    y : array
        signal to smooth
    x : array, optional
        abscissa; defaults to np.arange(len(y))
    span : float
        parameter to control the smoothing

    Returns
    -------
    Spectrum instance.
    """
    import_rpy()
    if x is None:
        x = np.arange(0, len(y))
    rpy.r['<-']('x', x)
    rpy.r['<-']('y', y)
    rpy.r('y.loess <- loess(y ~ x, span = %s, data.frame(x=x, y=y))' % span)
    loess = rpy.r('y.predict <- predict(y.loess, data.frame(x=x))')
    return loess

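# Usage sketch (illustrative, not from the original source): smooth a noisy
# sine with a fairly wide span.
#   xs = np.linspace(0, 10, 200)
#   ys = np.sin(xs) + np.random.normal(0, 0.1, 200)
#   smoothed = loess(ys, xs, span=0.4)
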
def LinearRegression(ls1, ls2, return_rsqrd):
    intercept = 0  # forced through the origin
    from rpy import r
    r.library('MASS')
    k = r.options(warn=-1)  # suppress all warning messages from R
    #print ls1; print ls2
    d = r.data_frame(x=ls1, y=ls2)
    model = r("y ~ x - 1")  # when not forced through the origin it is r("y ~ x")
    # errors seen: rlm failed to converge in 20 steps - maxit=21
    fitted_model = r.rlm(model, data=d)
    slope = fitted_model['coefficients']['x']
    #intercept = fitted_model['coefficients']['(Intercept)']
    if return_rsqrd == 'yes':
        from scipy import stats
        rsqrd = math.pow(stats.linregress(ls1, ls2)[2], 2)
        return slope, rsqrd
    else:
        return slope

def wavelet_poissonian_denoising(spectrum):
    """Denoise data with pure Poissonian noise using wavelets

    Wrapper around the R packages EbayesThresh and wavethresh.

    Parameters
    ----------
    spectrum : spectrum instance

    Returns
    -------
    Spectrum instance.
    """
    import_rpy()
    rpy.r.library('EbayesThresh')
    rpy.r.library('wavethresh')
    rpy.r['<-']('X', spectrum)
    rpy.r('XHF <- hft(X)')                          # variance-stabilizing Haar-Fisz transform
    rpy.r('XHFwd <- wd(XHF, bc="symmetric")')       # wavelet decomposition
    rpy.r('XHFwdT <- ebayesthresh.wavelet(XHFwd)')  # empirical Bayes thresholding
    rpy.r('XHFdn <- wr(XHFwdT)')                    # wavelet reconstruction
    XHFest = rpy.r('XHFest <- hft.inv(XHFdn)')      # invert the Haar-Fisz transform
    return XHFest

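# Usage sketch (illustrative, not from the original source): `counts` stands
# in for a 1-D array of Poisson-noisy data, e.g. a photon-counting spectrum.
#   denoised = wavelet_poissonian_denoising(counts)
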
def _generate_oat_trajectories(r, p, levels, jumps, mins, maxs):
    # Set up the R function and vectors
    oatf = rpy.r("sensitivity:::random.oat")
    levelsr = rpy.r.FloatVector([levels] * p)
    jumpsr = rpy.r.FloatVector([jumps] * p)
    # Create a list to hold the trajectories
    ts = []
    # Generate r trajectories consisting of p + 1 points
    for i in range(r):
        ts.append(io.Trajectory(points=[]))
        tsr = oatf(p, levelsr, jumpsr)
        for point in np.array(tsr):
            # Scale the point and add it to the trajectory
            scaled = [v * (mx - mn) + mn for v, mn, mx in zip(point, mins, maxs)]
            ts[-1].add_point(io.Point(scaled))
    return ts

def fitPoly(xarray, yarray, order):
    r.lm.local_mode(rpy.NO_CONVERSION)
    xl = list(xarray)
    yl = list(yarray)
    modelDef = "y ~ poly(x,%d)" % order
    model = r.lm(r(modelDef), data=r.data_frame(x=xl, y=yl))
    pred = r.predict(model)
    # pred is now a dict with keys from '1' to 'N', where N is the size of xl
    predvals = []
    for i in range(len(xl)):
        predvals.append(pred[str(i + 1)])
    return (xl, predvals)

def smooth_data(data):
    sample_data = data[0]
    window_size = data[1]
    for rep_num in range(sample_data.get_number_of_replicates()):
        for chrom in sample_data.get_chromosome_list():
            met_manager = sample_data.get_manager_of_chrom(chrom)
            pos = []
            m = []
            cov = []
            for methyl_c in met_manager:
                pos.append(methyl_c.position)
                m.append(methyl_c.get_methylrate(rep_num))
                cov.append(methyl_c.get_coverage(rep_num))
            r.warnings()
            r.library("locfit")
            r.assign("pos", pos)
            r.assign("m", m)
            r.assign("cov", cov)
            r.assign("h", window_size)
            r("posm=data.frame(pos,m)")
            # coverage-weighted local regression of methylation rate on position
            r("fit=locfit(m~lp(pos,h=h),data=posm,maxk=1000000,weights=cov)")
            r("pp=preplot(fit,where='data',band='local',newdata=data.frame(pos=pos))")
            fit = r("pp")["fit"]
            xev_list = r("unlist(pp$xev$xev)")
            for i, each in enumerate(xev_list):
                position = int(each[0])
                methyl_c = met_manager.get_methyl_c(position)
                if methyl_c:
                    # clamp the smoothed rate to [0, 1]
                    if 1 <= fit[i]:
                        smoothedrate = 1
                    elif fit[i] <= 0:
                        smoothedrate = 0
                    else:
                        smoothedrate = fit[i]
                    methyl_c.update_methylrate(rep_num, smoothedrate)
                else:
                    sys.stderr.write("methyl_c doesn't exist at %d\n" % position)
                    sys.exit(1)

def read_data(spss_data_file):
    r("library('foreign')")
    r("data = read.spss(file='" + spss_data_file + "',to.data.frame=TRUE)")
    data_sheet = r("data")
    num_columns = len(data_sheet.keys())
    column_labels = []
    data_list = []
    for i in range(1, num_columns + 1):
        column = r("data[" + str(i) + "]")
        label = column.keys()[0]  # only has one key
        column_labels.append(label)
        data_list.append(column[label])
    return data_list, column_labels

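# Usage sketch (illustrative, not from the original source; hypothetical
# file name): load an SPSS sheet into parallel column lists.
#   data_list, column_labels = read_data("survey.sav")
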