def regression(data):
    """Fit the R model ``x ~ y`` and return its coefficients.

    Column 0 of *data* is handed to R as ``x`` (the response in the
    formula) and column 1 as ``y`` (the predictor).  The value returned
    is the ``'coefficients'`` entry of the object produced by ``r.lm``.
    """
    formula = r('x ~ y')
    frame = r.data_frame(x=data[:, 0], y=data[:, 1])
    fit = r.lm(formula, data=frame)
    return fit['coefficients']
def lm(self, l, h):
    """Fit and report one R linear model for every index in ``l..h``.

    For each ``i`` from *l* to *h* (both inclusive) the data frame and the
    model formula are obtained from ``self.mount_reg_params(i)``.  The
    formula is printed, the model is fitted with ``r.lm`` and the
    ``'r.squared'`` entry of its ``r.summary`` is printed.

    Nothing is returned; the results are only written to stdout.
    """
    for i in range(l, h + 1):
        data_frame, data_model = self.mount_reg_params(i)
        print(data_model)

        # The fit is done with conversion switched off so that the value
        # handed to r.summary() below is the unconverted lm result.
        rpy.set_default_mode(rpy.NO_CONVERSION)
        try:
            linear_model = r.lm(r(data_model), data=data_frame)
        finally:
            # Restore the conversion mode even when the fit raises;
            # otherwise every later rpy call in the process would keep
            # running in NO_CONVERSION mode.
            rpy.set_default_mode(rpy.BASIC_CONVERSION)

        print(r.summary(linear_model)['r.squared'])
def calibrate(self):
    """Fit a linear input-to-output mapping from the stored point pairs.

    ``self.pts`` is iterated as ``(in_pt, out_pt)`` pairs and items 0, 1
    and 2 of each point are used.  Every output coordinate is regressed
    on the three input coordinates with R's ``lm``; the intercept and the
    three slopes of each fit are stored as 4-tuples in ``self.fx``,
    ``self.fy`` and ``self.fz``.

    Returns ``False`` (without fitting) when fewer than two pairs are
    stored; otherwise sets ``self.calibrated`` to ``True`` and returns
    ``True``.
    """
    from rpy import r

    if len(self.pts) < 2:
        return False

    # Transpose the point pairs into one column per coordinate so they
    # can be passed to R as data-frame columns.
    sources = ([], [], [])
    targets = ([], [], [])
    for src, dst in self.pts:
        for axis in range(3):
            sources[axis].append(src[axis])
        for axis in range(3):
            targets[axis].append(dst[axis])
    col_a, col_b, col_c = sources
    want_x, want_y, want_z = targets

    def fit(formula, **response):
        # Regress the single response column on the predictors a, b, c.
        model = r(formula)
        frame = r.data_frame(a=col_a, b=col_b, c=col_c, **response)
        return r.lm(model, data=frame)["coefficients"]

    def as_tuple(coef):
        # (intercept, slope_a, slope_b, slope_c)
        return tuple(coef[term] for term in ("(Intercept)", "a", "b", "c"))

    # Run all three regressions before touching any attribute.
    coef_x = fit("x ~ a + b + c", x=want_x)
    coef_y = fit("y ~ a + b + c", y=want_y)
    coef_z = fit("z ~ a + b + c", z=want_z)

    self.fx = as_tuple(coef_x)
    self.fy = as_tuple(coef_y)
    self.fz = as_tuple(coef_z)
    self.calibrated = True
    return True
def funcion(dato, variable, caso, opciones):
    """Run a simple linear regression between two queried variables.

    ``variable[0]`` and ``variable[1]`` are each looked up with
    ``dato.query(name, caso)``; the first result becomes ``x`` and the
    second ``y`` of the R model ``y ~ x``.

    Returns a dict with the keys ``"resultado"`` (the ``r.lm`` result),
    ``"sumario"`` (``r.summary_lm`` of it) and ``"anova"``
    (``r.anova_lm`` of it).  *opciones* is accepted but not used here.
    """
    # TODO (translated from the original note): rename "cosa" to "caso".
    from rpy import r  # pylint: disable=import-error

    x_name = variable[0]
    y_name = variable[1]
    x_values = dato.query(x_name, caso)
    y_values = dato.query(y_name, caso)

    formula = r("y ~ x")
    fitted = r.lm(formula, data=r.data_frame(x=x_values, y=y_values))

    return {
        "resultado": fitted,
        "sumario": r.summary_lm(fitted, True),
        "anova": r.anova_lm(fitted),
    }
def LinearRegression_lm(ls1, ls2, return_rsqrd):
    """Return the slope of the R model ``y ~ x - 1`` for two sequences.

    *ls1* is passed to R as ``x`` and *ls2* as ``y``.  The ``- 1`` term
    forces the fit through the origin (use ``"y ~ x"`` to fit an
    intercept as well).

    When *return_rsqrd* equals the string ``'yes'`` a ``(slope, rsqrd)``
    tuple is returned, where ``rsqrd`` is the square of item 2 of
    ``scipy.stats.linregress(ls1, ls2)``; for any other value only the
    slope is returned.
    """
    from rpy import r

    frame = r.data_frame(x=ls1, y=ls2)
    formula = r("y ~ x - 1")
    slope = r.lm(formula, data=frame)['coefficients']['x']

    if return_rsqrd != 'yes':
        return slope

    # NOTE(review): the r-squared comes from a separate scipy fit, not
    # from the through-origin R model above - confirm this is intended.
    from scipy import stats
    correlation = stats.linregress(ls1, ls2)[2]
    return slope, math.pow(correlation, 2)
def lm(self, Y, X):
    """Fit ``Y ~ X1 + X2 + ...`` with R's ``lm`` and return the coefficients.

    *Y* and every element of *X* must expose ``.name`` and ``.data``.
    The names are used as the terms of the model formula and the data
    are passed to R keyed by those names.
    """
    # here's where we would handle factors
    observations = dict((term.name, term.data) for term in X)

    # Wilkinson-Rogers notation; built before the response is added so
    # that only the predictors end up on the right-hand side.
    right_hand_side = " + ".join(observations)
    wr_model = "%s ~ %s" % (Y.name, right_hand_side)

    # the response travels in the same mapping as the predictors
    observations[Y.name] = Y.data

    fitted = r.lm(r(wr_model), data=observations)
    return fitted['coefficients']
def fitPoly(xarray, yarray, order):
    """Fit a polynomial of degree *order* in R and return the fitted values.

    The inputs are copied into lists, the model ``y ~ poly(x, order)`` is
    fitted with ``r.lm`` (whose local conversion mode is set to
    ``NO_CONVERSION`` first) and ``r.predict`` is evaluated on the model.

    Returns ``(xl, predvals)``: the x values as a list and the
    predictions in the same order.
    """
    r.lm.local_mode(rpy.NO_CONVERSION)

    xl = list(xarray)
    yl = list(yarray)

    formula_text = "y ~ poly(x,%d)" % order
    fitted = r.lm(r(formula_text), data=r.data_frame(x=xl, y=yl))

    # The prediction is indexed by the strings '1' .. 'N', N = len(xl).
    by_label = r.predict(fitted)
    predvals = [by_label[str(position)]
                for position in range(1, len(xl) + 1)]
    return (xl, predvals)
def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3): times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output. SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"] for tag in tag_list: self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj. data = rc.list(y=self.y[tag],X=self.X[tag]) model = "y~X-1" # Use -1 because X has an intercept already if self.regtype=="Independent Linear": try: result = rc.lm(model,data=data) except: pdb.set_trace() elif self.regtype=="Independent Logistic": result = rc.glm(model,family=rc.binomial("logit"),data=data) rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode. summary = rc.summary(result,correlation=rc.TRUE) self._record_regression_stats(tag, summary) beta_dict = dict() sorted_sources = self.sorted_sources[tag] coeff_matrix = summary["coefficients"] for i in range(len(sorted_sources)): try: cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:])) except IndexError: util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag) if remove_tags_when_bad_regression: self._remove_tag(tag) break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features.... continue try: cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10) except OverflowError: pass beta_dict[sorted_sources[i]] = cur_source_dict if tag not in self.features: # We've removed this tag a few lines above, so skip it. continue self.beta[tag] = beta_dict if times_showed_summary < n_times_show_summary: self._print_regression_summary(tag, summary) times_showed_summary += 1
# Example script: fit the statsmodels "longley" and "star98" datasets
# with R (through rpy) and print / collect the estimated coefficients.
from __future__ import print_function

from statsmodels.compat.python import iterkeys
from rpy import r
import numpy as np
import statsmodels.api as sm

# Which of the examples below are executed.
examples = [1, 2]

if 1 in examples:
    # Example 1: linear model (R lm) on the longley data.
    data = sm.datasets.longley.load(as_pandas=False)
    # The constant is added as the last column (prepend=False).
    y, x = data.endog, sm.add_constant(data.exog, prepend=False)
    # One formula term per column of x: x.1, x.2, ...
    # NOTE(review): presumably these are the names R gives to the columns
    # of the matrix passed as `x` - confirm.
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    # "-1" drops R's own intercept; x already carries a constant column.
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print(list(iterkeys(results)))
    print(results['coefficients'])

if 2 in examples:
    # Example 2: binomial GLM (R glm) on the star98 data.
    data2 = sm.datasets.star98.load(as_pandas=False)
    y2, x2 = data2.endog, sm.add_constant(data2.exog, prepend=False)
    import rpy
    # Turn the multi-column endog into the share of column 0 in the row total.
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    # Coefficients ordered by their (sorted) term names.
    params_est = [
        results2['coefficients'][k] for k in sorted(results2['coefficients'])
    ]
'''
# NOTE(review): the triple quote above presumably closes a docstring that
# was opened before this chunk - confirm against the full file.
#
# Example script: fit the statsmodels "longley" and "star98" datasets
# with R (through rpy) and print the estimated coefficients.
from rpy import r
import numpy as np
import scikits.statsmodels.api as sm

# Which of the examples below are executed.
examples = [1, 2]

if 1 in examples:
    # Example 1: linear model (R lm) on the longley data.
    data = sm.datasets.longley.load()
    y,x = data.endog, sm.add_constant(data.exog)
    # One formula term per column of x: x.1, x.2, ...
    # NOTE(review): presumably these are the names R gives to the columns
    # of the matrix passed as `x` - confirm.
    des_cols = ['x.%d' % (i+1) for i in range(x.shape[1])]
    # "-1" drops R's own intercept from the model.
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print results.keys()
    print results['coefficients']

if 2 in examples:
    # Example 2: binomial GLM (R glm) on the star98 data.
    data2 = sm.datasets.star98.load()
    y2,x2 = data2.endog, sm.add_constant(data2.exog)
    import rpy
    # Turn the multi-column endog into the share of column 0 in the row total.
    y2 = y2[:,0]/y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i+1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    # Coefficients ordered by their (sorted) term names.
    params_est = [results2['coefficients'][k] for k in sorted(results2['coefficients'])]
    print params_est
for id in py_id: x1 = poly_x_vals[i,0] x2 = poly_x_vals[i,1] y1 = poly_y_vals[i,0] y2 = poly_y_vals[i,1] xy = poly_xy_vals[i] if poly_values: poly_values = poly_values + "," poly_values += "(%s, %f, %f, %f, %f, %f)" % (id, x1, x2, y1, y2, xy) i = i+1 query = query + poly_values # print query c.execute(query) model = r.lm(r("delta ~ poly(x, 2) + poly(y, 2) + poly(x*y, 1)"), data=r.data_frame(x=py_x, y=py_y, delta=py_delta), weights=py_wt) model_summary = r.summary(model) model_coeff = array(model_summary['coefficients']) if not model_coeff.shape == (6,4): print "Bad model for %s" % exp continue c0 = model_coeff[0][0] c0_sigma = model_coeff[0][1] cx1 = model_coeff[1][0] cx1_sigma = model_coeff[1][1] cx2 = model_coeff[2][0] cx2_sigma = model_coeff[2][1] cy1 = model_coeff[3][0] cy1_sigma = model_coeff[3][1] cy2 = model_coeff[4][0]
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    A tab-separated table is read from stdin: lines starting with ``#``
    are skipped, the first remaining line supplies the column headers
    (its first field is dropped) and every further line is one data row
    whose first field is the taxon name.  The table and each tree from
    ``--tree-nh-file`` are handed to the phylip ``contrast`` program and,
    for every requested ``--method``, a table is written to
    ``options.stdout``:

    ``pearson`` / ``spearman`` (handled identically)
        one row per column pair with phylip's correlation and the p-value
        of an R regression through the origin on the contrasts.
    ``contrasts``
        the contrasts returned by phylip.
    ``compute``
        contrasts computed in this function by a post-order traversal of
        the tree.

    Raises ValueError if no tree could be read.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this
    function - confirm whether ``E.Start`` picks it up some other way.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--tree-nh-file", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # convert the 1-based column numbers to 0-based indices
        options.columns = map(lambda x: int(x) - 1,
                              options.columns.split(","))

    phylip = WrapperPhylip.Phylip()
    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            # header line: the first field labels the taxon column
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # transpose the contrasts: one list of values per column
                columns = []
                for c in range(ncolumns):
                    columns.append([row[c] for row in result.mContrasts])

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not
                        # possible to use cor.test or lsfit directly,
                        # as you have to perform a regression through
                        # the origin.

                        # To check pearson r against phylip's value use
                        # calculateCorrelationCoefficient(columns[x],
                        #                                 columns[y])

                        # for significance, use linear regression models
                        # in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        try:
                            linear_model = R.lm(
                                R("y ~ x - 1"),
                                data=R.data_frame(x=columns[x],
                                                  y=columns[y]))
                        finally:
                            # restore the conversion mode even if the
                            # fit fails, so later rpy calls are not left
                            # in NO_CONVERSION mode
                            rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write(
                        "\t".join(
                            [options.value_format % v for v in d]) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2

                variances = [None] * max_index
                # One slot per data column for every node.  These rows
                # are indexed with range(ncolumns) below, so they must
                # be sized by ncolumns (sizing them by nrows fails when
                # there are more columns than rows).
                values = [[None] * ncolumns for _ in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):
                    """store value, branch length and contrast of
                    *node_id* computed from its children *c1* and *c2*."""
                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    # NOTE(review): this is sqrt(b1 + b2); it is what
                    # gets reported in the "variance" output column.
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(
                                node_id, node.data.branchlength, c1, c2)
                        else:
                            # only the root may have three children; the
                            # third one is paired with the root via the
                            # dummy node at max_index - 1
                            assert(node_id == tree.root)
                            assert(len(node.succ) == 3)
                            update_data(
                                node_id, node.data.branchlength,
                                node.succ[0], node.succ[1])
                            update_data(
                                max_index - 1, node.data.branchlength,
                                node_id, node.succ[2])
                    else:
                        # leaf: take the values from the input table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write(
                    "node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    # nodes that never went through update_data (the
                    # leaves) have no contrast
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            [options.value_format % v
                             for v in contrasts[node_id]]),
                    ))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    A tab-separated table is read from stdin: lines starting with ``#``
    are skipped, the first remaining line supplies the column headers
    (its first field is dropped) and every further line is one data row
    whose first field is the taxon name.  The table and each tree from
    ``--filename-tree`` are handed to the phylip ``contrast`` program
    and, for every requested ``--method``, a table is written to
    ``options.stdout``:

    ``pearson`` / ``spearman`` (handled identically)
        one row per column pair with phylip's correlation and the p-value
        of an R regression through the origin on the contrasts.
    ``contrasts``
        the contrasts returned by phylip.
    ``compute``
        contrasts computed in this function by a post-order traversal of
        the tree.

    Raises ValueError if no tree could be read.

    NOTE(review): *argv* is not forwarded to ``E.Start`` in this
    function - confirm whether ``E.Start`` picks it up some other way.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2phylocontrasts.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")
    parser.add_option("-t", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")
    parser.add_option("--skip-header", dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")
    parser.add_option("--write-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")
    parser.add_option("--debug", dest="debug", action="store_true",
                      help="debug mode")
    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        # convert the 1-based column numbers to 0-based indices
        options.columns = map(lambda x: int(x) - 1,
                              options.columns.split(","))

    phylip = WrapperPhylip.Phylip()
    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            # header line: the first field labels the taxon column
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branchlenghts")

    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):

                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # transpose the contrasts: one list of values per column
                columns = []
                for c in range(ncolumns):
                    columns.append([row[c] for row in result.mContrasts])

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not possible
                        # to use cor.test or lsfit directly, as you have
                        # to perform a regression through the origin.

                        # To check pearson r against phylip's value use
                        # calculateCorrelationCoefficient(columns[x],
                        #                                 columns[y])

                        # for significance, use linear regression models
                        # in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        try:
                            linear_model = R.lm(
                                R("y ~ x - 1"),
                                data=R.data_frame(x=columns[x],
                                                  y=columns[y]))
                        finally:
                            # restore the conversion mode even if the
                            # fit fails, so later rpy calls are not left
                            # in NO_CONVERSION mode
                            rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p,
                             code)) + "\n")

            elif method == "contrasts":

                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    # Rows end with a bare newline; a trailing blank
                    # after it would prefix every following row with a
                    # space and corrupt its first field.
                    options.stdout.write(
                        "\t".join(
                            [options.value_format % v for v in d]) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2

                variances = [None] * max_index
                # One slot per data column for every node.  These rows
                # are indexed with range(ncolumns) below, so they must
                # be sized by ncolumns (sizing them by nrows fails when
                # there are more columns than rows).
                values = [[None] * ncolumns for _ in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):
                    """store value, branch length and contrast of
                    *node_id* computed from its children *c1* and *c2*."""
                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    # NOTE(review): this is sqrt(b1 + b2); it is what
                    # gets reported in the "variance" output column.
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (
                            (rb1 * v1 + rb2 * v2)) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(
                                node_id, node.data.branchlength, c1, c2)
                        else:
                            # only the root may have three children; the
                            # third one is paired with the root via the
                            # dummy node at max_index - 1
                            assert (node_id == tree.root)
                            assert (len(node.succ) == 3)
                            update_data(
                                node_id, node.data.branchlength,
                                node.succ[0], node.succ[1])
                            update_data(
                                max_index - 1, node.data.branchlength,
                                node_id, node.succ[2])
                    else:
                        # leaf: take the values from the input table
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])

                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write(
                    "node_id\tvariance\t%s\n" % "\t".join(headers))
                for node_id in range(max_index):
                    # nodes that never went through update_data (the
                    # leaves) have no contrast
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(
                            [options.value_format % v
                             for v in contrasts[node_id]]),
                    ))

    E.Stop()
# phylip value phy_r = result.mCorrelations[x][y] import rpy from rpy import r as R ## Various ways to calculate r. It is not possible to use ## cor.test or lsfit directly, as you have to perform a ## regression through the origin. ## uncomment to check pearson r against phylip's value ## r = calculateCorrelationCoefficient( columns[x], columns[y] ) ## for significance, use linear regression models in R rpy.set_default_mode(rpy.NO_CONVERSION) linear_model = R.lm(R("y ~ x - 1"), data = R.data_frame(x=columns[x], y=columns[y])) rpy.set_default_mode(rpy.BASIC_CONVERSION) ss = R.summary(linear_model) ## extract the p-value p = ss['coefficients'][-1][-1] if p < 0.001: code = "***" elif p < 0.01: code = "**" elif p < 0.05: code = "*" else: code = ""