def lm(self, l, h):
    for i in range(l, h + 1):
        data_frame, data_model = self.mount_reg_params(i)
        print data_model
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = r.lm(r(data_model), data=data_frame)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print r.summary(linear_model)['r.squared']

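# The method above shows the rpy 1.x calling convention that recurs throughout
# this file: switch to NO_CONVERSION so r.lm() hands back an R object that
# r.summary() can consume, then switch back to BASIC_CONVERSION so the summary
# arrives as plain Python dicts and arrays. A minimal standalone sketch of the
# same dance (the toy data is illustrative):
import rpy
from rpy import r

rpy.set_default_mode(rpy.NO_CONVERSION)
toy_fit = r.lm(r("y ~ x"),
               data=r.data_frame(x=[1, 2, 3, 4], y=[2.1, 3.9, 6.2, 7.8]))
rpy.set_default_mode(rpy.BASIC_CONVERSION)
print r.summary(toy_fit)['r.squared']
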
def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression,
                                    n_times_show_summary=3):
    # Print summary statistics for only a few tags so we don't produce an
    # overwhelming amount of output.
    times_showed_summary = 0
    SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
    for tag in tag_list:
        self._progress("Computing betas for tag %s." % tag, newline=True)  # rmme: make newline False
        rpy.set_default_mode(rpy.NO_CONVERSION)  # Turn off conversion so that lm returns an Robj.
        data = rc.list(y=self.y[tag], X=self.X[tag])
        model = "y~X-1"  # Use -1 because X has an intercept already.
        if self.regtype == "Independent Linear":
            try:
                result = rc.lm(model, data=data)
            except:
                pdb.set_trace()
        elif self.regtype == "Independent Logistic":
            result = rc.glm(model, family=rc.binomial("logit"), data=data)
        rpy.set_default_mode(rpy.BASIC_CONVERSION)  # Return to normal conversion mode.
        summary = rc.summary(result, correlation=rc.TRUE)
        self._record_regression_stats(tag, summary)
        beta_dict = dict()
        sorted_sources = self.sorted_sources[tag]
        coeff_matrix = summary["coefficients"]
        for i in range(len(sorted_sources)):
            try:
                cur_source_dict = dict(zip(SUMMARY_STATS, coeff_matrix[i, :]))
            except IndexError:
                util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                if remove_tags_when_bad_regression:
                    self._remove_tag(tag)
                    break  # Break out of the loop over sorted_sources; the per-tag
                           # loop is left later, when we check whether tag is
                           # still in self.features.
                continue
            try:
                cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
            except OverflowError:
                pass
            beta_dict[sorted_sources[i]] = cur_source_dict
        if tag not in self.features:
            # We removed this tag a few lines above, so skip it.
            continue
        self.beta[tag] = beta_dict
        if times_showed_summary < n_times_show_summary:
            self._print_regression_summary(tag, summary)
            times_showed_summary += 1

def __init__(self, y, design, model_type=r.lm, **kwds):
    """ Set up and estimate R model with data and design """
    r.library("MASS")  # still needs to be in the tests, but also here so the
                       # model-type checks at the end do not raise an error
    self.y = np.array(y)
    self.design = np.array(design)
    self.model_type = model_type
    self._design_cols = ["x.%d" % (i + 1)
                         for i in range(self.design.shape[1])]
    # Note the '-1' for no intercept - this is included in the design
    self.formula = r("y ~ %s-1" % "+".join(self._design_cols))
    self.frame = r.data_frame(y=y, x=self.design)
    rpy.set_default_mode(rpy.NO_CONVERSION)
    results = self.model_type(self.formula, data=self.frame, **kwds)
    self.robj = results  # keep the Robj model so it can be used in the tests
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    rsum = r.summary(results)
    self.rsum = rsum
    # Provide compatible interface with scipy models
    self.results = results.as_py()
    # coeffs = self.results['coefficients']
    # self.beta0 = np.array([coeffs[c] for c in self._design_cols])
    self.nobs = len(self.results["residuals"])
    if isinstance(self.results["residuals"], dict):
        self.resid = np.zeros(len(self.results["residuals"]))
        for i in self.results["residuals"]:
            self.resid[int(i) - 1] = self.results["residuals"][i]
    else:
        self.resid = self.results["residuals"]
    self.fittedvalues = self.results["fitted.values"]
    self.df_resid = self.results["df.residual"]
    self.params = rsum["coefficients"][:, 0]
    self.bse = rsum["coefficients"][:, 1]
    self.bt = rsum["coefficients"][:, 2]
    try:
        self.pvalues = rsum["coefficients"][:, 3]
    except IndexError:  # some summaries lack the p-value column
        pass
    self.rsquared = rsum.setdefault("r.squared", None)
    self.rsquared_adj = rsum.setdefault("adj.r.squared", None)
    self.aic_R = rsum.setdefault("aic", None)
    self.fvalue = rsum.setdefault("fstatistic", None)
    if self.fvalue and isinstance(self.fvalue, dict):
        self.fvalue = self.fvalue.setdefault("value", None)  # for wls
    df = rsum.setdefault("df", None)
    if df:  # for RLM, works for other models?
        self.df_model = df[0] - 1  # R counts intercept
        self.df_resid = df[1]
    self.bcov_unscaled = rsum.setdefault("cov.unscaled", None)
    self.bcov = rsum.setdefault("cov.scaled", None)
    if "sigma" in rsum:
        self.scale = rsum["sigma"]
    elif "dispersion" in rsum:
        self.scale = rsum["dispersion"]
    else:
        self.scale = None
    self.llf = r.logLik(results)
    if model_type == r.glm:
        self.getglm()
    if model_type == r.rlm:
        self.getrlm()

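# The column slicing above relies on R's standard summary coefficient matrix
# layout: Estimate, Std. Error, t value, Pr(>|t|). A self-contained numpy
# sketch of the same slicing (the numbers are made up for illustration):
import numpy as np

coefficients = np.array([[1.20, 0.30, 4.00, 0.0002],   # x.1
                         [0.75, 0.25, 3.00, 0.0050]])  # x.2
params, bse, tvals, pvals = (coefficients[:, j] for j in range(4))
print params, pvals
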
        x1 = poly_x_vals[i, 0]
        x2 = poly_x_vals[i, 1]
        y1 = poly_y_vals[i, 0]
        y2 = poly_y_vals[i, 1]
        xy = poly_xy_vals[i]
        if poly_values:
            poly_values = poly_values + ","
        poly_values += "(%s, %f, %f, %f, %f, %f)" % (id, x1, x2, y1, y2, xy)
        i = i + 1
    query = query + poly_values
    # print query
    c.execute(query)
    model = r.lm(r("delta ~ poly(x, 2) + poly(y, 2) + poly(x*y, 1)"),
                 data=r.data_frame(x=py_x, y=py_y, delta=py_delta),
                 weights=py_wt)
    model_summary = r.summary(model)
    model_coeff = array(model_summary['coefficients'])
    # Expect 6 coefficients (intercept + 2 + 2 + 1 polynomial terms), each
    # with the 4 summary columns (estimate, stderr, t, p).
    if not model_coeff.shape == (6, 4):
        print "Bad model for %s" % exp
        continue
    c0 = model_coeff[0][0]
    c0_sigma = model_coeff[0][1]
    cx1 = model_coeff[1][0]
    cx1_sigma = model_coeff[1][1]
    cx2 = model_coeff[2][0]
    cx2_sigma = model_coeff[2][1]
    cy1 = model_coeff[3][0]
    cy1_sigma = model_coeff[3][1]
    cy2 = model_coeff[4][0]
    cy2_sigma = model_coeff[4][1]

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--columns", dest="columns", type="string",
                      help="columns to take for calculating histograms.")

    parser.add_option("-t", "--tree-nh-file", dest="filename_tree",
                      type="string",
                      help="filename with tree(s).")

    parser.add_option("--skip-header", dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")

    parser.add_option("--debug", dest="debug",
                      action="store_true",
                      help="debug mode")

    parser.add_option("--display-tree", dest="display_tree",
                      action="store_true",
                      help="display the tree")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("contrasts", "spearman", "pearson", "compute"),
                      help="methods to perform on contrasts.")

    parser.set_defaults(
        columns="all",
        filename_tree=None,
        add_header=True,
        write_header=False,
        debug=False,
        methods=[],
        value_format="%6.4f",
        pvalue_format="%e",
        display_tree=False,
    )

    (options, args) = E.Start(parser, quiet=True)

    if options.columns not in ("all", "all-but-first"):
        options.columns = map(lambda x: int(x) - 1, options.columns.split(","))

    phylip = WrapperPhylip.Phylip()

    if options.debug:
        phylip.setLogLevel(options.loglevel)

    phylip.setProgram("contrast")

    ##########################################################
    # retrieve data and give to phylip
    data = []
    headers = []
    first = True
    for line in sys.stdin:
        if line[0] == "#":
            continue
        d = line[:-1].strip().split("\t")
        if first:
            first = False
            headers = d[1:]
            continue
        data.append(d)

    phylip.setData(data)
    ncolumns = len(headers)
    nrows = len(data)

    ##########################################################
    # read trees
    nexus = None
    if options.filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.filename_tree, "r"))

    if not nexus:
        raise ValueError("please provide trees with branch lengths")

    ##########################################################
    # set up phylip
    phylip_options = []
    # print out contrasts
    phylip_options.append("C")
    phylip_options.append("Y")
    phylip.setOptions(phylip_options)

    ##########################################################
    # main loop
    for tree in nexus.trees:

        if options.display_tree:
            tree.display()

        # compute this before giving the tree to the phylip module,
        # as it remaps taxon names.
        map_node2data = {}
        for x in range(nrows):
            taxon = data[x][0]
            map_node2data[tree.search_taxon(taxon)] = x

        phylip.setTree(tree)

        result = phylip.run()

        for method in options.methods:

            if method in ("pearson", "spearman"):
                options.stdout.write("header1\theader2\tr\tp\tcode\n")

                # n = len(result.mContrasts)
                columns = []
                for c in range(ncolumns):
                    columns.append(map(lambda x: x[c], result.mContrasts))

                for x in range(0, ncolumns - 1):
                    for y in range(x + 1, ncolumns):

                        # phylip value
                        phy_r = result.mCorrelations[x][y]

                        import rpy
                        from rpy import r as R

                        # Various ways to calculate r. It is not possible to
                        # use cor.test or lsfit directly, as you have to
                        # perform a regression through the origin.

                        # uncomment to check pearson r against phylip's value:
                        # r = calculateCorrelationCoefficient(columns[x], columns[y])

                        # for significance, use linear regression models in R
                        rpy.set_default_mode(rpy.NO_CONVERSION)
                        linear_model = R.lm(
                            R("y ~ x - 1"),
                            data=R.data_frame(x=columns[x], y=columns[y]))
                        rpy.set_default_mode(rpy.BASIC_CONVERSION)

                        ss = R.summary(linear_model)

                        # extract the p-value
                        p = ss['coefficients'][-1][-1]

                        if p < 0.001:
                            code = "***"
                        elif p < 0.01:
                            code = "**"
                        elif p < 0.05:
                            code = "*"
                        else:
                            code = ""

                        options.stdout.write("\t".join(
                            (headers[x], headers[y],
                             options.value_format % phy_r,
                             options.pvalue_format % p, code)) + "\n")

            elif method == "contrasts":
                options.stdout.write("\t".join(headers) + "\n")
                for d in result.mContrasts:
                    options.stdout.write("\t".join(
                        map(lambda x: options.value_format % x, d)) + "\n")

            elif method == "compute":

                # make room for all internal nodes and one dummy node
                # for unrooted trees.
                max_index = TreeTools.GetMaxIndex(tree) + 2
                variances = [None] * max_index
                values = [[None] * nrows for x in range(max_index)]
                contrasts = []
                for x in range(max_index):
                    contrasts.append([None] * ncolumns)
                branchlengths = [None] * max_index

                def update_data(node_id, bl, c1, c2):

                    b1, b2 = branchlengths[c1], branchlengths[c2]
                    rb1 = 1.0 / b1
                    rb2 = 1.0 / b2
                    # compute variance
                    variance = math.sqrt(b1 + b2)

                    # extend branch length of this node to create correct
                    # variance for parent
                    branchlengths[node_id] = bl + (b1 * b2) / (b1 + b2)
                    variances[node_id] = variance

                    for c in range(ncolumns):
                        v1, v2 = values[c1][c], values[c2][c]
                        # save ancestral value as weighted mean
                        values[node_id][c] = (rb1 * v1 + rb2 * v2) / (rb1 + rb2)
                        # compute normalized contrast
                        contrasts[node_id][c] = (v1 - v2) / variance

                def update_contrasts(node_id):
                    """update contrasts for a node."""
                    node = tree.node(node_id)
                    if node.succ:
                        if len(node.succ) == 2:
                            c1, c2 = node.succ
                            update_data(node_id, node.data.branchlength, c1, c2)
                        else:
                            assert node_id == tree.root
                            assert len(node.succ) == 3
                            update_data(node_id, node.data.branchlength,
                                        node.succ[0], node.succ[1])
                            update_data(max_index - 1, node.data.branchlength,
                                        node_id, node.succ[2])
                    else:
                        for c in range(ncolumns):
                            values[node_id][c] = float(
                                data[map_node2data[node_id]][c + 1])
                        branchlengths[node_id] = node.data.branchlength

                tree.dfs(tree.root, post_function=update_contrasts)

                options.stdout.write("node_id\tvariance\t%s\n" %
                                     "\t".join(headers))
                for node_id in range(max_index):
                    if variances[node_id] is None:
                        continue
                    options.stdout.write("%s\t%s\t%s\n" % (
                        node_id,
                        options.value_format % variances[node_id],
                        "\t".join(map(lambda x: options.value_format % x,
                                      contrasts[node_id])),
                    ))

    E.Stop()

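# The star codes written in the pearson/spearman branch above follow R's usual
# significance legend. The same thresholds as a tiny standalone helper:
def significance_code(p):
    if p < 0.001:
        return "***"
    elif p < 0.01:
        return "**"
    elif p < 0.05:
        return "*"
    return ""
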
x = np.asarray(xr)
ggmod = Garch(x - x.mean())
ggmod.nar = 1
ggmod.nma = 1
ggmod._start_params = np.array([-0.6, 0.1, 0.2, 0.0])
ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
print 'ggres.params', ggres.params

g11res = optimize.fmin(lambda params: -loglike_GARCH11(params, x - x.mean())[0],
                       [0.6, 0.6, 0.2])
print g11res
llf = loglike_GARCH11(g11res, x - x.mean())
print llf[0]

garchplot(ggmod.errorsest, ggmod.h, title='Garch estimated')

fit = r.garchFit(f, data=x - x.mean(), include_mean=False, trace=False)
print r.summary(fit)

'''based on R default simulation
model = list(omega = 1e-06, alpha = 0.1, beta = 0.8)
nobs = 1000
(with nobs=500, gjrgarch doesn't do well)

>>> ggres = ggmod.fit(start_params=np.array([-0.6, 0.1, 0.2, 0.0]), maxiter=1000)
Optimization terminated successfully.
         Current function value: -448.861335
         Iterations: 385
         Function evaluations: 690
>>> print 'ggres.params', ggres.params
ggres.params [ -7.75090330e-01   1.57714749e-01  -9.60223930e-02   8.76021411e-07]
rearranged
8.76021411e-07  1.57714749e-01 (-9.60223930e-02)  7.75090330e-01
'''

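# loglike_GARCH11 is not defined in this file. The sketch below is a minimal
# guess at what such a function computes, assuming a Gaussian GARCH(1,1) with
# parameter order (omega, alpha, beta) and that the first return value is the
# log-likelihood, as the call sites above suggest; the real function may
# differ in both respects.
import numpy as np

def loglike_garch11_sketch(params, eps):
    omega, alpha, beta = params
    h = np.empty_like(eps, dtype=float)
    h[0] = eps.var()  # one common choice of starting variance
    for t in range(1, len(eps)):
        # conditional variance recursion of a GARCH(1,1) process
        h[t] = omega + alpha * eps[t - 1] ** 2 + beta * h[t - 1]
    # Gaussian log-likelihood of the residuals given h
    llf = -0.5 * np.sum(np.log(2 * np.pi * h) + eps ** 2 / h)
    return llf, h
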
def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
    """
    02-28-05
        linear model fitting here

    03-08-05
        grouping and accumulating before doing linear model fitting, see log of
        2005, section 'linear model overfitting' for detail.
    03-27-05
        use glm of R to do logistic regression
    06-30-05
        add cluster_size
        add bit_string to control which parameter should be enabled
    07-04-05
        add connectivity_2nd
    07-06-05
        add logistic
    11-09-05
        extend coeff_list and coeff_p_value_list
        restructure the list, go_no2lm_results[go_no]

    --data_prepare
    --submit
    """
    sys.stderr.write("Linear Model Fitting...\n")
    go_no2lm_results = {}

    # 06-30-05 set up the formula_list based on bit_string
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size',
                       'connectivity_2nd']
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            formula_list.append(coeff_name_list[i])

    for (go_no, data) in go_no2prediction_space.iteritems():
        sys.stderr.write("%s prediction entries from %s.\n" % (len(data), go_no))
        # 11-09-05 extend coeff_list and coeff_p_value_list
        coeff_list = [0] * 7  # intercept, p_value, recurrence, connectivity, cluster_size
        coeff_p_value_list = [1] * 7
        index = 0  # 06-30-05 the pointer for summary_stat
        if len(data) <= 50:  # too few data
            continue
        # convert it to a 2d array
        data = array(data)
        """
        data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))" % (
            repr(list(data[:, 0]))[1:-1], repr(list(data[:, 1]))[1:-1],
            repr(list(data[:, 2]))[1:-1], repr(list(data[:, 3]))[1:-1]))
        lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)")
        significance_dict = r("summary(lm_result)")
        print significance_dict['coefficients']
        """
        set_default_mode(NO_CONVERSION)  # 04-07-05
        data_frame = r.as_data_frame({"p_value": data[:, 0], "recurrence": data[:, 1],
                                      "connectivity": data[:, 2], "cluster_size": data[:, 3],
                                      "connectivity_2nd": data[:, 4],
                                      "is_correct": data[:, -1]})  # 06-30-05 -1 denotes is_correct
        if self.logistic:
            lm_result = r.glm(r("is_correct~%s" % '+'.join(formula_list)),
                              data=data_frame, family=r("binomial"))
        else:
            lm_result = r.glm(r("is_correct~%s" % '+'.join(formula_list)),
                              data=data_frame)  # 06-30-05 use formula_list
        set_default_mode(BASIC_CONVERSION)  # 04-07-05
        # 04-07-05 r.summary() requires lm_result in NO_CONVERSION state
        summary_stat = r.summary(lm_result)
        if self.debug:
            print "everything about coefficients from function", go_no, "is"
            print summary_stat['coefficients']  # p-values of coefficients
        """
        # 04-07-05 convert to python dictionary form
        lm_result = lm_result.as_py()
        coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"],
                      lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"],
                      lm_result["coefficients"]["cluster_size"],
                      summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],
                      summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],
                      summary_stat['coefficients'][4][-1], 1]
        # the last entry is score_cut_off, replaced later in get_score_cut_off()
        # 06-30-05 add corresponding p-values
        """
        # 06-30-05 row 0 of summary_stat['coefficients'] is the intercept
        coeff_list[0] = summary_stat['coefficients'][0][0]  # 0 is the coefficient
        coeff_p_value_list[0] = summary_stat['coefficients'][0][-1]  # -1 is the corresponding p-value
        # 06-30-05 fill in the other coefficients based on bit_string, NOTE i+1
        for i in range(len(bit_string)):
            if bit_string[i] == '1':
                index += 1
                coeff_list[i + 1] = summary_stat['coefficients'][index][0]  # 0 is the coefficient
                coeff_p_value_list[i + 1] = summary_stat['coefficients'][index][-1]  # -1 is the corresponding p-value
        # 11-09-05 restructure the following list
        go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]
        # the last entry is score_cut_off, replaced later in get_score_cut_off()
    sys.stderr.write("done.\n")
    return go_no2lm_results

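# A self-contained sketch of the bit_string mechanism described in the
# docstring above: each '1' enables the coefficient name at the same index,
# and the enabled names are joined into the R formula passed to r.glm().
coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size',
                   'connectivity_2nd']
bit_string = '10110'
formula_list = [name for name, bit in zip(coeff_name_list, bit_string)
                if bit == '1']
print "is_correct~%s" % '+'.join(formula_list)
# -> is_correct~p_value+connectivity+cluster_size
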