def check_R(model, g):
    """Cross-check a fitted logistic regression against R's glm() via rpy.

    Builds an R data frame from model.X / model.y, refits the same logistic
    model with R's glm(family=binomial('logit')), and collects both R's
    coefficient vector and our own (g.beta) for comparison.

    NOTE(review): `allclose` is imported but never used and the function ends
    right after computing `coef2` — the actual coefficient comparison /
    return appears to be missing (truncated?). Confirm against the original
    source.
    """
    import rpy
    from rpy import r
    from numpy import array, allclose
    # R formula syntax cannot contain ':', '+', '-', '_' inside variable
    # names, so sanitize each name (skip vars[0], presumably the intercept —
    # TODO confirm).
    vars = [
        v.replace(':', '.').replace('+', 'p').replace('-', 'm').replace('_', '.')
        for v in model.vars[1:]
    ]
    # One flattened design column per sanitized name; column 0 of model.X is
    # skipped (i+1), matching the skipped vars[0] above.
    frame = dict((v, model.X[:, i + 1].reshape(-1)) for i, v in enumerate(vars))
    frame['y'] = model.y.reshape(-1)
    formula = 'y ~ ' + ' + '.join(v.replace(':', '.') for v in vars)
    # NO_CONVERSION: keep the glm result as an opaque R object so it can be
    # passed back into R functions (coefficients, etc.) unchanged.
    rpy.set_default_mode(rpy.NO_CONVERSION)
    mod = r.glm(r(formula), data=r.data_frame(**frame), family=r.binomial('logit'))
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    pmod = mod.as_py()
    coef = r.coefficients(mod)
    # Assemble R's coefficients in our column order: intercept first, then
    # each sanitized variable name.
    coef = array([coef['(Intercept)']] + [coef[v] for v in vars], dtype=float)
    # Our own fitted coefficients, flattened for comparison with `coef`.
    coef2 = g.beta.reshape(-1)
def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3): times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output. SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"] for tag in tag_list: self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj. data = rc.list(y=self.y[tag],X=self.X[tag]) model = "y~X-1" # Use -1 because X has an intercept already if self.regtype=="Independent Linear": try: result = rc.lm(model,data=data) except: pdb.set_trace() elif self.regtype=="Independent Logistic": result = rc.glm(model,family=rc.binomial("logit"),data=data) rpy.set_default_mode(rpy.BASIC_CONVERSION) # Return to normal conversion mode. summary = rc.summary(result,correlation=rc.TRUE) self._record_regression_stats(tag, summary) beta_dict = dict() sorted_sources = self.sorted_sources[tag] coeff_matrix = summary["coefficients"] for i in range(len(sorted_sources)): try: cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:])) except IndexError: util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag) if remove_tags_when_bad_regression: self._remove_tag(tag) break # break from for-loop over sorted_sources; we don't continue out of the per-tag for loop until later when we check if tag is in self.features.... continue try: cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10) except OverflowError: pass beta_dict[sorted_sources[i]] = cur_source_dict if tag not in self.features: # We've removed this tag a few lines above, so skip it. continue self.beta[tag] = beta_dict if times_showed_summary < n_times_show_summary: self._print_regression_summary(tag, summary) times_showed_summary += 1
# Example script: replicate two statsmodels regressions through R via rpy.
# Example 1 fits OLS on the Longley data; example 2 fits a binomial GLM on
# the star98 data, then prints R's coefficient estimates.
from rpy import r
import numpy as np
import statsmodels.api as sm

examples = [1, 2]

if 1 in examples:
    # Longley macroeconomic data; add the constant column ourselves.
    data = sm.datasets.longley.load(as_pandas=False)
    y, x = data.endog, sm.add_constant(data.exog, prepend=False)
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    # '-1' suppresses R's implicit intercept: x already carries one.
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    # BUG FIX: `iterkeys` was called but never defined or imported (a
    # Python 2 / six leftover) and raised NameError; use the converted
    # result's own keys() instead.
    print(list(results.keys()))
    print(results['coefficients'])

if 2 in examples:
    data2 = sm.datasets.star98.load(as_pandas=False)
    y2, x2 = data2.endog, sm.add_constant(data2.exog, prepend=False)
    import rpy
    # Convert (successes, failures) counts to a success proportion.
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    params_est = [
        results2['coefficients'][k] for k in sorted(results2['coefficients'])
    ]
    print(params_est)
    print(', '.join(['%13.10f'] * 21) % tuple(params_est))
import numpy as np import scikits.statsmodels.api as sm examples = [1, 2] if 1 in examples: data = sm.datasets.longley.load() y,x = data.endog, sm.add_constant(data.exog) des_cols = ['x.%d' % (i+1) for i in range(x.shape[1])] formula = r('y~%s-1' % '+'.join(des_cols)) frame = r.data_frame(y=y, x=x) results = r.lm(formula, data=frame) print results.keys() print results['coefficients'] if 2 in examples: data2 = sm.datasets.star98.load() y2,x2 = data2.endog, sm.add_constant(data2.exog) import rpy y2 = y2[:,0]/y2.sum(axis=1) des_cols2 = ['x.%d' % (i+1) for i in range(x2.shape[1])] formula2 = r('y~%s-1' % '+'.join(des_cols2)) frame2 = r.data_frame(y=y2, x=x2) results2 = r.glm(formula2, data=frame2, family='binomial') params_est = [results2['coefficients'][k] for k in sorted(results2['coefficients'])] print params_est print ', '.join(['%13.10f']*21) % tuple(params_est)
def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
    """Fit one (logistic) regression per GO term via R and collect coefficients.

    For each go_no in go_no2prediction_space, fits glm() in R — logistic
    (family=binomial) when self.logistic is set, otherwise ordinary glm —
    using only the predictors enabled by bit_string, and returns
    go_no2lm_results mapping go_no -> [coeff_list, coeff_p_value_list,
    score_cut_off], where the last entry (1) is replaced later in
    get_score_cut_off().

    02-28-05
        linear model fitting here

    03-08-05
        grouping and accumulating before do linear model fitting, see log of
        2005, section 'linear model overfitting' for detail.
    03-27-05
        Use glm of R to do logistic regression
    06-30-05
        add cluster_size
        add bit_string to control which parameter should be enabled.
    07-04-05
        add connectivity_2nd
    07-06-05
        add logistic
    11-09-05 extend coeff_list and coeff_p_value_list
        restructure the list, go_no2lm_results[go_no]

    --data_prepare
    --submit
    """
    sys.stderr.write("Linear Model Fitting...\n")
    go_no2lm_results = {}

    #06-30-05 setup the formula_list based on bit_string
    # bit_string[i] == '1' enables coeff_name_list[i] as a predictor.
    coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
    formula_list = []
    for i in range(len(bit_string)):
        if bit_string[i] == '1':
            formula_list.append(coeff_name_list[i])

    for (go_no, data) in go_no2prediction_space.iteritems():
        sys.stderr.write("%s prediction entries from %s.\n" % (len(data), go_no))
        #11-09-05 extend coeff_list and coeff_p_value_list
        # Fixed length 7: slot 0 is the intercept, slots 1..5 the (possibly
        # disabled) predictors; defaults 0 / 1 stand for "coefficient absent".
        coeff_list = [0]*7  #intercept, p_value, recurrence, connectivity, cluster_size
        coeff_p_value_list = [1]*7
        index = 0  #06-30-05 the pointer for summary_stat
        if len(data) <= 50:  #two few data
            continue
        #convert it to a 2d array
        data = array(data)
        """
        data_frame = r("d=data.frame(p_value=c(%s),recurrence=c(%s),connectivity=c(%s), is_correct=c(%s))"%(repr(list(data[:,0]))[1:-1], \
            repr(list(data[:,1]))[1:-1], repr(list(data[:,2]))[1:-1], repr(list(data[:,3]))[1:-1]))
        lm_result = r("lm_result=glm(is_correct~p_value+recurrence+connectivity, data=d,family=binomial)")
        significance_dict = r("summary(lm_result)")
        print significance_dict['coefficients']
        """
        set_default_mode(NO_CONVERSION) #04-07-05
        # Columns 0..4 are the predictors in coeff_name_list order; the last
        # column is the response.
        data_frame = r.as_data_frame({"p_value":data[:,0], "recurrence":data[:,1], "connectivity":data[:,2], \
            "cluster_size":data[:,3], "connectivity_2nd":data[:,4], "is_correct":data[:,-1]})  #06-30-05 -1 denotes is_correct
        if self.logistic:
            lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame, family=r("binomial"))
        else:
            lm_result = r.glm(r("is_correct~%s"%'+'.join(formula_list)), data=data_frame)  #06-30-05 use formula_list
        set_default_mode(BASIC_CONVERSION) #04-07-05
        #04-07-05 r.summary() requires lm_result in NO_CONVERSION state
        summary_stat = r.summary(lm_result)
        if self.debug:
            print "everything about coefficients from function", go_no, "is"
            print summary_stat['coefficients']  #p-values of coefficients
        """
        #04-07-05 convert to python dictionary form
        lm_result = lm_result.as_py()
        coeff_list = [lm_result["coefficients"]["(Intercept)"], lm_result["coefficients"]["p_value"], \
            lm_result["coefficients"]["recurrence"], lm_result["coefficients"]["connectivity"], \
            lm_result["coefficients"]["cluster_size"], \
            summary_stat['coefficients'][0][-1], summary_stat['coefficients'][1][-1],\
            summary_stat['coefficients'][2][-1], summary_stat['coefficients'][3][-1],\
            summary_stat['coefficients'][4][-1], 1]
            #the last entry is score_cut_off, replaced later in get_score_cut_off()
            #06-30-05 add corresponding p-values
        """
        #06-30-05 0 in summary_stat['coefficients'] is intercept
        coeff_list[0] = summary_stat['coefficients'][0][0]  #0 is the coefficient
        coeff_p_value_list[0] = summary_stat['coefficients'][0][-1]  #-1 is the corresponding p-value
        #06-30-05 fill in other efficients based on bit_string, NOTE i+1
        # `index` walks the rows R actually fitted (only enabled predictors),
        # while i+1 maps each back to its fixed slot in coeff_list.
        for i in range(len(bit_string)):
            if bit_string[i] == '1':
                index += 1
                coeff_list[i+1] = summary_stat['coefficients'][index][0]  #0 is the coefficient
                coeff_p_value_list[i+1] = summary_stat['coefficients'][index][-1]  #-1 is the corresponding p-value
        #11-09-05 restructure the following list
        go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]  #the last entry is score_cut_off, replaced later in get_score_cut_off()
    sys.stderr.write("done.\n")
    return go_no2lm_results