コード例 #1
0
ファイル: linear1.py プロジェクト: gzhang-hli/glu-genetics
def check_R(model,g):
  """
  Refit a logistic model with R's glm (through rpy) for comparison with g.

  Reads model.vars, model.X and model.y to build an R data frame and
  formula, fits glm with a binomial family and logit link, and collects
  the R coefficients next to the flattened g.beta.

  NOTE(review): the body visible here stops after coef2 is computed;
  allclose, pmod and coef2 are not used in the visible part, so the
  actual comparison presumably follows in code that is not shown.
  """
  import rpy
  from   rpy   import r
  from   numpy import array,allclose

  # Rewrite the variable names (':' and '_' become '.', '+' becomes 'p',
  # '-' becomes 'm'), presumably so they are usable as R column names.
  # The first entry of model.vars is skipped, as is the first column of
  # model.X below -- presumably the intercept, which R adds itself.
  vars = [ v.replace(':','.').replace('+','p').replace('-','m').replace('_','.') for v in model.vars[1:] ]
  # One flattened column of X per sanitized name, plus the response as 'y'.
  frame = dict( (v,model.X[:,i+1].reshape(-1)) for i,v in enumerate(vars) )
  frame['y'] = model.y.reshape(-1)
  # The names were already sanitized above, so this replace() is a no-op.
  formula = 'y ~ ' + ' + '.join(v.replace(':','.') for v in vars)

  # Fit with conversion disabled so that mod stays an R object, then
  # switch back to basic conversion for the calls that follow.
  rpy.set_default_mode(rpy.NO_CONVERSION)
  mod = r.glm(r(formula),data=r.data_frame(**frame),family=r.binomial('logit'))
  rpy.set_default_mode(rpy.BASIC_CONVERSION)
  pmod = mod.as_py()

  # R coefficients ordered as intercept first, then the variables in the
  # same order as vars; coef2 is the flattened beta of g.
  coef  = r.coefficients(mod)
  coef  = array([coef['(Intercept)']] + [ coef[v] for v in vars ],dtype=float)
  coef2 = g.beta.reshape(-1)
 def _independent_betas_same_sources(self, tag_list, remove_tags_when_bad_regression, n_times_show_summary=3):
     """
     Fit one regression per tag in R and store the coefficient statistics.

     For every tag, self.y[tag] is regressed on self.X[tag] with the R
     formula "y~X-1", using lm when self.regtype is "Independent Linear"
     and a binomial/logit glm when it is "Independent Logistic".  The R
     summary is passed to self._record_regression_stats, and
     self.beta[tag] is set to a dict mapping each entry of
     self.sorted_sources[tag] to its beta, stderr, tstat and pval
     (plus "-log10(pval)" when it can be computed).

     Parameters:
       tag_list -- iterable of tags to fit.
       remove_tags_when_bad_regression -- if true, a tag whose coefficient
         matrix has fewer rows than sorted sources is dropped through
         self._remove_tag and gets no entry in self.beta.
       n_times_show_summary -- at most this many regression summaries are
         printed through self._print_regression_summary.

     Raises:
       ValueError -- if self.regtype is not one of the two supported types.
       Errors raised by the R fit propagate to the caller.
     """
     times_showed_summary = 0 # This allows us to print out some summary statistics without producing an overwhelming amount of output.
     SUMMARY_STATS = ["beta", "stderr", "tstat", "pval"]
     model = "y~X-1" # Use -1 because X has an intercept already
     for tag in tag_list:
         self._progress("Computing betas for tag %s." % tag, newline=True) # rmme: newline make false
         rpy.set_default_mode(rpy.NO_CONVERSION) # Turn off conversion so that lm returns Robj.
         try:
             data = rc.list(y=self.y[tag],X=self.X[tag])
             if self.regtype=="Independent Linear":
                 result = rc.lm(model,data=data)
             elif self.regtype=="Independent Logistic":
                 result = rc.glm(model,family=rc.binomial("logit"),data=data)
             else:
                 # Without this branch an unknown type only surfaced later as an unbound 'result'.
                 raise ValueError("Unsupported regression type: %r" % (self.regtype,))
         finally:
             # Return to normal conversion mode even when the fit fails, so
             # a failure here does not leave rpy in NO_CONVERSION mode.
             rpy.set_default_mode(rpy.BASIC_CONVERSION)
         summary = rc.summary(result,correlation=rc.TRUE)
         self._record_regression_stats(tag, summary)
         beta_dict = dict()
         sorted_sources = self.sorted_sources[tag]
         coeff_matrix = summary["coefficients"]
         for i, source in enumerate(sorted_sources):
             try:
                 cur_source_dict = dict(zip(SUMMARY_STATS,coeff_matrix[i,:]))
             except IndexError:
                 # Fewer coefficient rows than sources.
                 util.info("\tWARNING: Regression for %s didn't end up using all variables." % tag)
                 if remove_tags_when_bad_regression:
                     self._remove_tag(tag)
                     break # break from for-loop over sorted_sources; the per-tag loop is continued below once we see the tag is gone from self.features.
                 continue
             try:
                 cur_source_dict["-log10(pval)"] = -log(cur_source_dict["pval"], 10)
             except (OverflowError, ValueError):
                 # A p-value of zero has no finite logarithm.  Assuming log is
                 # math.log, this is a ValueError on Python >= 2.6 and was an
                 # OverflowError before that; either way the key is left out.
                 pass
             beta_dict[source] = cur_source_dict
         if tag not in self.features: # We've removed this tag a few lines above, so skip it.
             continue
         self.beta[tag] = beta_dict
         if times_showed_summary < n_times_show_summary:
             self._print_regression_summary(tag, summary)
             times_showed_summary += 1
コード例 #3
0
from rpy import r
import numpy as np
import statsmodels.api as sm

# Which of the example fits below to run.
examples = [1, 2]

if 1 in examples:
    # Linear model on the Longley data, fitted with R's lm.
    data = sm.datasets.longley.load(as_pandas=False)
    y, x = data.endog, sm.add_constant(data.exog, prepend=False)
    # One formula term per design column, named 'x.1', 'x.2', ...;
    # the trailing '-1' in the formula suppresses R's own intercept
    # because the constant column was already added above.
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    # results.keys() replaces a call to iterkeys(), which is neither
    # imported nor defined in this script.
    print(list(results.keys()))
    print(results['coefficients'])

if 2 in examples:
    # Binomial glm on the Star98 data, fitted with R's glm.
    data2 = sm.datasets.star98.load(as_pandas=False)
    y2, x2 = data2.endog, sm.add_constant(data2.exog, prepend=False)
    import rpy
    # Response as a proportion: first endog column over the row total.
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    # NOTE(review): sorted() orders the coefficient names as strings, so
    # 'x.10' comes before 'x.2'; confirm this is the intended order.
    params_est = [
        results2['coefficients'][k] for k in sorted(results2['coefficients'])
    ]
    print(params_est)
    # One format field per estimate rather than a hard-coded count of 21.
    print(', '.join(['%13.10f'] * len(params_est)) % tuple(params_est))
コード例 #4
0
import numpy as np
import scikits.statsmodels.api as sm


# r is used throughout the examples below but was never imported.
from rpy import r

# Which of the example fits below to run.
examples = [1, 2]

if 1 in examples:
    # Linear model on the Longley data, fitted with R's lm.
    data = sm.datasets.longley.load()
    y,x = data.endog, sm.add_constant(data.exog)
    # One formula term per design column, named 'x.1', 'x.2', ...;
    # the trailing '-1' in the formula suppresses R's own intercept.
    des_cols = ['x.%d' % (i+1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(results.keys())
    print(results['coefficients'])

if 2 in examples:
    # Binomial glm on the Star98 data, fitted with R's glm.
    data2 = sm.datasets.star98.load()
    y2,x2 = data2.endog, sm.add_constant(data2.exog)
    import rpy
    # Response as a proportion: first endog column over the row total.
    y2 = y2[:,0]/y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i+1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)
    results2 = r.glm(formula2, data=frame2, family='binomial')
    # NOTE(review): sorted() orders the coefficient names as strings, so
    # 'x.10' comes before 'x.2'; confirm this is the intended order.
    params_est = [results2['coefficients'][k] for k
                    in sorted(results2['coefficients'])]
    print(params_est)
    # One format field per estimate rather than a hard-coded count of 21.
    print(', '.join(['%13.10f']*len(params_est)) % tuple(params_est))

コード例 #5
0
ファイル: p_gene_lm.py プロジェクト: polyactis/annot
	def lm_fit(self, lm_instance, go_no2prediction_space, bit_string, curs=None, lm_table=None):
		"""
		Fit one R glm per go_no and collect coefficients and their p-values.

		For every go_no with more than 50 prediction entries, the entries are
		turned into a 2d array and passed to R's glm with 'is_correct' (last
		column) as response and the predictors enabled by bit_string as terms.
		With self.logistic set the family is binomial, otherwise glm's default
		family is used.

		Parameters:
			lm_instance: not used in this method.
			go_no2prediction_space: mapping of go_no to a list of rows. Columns
				0-4 are read as p_value, recurrence, connectivity, cluster_size
				and connectivity_2nd; the last column is read as is_correct.
			bit_string: string of '0'/'1' flags; position i enables the i-th
				name of (p_value, recurrence, connectivity, cluster_size,
				connectivity_2nd) in the model formula.
			curs, lm_table: not used in this method.

		Returns:
			dict mapping go_no to [coeff_list, coeff_p_value_list, 1]. Both lists
			have 7 slots: slot 0 is the intercept and slot i+1 belongs to the
			i-th predictor; slots of disabled predictors keep their defaults
			(0 for coefficients, 1 for p-values). The trailing 1 is the
			score_cut_off placeholder, replaced later in get_score_cut_off().

		02-28-05
			linear model fitting here
		
		03-08-05
			grouping and accumulating before do linear model fitting, see log of 2005, 
			section 'linear model overfitting' for detail.
		03-27-05
			Use glm of R to do logistic regression
		06-30-05
			add cluster_size
			add bit_string to control which parameter should be enabled.
		07-04-05
			add connectivity_2nd
		07-06-05
			add logistic
		11-09-05 extend coeff_list and coeff_p_value_list
			restructure the list, go_no2lm_results[go_no]
			
			--data_prepare
			--submit
		"""
		sys.stderr.write("Linear Model Fitting...\n")
		go_no2lm_results = {}
		
		#06-30-05	setup the formula_list based on bit_string
		coeff_name_list = ['p_value', 'recurrence', 'connectivity', 'cluster_size', 'connectivity_2nd']
		formula_list = []
		for i, bit in enumerate(bit_string):
			if bit == '1':
				formula_list.append(coeff_name_list[i])
		formula_text = "is_correct~%s"%'+'.join(formula_list)
		
		#items() rather than iteritems() so the loop runs on Python 2 and 3 alike
		for (go_no,data) in go_no2prediction_space.items():
			sys.stderr.write("%s prediction entries from %s.\n"%(len(data), go_no))
			if len(data)<=50:
				#too few data
				continue
			#11-09-05 extend coeff_list and coeff_p_value_list
			coeff_list = [0]*7	#intercept, p_value, recurrence, connectivity, cluster_size, connectivity_2nd
			coeff_p_value_list = [1]*7
			#convert it to a 2d array
			data = array(data)
			#An earlier version built the data.frame and the glm call as R source
			#strings; the frame is now passed through rpy directly.
			set_default_mode(NO_CONVERSION) #04-07-05
			try:
				columns = {
					"p_value":data[:,0],
					"recurrence":data[:,1],
					"connectivity":data[:,2],
					"cluster_size":data[:,3],
					"connectivity_2nd":data[:,4],
					"is_correct":data[:,-1],	#06-30-05	-1 denotes is_correct
				}
				data_frame = r.as_data_frame(columns)
				formula = r(formula_text)	#06-30-05 use formula_list
				if self.logistic:
					lm_result = r.glm(formula, data=data_frame, family=r("binomial"))
				else:
					lm_result = r.glm(formula, data=data_frame)
			finally:
				#restore the conversion mode even if R rejects the data or the model
				set_default_mode(BASIC_CONVERSION) #04-07-05
			#04-07-05 r.summary() requires lm_result in NO_CONVERSION state
			summary_stat = r.summary(lm_result)
			coefficient_rows = summary_stat['coefficients']
			if self.debug:
				#single-argument print(...) gives the same output on Python 2 and 3
				print("everything about coefficients from function %s is" % (go_no,))
				print(coefficient_rows)	#p-values of coefficients
			#An earlier version read the coefficients from lm_result.as_py();
			#they are now taken from the summary rows together with the p-values.
			#06-30-05	0 in summary_stat['coefficients'] is intercept
			coeff_list[0] = coefficient_rows[0][0]	#0 is the coefficient
			coeff_p_value_list[0] = coefficient_rows[0][-1]	#-1 is the corresponding p-value
			#06-30-05	fill in other efficients based on bit_string, NOTE i+1
			index = 0	#06-30-05	the pointer for summary_stat
			for i, bit in enumerate(bit_string):
				if bit == '1':
					#summary rows only exist for enabled predictors, in formula order
					index += 1
					coeff_list[i+1] = coefficient_rows[index][0]	#0 is the coefficient
					coeff_p_value_list[i+1] = coefficient_rows[index][-1]	#-1 is the corresponding p-value
			#11-09-05 restructure the following list
			go_no2lm_results[go_no] = [coeff_list, coeff_p_value_list, 1]	#the last entry is score_cut_off, replaced later in get_score_cut_off()
		sys.stderr.write("done.\n")
		return go_no2lm_results