def errors_sampling(self):
    beta_boot = []
    res_boot = []
    for b in range(self.B):
        # resample row indices with replacement
        ind = npr.randint(0, len(self.X), len(self.X))
        sample_X = self.X[ind, :]
        sample_resid = self.resid[ind]
        sample_Y = np.zeros(len(self.X))
        if self.method == "linear":
            for i in range(len(self.X)):
                sample_Y[i] = np.dot(sample_X[i, :], self.beta) + np.sqrt(self.var) * sample_resid[i]
            model_sample = st.GLM(sample_Y, sample_X, st.families.Gaussian())
        if self.method == "logistic":
            for i in range(len(self.X)):
                # fitted probability pi = exp(x'beta) / (1 + exp(x'beta)),
                # then a residual-perturbed score thresholded at 0.5
                pi_logreg = np.exp(np.dot(sample_X[i, :], self.beta))
                pi_logreg = pi_logreg / (1 + pi_logreg)
                cutoff = pi_logreg + np.sqrt(pi_logreg * (1 - pi_logreg)) * sample_resid[i]
                sample_Y[i] = 1 if cutoff > 0.5 else 0
            model_sample = st.GLM(sample_Y, sample_X, st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        if self.method == "linear":
            resid_sample = res_sample.resid_deviance
        if self.method == "logistic":
            resid_sample = res_sample.resid_pearson
        sd_sample = np.std(resid_sample)
        beta_boot.append(beta_sample)
        res_boot.append(sd_sample)
    return beta_boot
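# The per-row loop above can be collapsed into a single matrix product. A
# minimal sketch of the linear branch as a hypothetical helper method (the
# name is not part of the original class):
def _errors_response_linear(self, ind):
    # X[ind] @ beta computes np.dot(sample_X[i, :], self.beta) for every row at once
    return self.X[ind, :] @ self.beta + np.sqrt(self.var) * self.resid[ind]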
def bootstrap_H0_CS(self, k, hyp):
    # STEP 1: estimate the parameters and errors under H0
    gamma, Residus, X_H0, vrais, y_prime = self.Estim_H0(k, hyp)
    # STEP 2: randomly resample the observations
    test_ES = []
    for b in range(self.B):
        ind = npr.randint(0, len(Residus), len(Residus))
        # STEP 3: estimate the betas by refitting the regression on the bootstrap sample
        if self.method == "linear":
            model_sample = st.GLM(y_prime[ind], X_H0[ind, :], st.families.Gaussian())
        if self.method == "logistic":
            model_sample = st.GLM(y_prime[ind], X_H0[ind, :], st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        # STEP 4: store the bootstrap test statistics in a list
        X_sub = X_H0.copy()
        X = self.X.copy()
        Y_sub = self.y.copy()
        if self.method == "linear":
            test_ES.append(self.Fisher(X, X_sub, Y_sub, y_prime, beta_sample, self.beta))
        if self.method == "logistic":
            test_ES.append(-2 * (res_sample.llf - vrais))
    return test_ES
def bootstrap_H0_ES(self, k, hyp):
    # STEP 1: estimate the parameters and errors under H0
    gamma, Residus, X_H0, vrais, y_prime = self.Estim_H0(k, hyp)
    var_H0 = (1 / (len(X_H0) - X_H0.shape[1] - 1)) * np.sum(Residus ** 2)
    # STEP 2: randomly resample the residuals
    test_ES = []
    for b in range(self.B):
        ind = npr.randint(0, len(Residus), len(Residus))
        # STEP 3: rebuild the responses from the H0 fit plus resampled residuals,
        # then refit the regression on the bootstrap sample
        y_hat = np.zeros(len(X_H0))
        m = 0
        for i in ind:
            if self.method == "linear":
                y_hat[m] = np.dot(X_H0[i, :], gamma) + np.sqrt(var_H0) * Residus[i]
            if self.method == "logistic":
                # fitted probability pi = exp(x'gamma) / (1 + exp(x'gamma))
                pi_logreg = np.exp(np.dot(X_H0[i, :], gamma))
                pi_logreg = pi_logreg / (1 + pi_logreg)
                cutoff = pi_logreg + np.sqrt(pi_logreg * (1 - pi_logreg)) * Residus[i]
                y_hat[m] = 1 if cutoff > 0.5 else 0
            m = m + 1
        if self.method == "logistic":
            model_sample = st.GLM(y_hat, X_H0[ind, :], st.families.Binomial())
            res_sample = model_sample.fit()
        # STEP 4: store the bootstrap test statistics in a list
        if self.method == "linear":
            test_ES.append(self.Fisher(self.X, X_H0, y_hat, gamma, self.beta))
        if self.method == "logistic":
            test_ES.append(-2 * (res_sample.llf - vrais))
    return test_ES
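# Hypothetical helper (not in the original code): both bootstrap_H0_CS and
# bootstrap_H0_ES return a list of test statistics drawn under H0, but no
# p-value. A minimal sketch of the usual empirical p-value, assuming test_obs
# is the statistic computed on the original data:
def bootstrap_pvalue(self, test_obs, test_boot):
    test_boot = np.asarray(test_boot)
    # proportion of bootstrap statistics at least as extreme as the observed
    # one, with the +1 correction so the p-value is never exactly zero
    return (1 + np.sum(test_boot >= test_obs)) / (1 + len(test_boot))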
def case_sampling(self):
    beta_boot = []
    sd_hat = np.zeros(self.B)
    for b in range(self.B):
        ind = npr.randint(0, len(self.X), len(self.X))
        sample_X = self.X[ind, :]
        sample_Y = self.y[ind]
        if self.method == "linear":
            model_sample = st.GLM(sample_Y, sample_X, st.families.Gaussian())
        if self.method == "logistic":
            model_sample = st.GLM(sample_Y, sample_X, st.families.Binomial())
        res_sample = model_sample.fit()
        beta_sample = res_sample.params
        beta_boot.append(beta_sample)
        sd_hat[b] = np.std(
            self.X[ind, :] / np.sqrt(
                np.var(self.X[ind, :]) / np.mean(self.y[ind]) ** 2
                + self.X[ind, :] * np.var(self.y[ind]) / np.mean(self.y[ind]) ** 4
            )
        )
    return beta_boot, sd_hat
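# Hypothetical helper (not in the original code): a percentile confidence
# interval built from the bootstrap coefficients returned by case_sampling or
# errors_sampling, reusing self.alpha. A minimal sketch:
def percentile_ci(self, beta_boot):
    boot = np.asarray(beta_boot)  # shape (B, number of coefficients)
    lower = np.percentile(boot, 100 * self.alpha / 2, axis=0)
    upper = np.percentile(boot, 100 * (1 - self.alpha / 2), axis=0)
    return lower, upper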
def __init__(self, X, y, method, alpha=0.05, B=1000):
    self.X = X  # !!! TODO: remove y !!!
    self.y = y
    self.alpha = alpha
    self.B = B
    self.method = method
    if method == "linear":
        Reg = st.GLM(y, X)
    if method == "logistic":
        Reg = st.GLM(y, X, st.families.Binomial())
    results = Reg.fit()
    self.results = results
    # outputs of the fitted regression
    self.beta = results.params
    if method == "linear":
        self.resid = results.resid_deviance
    if method == "logistic":
        self.resid = results.resid_pearson
    self.var = (1 / (len(X) - X.shape[1] - 1)) * np.sum(self.resid ** 2)
    self.y_pred = results.fittedvalues
    self.std_beta = results.bse
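# A minimal usage sketch. The class name BootstrapGLM is hypothetical (the
# actual class name is not shown in this file); np, npr, and st are assumed to
# be the numpy / numpy.random / statsmodels.api aliases used above:
#
#   n = 200
#   X = np.column_stack([np.ones(n), npr.randn(n, 2)])
#   y = X @ np.array([0.5, 1.0, -1.0]) + npr.randn(n)
#   boot = BootstrapGLM(X, y, method="linear", alpha=0.05, B=1000)
#   beta_boot, sd_hat = boot.case_sampling()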
def Estim_H0(self, k, hyp):
    # shift the response so that testing beta_k = hyp reduces to a
    # regression without column k (see the note after this method)
    y = self.y.copy()
    if hyp != 0:
        for i in range(len(y)):
            y[i] = y[i] - hyp * self.X[i, k]
    if self.method == "linear":
        X = np.delete(self.X, k, axis=1).copy()
        model = st.families.Gaussian()
        Reg = st.GLM(y, X, model)
        results = Reg.fit()
        Beta = results.params
        resid = results.resid_deviance
        vraise = 0
    if self.method == "logistic":
        X = np.delete(self.X, k, axis=1).copy()
        model = st.families.Binomial()
        Reg = st.GLM(y, X, model)
        results = Reg.fit()
        Beta = results.params
        resid = results.resid_pearson
        vraise = results.llf
    return Beta, resid, X, vraise, y
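# Why Estim_H0 shifts the response: under H0: beta_k = hyp, the linear model
#     y = X_{-k} @ gamma + hyp * X[:, k] + eps
# is equivalent to
#     y - hyp * X[:, k] = X_{-k} @ gamma + eps,
# i.e. an ordinary regression of the shifted response on X with column k
# removed. A minimal sketch of the equivalence for the linear branch (all
# names hypothetical):
#
#   y_prime = y - hyp * X[:, k]
#   X_H0 = np.delete(X, k, axis=1)
#   gamma_hat = st.GLM(y_prime, X_H0, st.families.Gaussian()).fit().params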
def logistic_regression(df, model, groupby=None, compute_cpd=True, standardize=False):
    # Should we use group-by?
    usegroupby = groupby is not None
    # If we're using group-by, find the unique values to group by;
    # otherwise there is a single condition covering the whole dataframe.
    if usegroupby:
        gb_u = df[groupby].drop_duplicates()  # unique values of the groupby variables
        ncond = gb_u.shape[0]
    else:
        ncond = 1
    mout = []
    for i in range(ncond):
        # Get the subset of data given by the groupby variable;
        # otherwise, use the entire dataframe.
        if usegroupby:
            thisdf = df.loc[np.sum(df[groupby] == gb_u.iloc[i, :], axis=1) == len(groupby)]
        else:
            thisdf = df
        # Convert the data into a regressand (y) and regression matrix (X)
        # based on the model formula.
        y, X = patsy.dmatrices(model, thisdf, return_type='dataframe')
        if standardize:
            for c in X.columns:
                if c != 'Intercept':
                    X[c] = scipy.stats.mstats.zscore(X[c])
        # Create and fit the model object.
        mdl = sreg.GLM(endog=y, exog=X, family=sm.genmod.families.family.Binomial())
        thismout = mdl.fit()
        thismout.bic = thismout.deviance + np.log(X.shape[0]) * len(thismout.params)
        thismout.rank = np.linalg.matrix_rank(X)
        thismout.npar = X.shape[1]
        thismout.fullrank = thismout.rank == thismout.npar
        # Placeholder for computing the coefficient of partial determination.
        if compute_cpd:
            pass
        # Store the results.
        mout.append(thismout)
    # Convert the output from GLMResults objects into a dictionary, which is
    # later converted into a pandas DataFrame.
    mout_dict = {
        'bic': [m.bic for m in mout],
        'deviance': [m.deviance for m in mout],
        'df_model': [m.df_model for m in mout],
        'df_resid': [m.df_resid for m in mout],
        'fittedvalues': [m.fittedvalues for m in mout],
        'llf': [m.llf for m in mout],
        'mu': [m.mu for m in mout],
        'npar': [m.npar for m in mout],
        'null_deviance': [m.null_deviance for m in mout],
        'rank': [m.rank for m in mout],
        'resid_deviance': [m.resid_deviance for m in mout],
        'scale': [m.scale for m in mout],
    }
    # Flatten the parameter/p-value output into one column per parameter.
    for i in range(X.shape[1]):
        mout_dict['b_' + X.columns[i]] = [m.params[i] for m in mout]
        mout_dict['p_' + X.columns[i]] = [m.pvalues[i] for m in mout]
    # Add the groupby information to the output data structure.
    if usegroupby:
        for gbcond in groupby:
            mout_dict[gbcond] = [gb_u[gbcond].iloc[i] for i in range(ncond)]
    # Convert the dictionary into a dataframe.
    return pd.DataFrame(mout_dict)
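# A minimal usage sketch for logistic_regression; the dataframe, column names,
# and groupby values below are hypothetical. Assumes the module-level imports
# used above (numpy as np, pandas as pd, patsy, scipy.stats, and the
# statsmodels aliases sreg and sm).
if __name__ == "__main__":
    import numpy.random as npr

    demo = pd.DataFrame({
        'choice': npr.binomial(1, 0.5, 300),
        'value': npr.randn(300),
        'session': np.repeat([1, 2, 3], 100),
    })
    # one logistic fit per session, with standardized regressors
    out = logistic_regression(demo, 'choice ~ value',
                              groupby=['session'], standardize=True)
    print(out[['session', 'b_value', 'p_value']])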