def get_confidence_interval_for_mean(self, X=[], alpha=0.05):
    """
    Calculate the confidence interval of the fitted mean for each datapoint.

    This is the confidence interval of the model (the mean response), not
    the prediction interval for a new observation.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Design rows to evaluate. A DataFrame may carry extra columns; only
        the columns listed in ``self.independent_`` are used. The ``[]``
        default is kept for backward compatibility only; it is never
        mutated and an empty input is not usable.
    alpha : float, optional
        Significance level; the interval covers ``1 - alpha`` (default
        0.05, i.e. a 95% interval).

    Returns
    -------
    pandas.DataFrame
        Columns ``y_hat``, ``upper_mean`` and ``lower_mean``, one row per
        row of ``X``.
    """
    if isinstance(X, pd.core.frame.DataFrame):
        X = X[self.independent_]
    y_hat = self.predict(X)

    # Plain ndarrays instead of the deprecated numpy.matrix; atleast_2d keeps
    # numpy.matrix's treatment of a 1-D input as a single row.
    design = numpy.atleast_2d(numpy.asarray(X, dtype=float))
    scaled_inverse = numpy.power(self.s_y, 2) * numpy.asarray(inv(self.X_dash_X))

    # Only the diagonal of design * scaled_inverse * design.T is needed, so
    # compute the row-wise quadratic forms directly rather than building
    # the full n-by-n product and discarding everything off the diagonal.
    s_c_2 = numpy.sum(numpy.dot(design, scaled_inverse) * design, axis=1)

    # NOTE(review): degrees of freedom are df_resid + 1, as in the original
    # code; confirm this matches how df_resid is defined on the model.
    t_val = stats.t.ppf(1 - alpha / 2.0, self.df_resid + 1)
    half_width = t_val * numpy.sqrt(s_c_2)

    return pd.DataFrame({'y_hat': y_hat,
                         'upper_mean': y_hat + half_width,
                         'lower_mean': y_hat - half_width})
def get_prediction_interval(self, X=[], alpha=0.05):
    """
    Return the prediction interval for each row of the data passed.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Rows to predict for. A DataFrame may contain more columns than the
        original data; only the columns in ``self.independent_`` are used.
        The ``[]`` default is kept for backward compatibility only; it is
        never mutated and an empty input is rejected.
    alpha : float, optional
        Significance level; the interval covers ``1 - alpha`` (default
        0.05, i.e. a 95% interval). Unrelated to the intercept column,
        which is also named ``"alpha"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``upper_pred``, ``lower_pred``, ``y_hat`` and
        ``percent_ci`` (full interval width as a percentage of
        ``|y_hat|``), indexed 0..n-1 in the row order of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    if isinstance(X, pd.core.frame.DataFrame):
        X = X[self.independent_]

    # Work positionally on the raw values: DataFrame.ix no longer exists in
    # current pandas, and label lookups would mismatch whenever X does not
    # carry a default 0..n-1 index.
    regressors = numpy.asarray(X, dtype=float)
    if regressors.ndim != 2:
        raise ValueError("X must be two-dimensional (rows x regressors)")
    y_hat = numpy.asarray(self.predict(X), dtype=float).ravel()

    # The intercept column is assumed to be called "alpha" and to be the
    # first column of self.independent_; it is dropped before centring.
    if "alpha" in self.independent_:
        regressors = regressors[:, 1:]
    centred = regressors - numpy.asarray(self.X_bar, dtype=float).ravel()

    # Invert once (it does not depend on the row), then take the row-wise
    # quadratic forms (x0 - x_bar) * inv(Z'MZ) * (x0 - x_bar).T.
    z_m_z_inv = numpy.asarray(inv(self.Z_M_Z))
    leverage = numpy.sum(numpy.dot(centred, z_m_z_inv) * centred, axis=1)

    # 1.0 / nobs: avoid integer division wiping out the term on Python 2.
    se_e = self.s_y * numpy.sqrt(1 + 1.0 / self.nobs + leverage)

    # NOTE(review): degrees of freedom are df_resid + 1, as in the original
    # code; confirm this matches how df_resid is defined on the model.
    t_val = stats.t.ppf(1 - alpha / 2.0, self.df_resid + 1)
    margin = t_val * se_e

    df_pred = pd.DataFrame({'upper_pred': y_hat + margin,
                            'lower_pred': y_hat - margin})
    df_pred['y_hat'] = y_hat
    df_pred['percent_ci'] = 100 * 2 * margin / numpy.abs(y_hat)
    return df_pred