import numpy
import pandas as pd
from numpy.linalg import inv
from scipy import stats

import metrics  # project-local helper module providing get_RMSE_pc


def get_prediction_interval(self, X):
    """
    Return the 95% prediction interval for the data passed.
    Note that if X is a dataframe it may contain more columns than
    there are in the original data, so just pull out the columns the
    model was trained on.
    """
    if isinstance(X, pd.DataFrame):
        # assume the intercept column is called 'alpha'
        X = X[self.independent_]
    df_pred = pd.DataFrame({'upper_pred': numpy.zeros(X.shape[0]),
                            'lower_pred': numpy.zeros(X.shape[0])})
    df_pred['y_hat'] = self.predict(X)
    df_pred['percent_ci'] = 0.0
    alpha = 0.05
    t_val = stats.t.ppf(1 - alpha / 2, self.df_resid + 1)
    for indx in df_pred.index:
        # centre the new observation on the training means, dropping the
        # 'alpha' (intercept) column if it is present
        if "alpha" in self.independent_:
            x_0_x_bar = numpy.matrix(X.iloc[indx].values[1:] - self.X_bar)
        else:
            x_0_x_bar = numpy.matrix(X.iloc[indx].values - self.X_bar)
        # standard error of an individual prediction:
        # s_y * sqrt(1 + 1/n + (x0 - x_bar)' (Z'M0Z)^-1 (x0 - x_bar))
        quad_form = float(x_0_x_bar * inv(self.Z_M_Z) * x_0_x_bar.T)
        se_e = self.s_y * numpy.sqrt(1 + (1 / self.nobs) + quad_form)
        df_pred.loc[indx, 'upper_pred'] = df_pred.loc[indx, 'y_hat'] + t_val * se_e
        df_pred.loc[indx, 'lower_pred'] = df_pred.loc[indx, 'y_hat'] - t_val * se_e
        df_pred.loc[indx, 'percent_ci'] = 100 * 2 * t_val * se_e / numpy.abs(df_pred.loc[indx, 'y_hat'])
    return df_pred
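
# A quick cross-check for the interval above (a sketch, not part of the
# class): statsmodels computes the same individual prediction interval via
# get_prediction(), so its obs_ci_* columns should line up with
# upper_pred/lower_pred, up to the degrees-of-freedom convention used for
# t_val. Assumes statsmodels is installed; the helper name is illustrative.
def check_interval_against_statsmodels(X, y, alpha=0.05):
    import statsmodels.api as sm
    # X already carries the 'alpha' column of ones, so no add_constant needed
    res = sm.OLS(y, X).fit()
    frame = res.get_prediction(X).summary_frame(alpha=alpha)
    # obs_ci_lower / obs_ci_upper bound the interval for a new observation
    return frame[['obs_ci_lower', 'obs_ci_upper']]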
def gen_simplemodel_data(n=1000, k=1):
    """Generate n observations from a linear model with k regressors plus
    an intercept column 'alpha'; returns (true coefficients, dataframe)."""
    numpy.random.seed(10)
    df_x = pd.DataFrame({'alpha': numpy.ones(n)})
    coefs = numpy.random.rand(k + 1)
    for ii in range(k):
        # draw each regressor from a standard normal distribution
        df_x['X{}'.format(ii)] = numpy.random.normal(0, 1, n)
    # y = X*beta plus a normal disturbance with a randomly drawn scale
    noise = numpy.random.normal(0, 1, n) * numpy.random.rand(1)
    df_x['y'] = df_x.values @ coefs + noise
    return (coefs, df_x)
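
# Example usage (illustrative helper name): generate a small sample and
# confirm that the closed-form OLS estimate (X'X)^-1 X'y lands close to the
# true coefficients, since the disturbance scale is at most 1.
def demo_gen_simplemodel_data():
    coefs, df = gen_simplemodel_data(n=200, k=1)
    X = df[['alpha', 'X0']].values
    y = df['y'].values
    b_hat = numpy.linalg.solve(X.T @ X, X.T @ y)
    print("true coefs: {}, OLS estimate: {}".format(coefs, b_hat))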
def fit(self, X, y, n_jobs=1):
    """
    y can be a series or array.
    X can be a dataframe or ndarray (N datapoints x M features).
    n_jobs is accepted for backward compatibility; recent scikit-learn
    versions take it in the LinearRegression constructor, not in fit().
    """
    self = super(LinModel, self).fit(X, y)
    self.nobs = X.shape[0]
    self.nparams = X.shape[1]
    # remove an extra 1 for the alpha (k-1)
    self.df_model = X.shape[1] - 1
    # (n-k-1) - we always assume an alpha is present
    self.df_resid = self.nobs - X.shape[1] - 1
    # standard error of the regression
    y_bar = y.mean()
    y_hat = self.predict(X)
    self.raw_data = X
    self.training = y
    self.fittedvalues = y_hat
    # explained sum of squares
    SSE = numpy.sum(numpy.square(y_hat - y_bar))
    e = numpy.matrix(y - y_hat).T
    self.resid = numpy.ravel(e)
    # total and residual sums of squares
    SST = numpy.sum(numpy.square(y - y_bar))
    SSR = numpy.sum(numpy.square(self.resid))
    self.ssr = SSR
    # mean squared error of the residuals (unbiased); the square root of
    # this is the standard error of the regression
    s_2 = SSR / (self.df_resid + 1)
    self.s_y = numpy.sqrt(s_2)
    self.RMSE_pc = metrics.get_RMSE_pc(y, y_hat)
    # also get the means of the independent variables
    if isinstance(X, pd.DataFrame):
        # assume the intercept column is called 'alpha'
        self.X_bar = X[X.columns[X.columns != 'alpha']].mean()
        Z = numpy.matrix(X[X.columns[X.columns != 'alpha']])
    else:
        # assume the intercept is the first column
        self.X_bar = numpy.mean(X, axis=0)[1:]
        Z = numpy.matrix(X[:, 1:])
    # Z'M0Z, where M_0 is the idempotent centering matrix
    i_n = numpy.matrix(numpy.ones(self.nobs))
    M_0 = numpy.matrix(numpy.eye(self.nobs)) - numpy.power(self.nobs, -1) * i_n * i_n.T
    self.Z_M_Z = Z.T * M_0 * Z
    # standard error of each estimator b_k
    X_mat = numpy.matrix(X)
    self.X_dash_X = X_mat.T * X_mat
    se = numpy.sqrt(numpy.diagonal(s_2 * numpy.linalg.inv(self.X_dash_X)))
    self.se = se
    self.t = self.coef_ / se
    self.p = 2 * (1 - stats.t.cdf(numpy.abs(self.t), y.shape[0] - X.shape[1]))
    self.independent_ = []
    if isinstance(X, pd.DataFrame):
        self.independent_ = X.columns.values
    # R2 = 1 - SSR/SST
    self.rsquared = 1 - SSR / SST
    # adjusted R2 = 1 - [(1-R2)(n-1)/(n-k-1)]
    self.rsquared_adj = 1 - (((1 - self.rsquared) * (self.nobs - 1)) / self.df_resid)
    # F statistic and its p-value (upper-tail probability via the
    # survival function)
    f_value = (self.rsquared / self.df_model) / \
              ((1 - self.rsquared) / (self.df_resid + 1))
    self.f_stat = f_value
    self.f_pvalue = stats.f.sf(f_value, self.df_model, self.df_resid + 1)
    return self
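
# End-to-end sketch tying the pieces together. It assumes LinModel is the
# LinearRegression subclass these methods belong to (as the super() call in
# fit implies) and that it is constructed with fit_intercept=False, since
# the design matrix already carries an explicit 'alpha' column of ones.
if __name__ == "__main__":
    true_coefs, df = gen_simplemodel_data(n=500, k=2)
    X = df[['alpha', 'X0', 'X1']]
    y = df['y']
    model = LinModel(fit_intercept=False)
    model.fit(X, y)
    print("true coefs:  ", true_coefs)
    print("fitted coefs:", model.coef_)
    print("R2={:.4f} adj-R2={:.4f} F={:.2f} (p={:.3g})".format(
        model.rsquared, model.rsquared_adj, model.f_stat, model.f_pvalue))
    # 95% prediction intervals for the first few training points
    print(model.get_prediction_interval(X.head()))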