def run_simple_regression(n=1000, k=1, feature='X0'):
    """Fit a LinModel on simulated data and plot prediction and mean-CI bands against `feature`."""
    (coefs, df) = gen_simplemodel_data(n=n, k=k)
    # logging.debug(df.head())
    df.sort_values(feature, inplace=True)

    lr = lin_model.LinModel()
    X = df[df.columns[df.columns != 'y']]
    y = df.y
    lr.fit(X=X, y=y)
    lr.summary()

    df_ci = lr.get_confidence_interval_for_mean(X)
    df_pi = lr.get_prediction_interval(X)

    (fig, ax) = plt.subplots(nrows=2, ncols=1, figsize=[14, 12])
    cols = sns.color_palette('husl', n_colors=4)

    # Top panel: observations, fitted values, prediction interval and mean confidence interval
    ax[0].scatter(X[feature], y, label='y', color=cols[3], alpha=0.4)
    ax[0].plot(X[feature], df_pi['upper_pred'], label='pred', color=cols[1], alpha=0.5)
    ax[0].plot(X[feature], df_pi['lower_pred'], color=cols[1], alpha=0.5)
    ax[0].plot(X[feature], df_ci['upper_mean'], color=cols[2], alpha=0.5)
    ax[0].plot(X[feature], df_ci['lower_mean'], label='mean_ci', color=cols[2], alpha=0.5)
    ax[0].scatter(X[feature], df_pi['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
    ax[0].legend(loc='best')

    # Bottom panel: observations against fitted values only
    ax[1].scatter(X[feature], y, label='y', color=cols[3], alpha=0.4)
    ax[1].scatter(X[feature], df_ci['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
    ax[1].legend(loc='best')
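
# Illustrative sketch (an assumption, not part of lin_model): the bands plotted above
# are taken to follow the standard OLS formulas, i.e. for row i with leverage
# h_i = x_i' (X'X)^{-1} x_i and s2 = RSS / (n - p):
#   mean CI:             y_hat_i +/- t_{alpha/2, n-p} * sqrt(s2 * h_i)
#   prediction interval: y_hat_i +/- t_{alpha/2, n-p} * sqrt(s2 * (1 + h_i))
# The hypothetical helper below reproduces those bands with numpy/scipy only, as a
# reference for the columns returned by get_confidence_interval_for_mean and
# get_prediction_interval.
def _ols_intervals_sketch(X, y, alpha=0.05):
    import numpy as np
    from scipy import stats

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n, p = X.shape
    beta, *_ = np.linalg.lstsq(X, y, rcond=None)   # OLS coefficients
    y_hat = X @ beta
    resid = y - y_hat
    s2 = resid @ resid / (n - p)                   # unbiased residual variance
    xtx_inv = np.linalg.inv(X.T @ X)
    h = np.einsum('ij,jk,ik->i', X, xtx_inv, X)    # leverage of each row
    t_val = stats.t.ppf(1 - alpha / 2, df=n - p)
    se_mean = np.sqrt(s2 * h)                      # std. error of the fitted mean
    se_pred = np.sqrt(s2 * (1 + h))                # std. error of a new observation
    return {
        'y_hat': y_hat,
        'lower_mean': y_hat - t_val * se_mean,
        'upper_mean': y_hat + t_val * se_mean,
        'lower_pred': y_hat - t_val * se_pred,
        'upper_pred': y_hat + t_val * se_pred,
    }
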
def test_pred_interval(show_plot=False):
    from ml_ext import examples

    (coefs, df) = examples.gen_simplemodel_data(n=50, k=3)
    df.sort_values('X1', inplace=True)

    lr = LinModel()
    X = df[df.columns[df.columns != 'y']]
    y = df.y
    lr.fit(X=X, y=y)
    lr.summary()

    df_ci = lr.get_confidence_interval_for_mean(X)
    df_pi = lr.get_prediction_interval(X)

    # Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm

    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)

    if show_plot:
        (fig, ax) = plt.subplots(nrows=2, ncols=1, figsize=[14, 12])
        cols = sns.color_palette('husl', n_colors=4)

        ax[0].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[0].plot(X.X1, df_pi['upper_pred'], label='pred', color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_pi['lower_pred'], color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_ci['upper_mean'], color=cols[2], alpha=0.5)
        ax[0].plot(X.X1, df_ci['lower_mean'], label='mean_ci', color=cols[2], alpha=0.5)
        ax[0].scatter(X.X1, df_pi['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[0].legend(loc='best')

        ax[1].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[1].scatter(X.X1, df_ci['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[1].plot(X.X1, iv_u, label='wls', color=cols[1], alpha=0.5)
        ax[1].plot(X.X1, iv_l, color=cols[1], alpha=0.5)
        ax[1].legend(loc='best')

    # Get the relative difference between the upper prediction bounds and check
    # they agree to within 0.1%
    overall_diff = 100 * numpy.sum(iv_u - df_pi['upper_pred']) / numpy.sum(iv_u)
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    assert abs(overall_diff) < 0.1
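
# Sketch of an additional cross-check (an assumption, not part of the original test):
# recent statsmodels versions expose OLSResults.get_prediction(), whose summary_frame()
# carries both the mean CI ('mean_ci_lower'/'mean_ci_upper') and the prediction interval
# ('obs_ci_lower'/'obs_ci_upper').  A helper like the one below could compare lin_model's
# mean confidence interval as well, not just the upper prediction bound asserted above.
# The helper name and tolerance are hypothetical.
def _compare_mean_ci_sketch(fitted_ols, X, df_ci, tol_pct=0.1):
    sm_frame = fitted_ols.get_prediction(X).summary_frame(alpha=0.05)
    diff = 100 * numpy.sum(sm_frame['mean_ci_upper'] - df_ci['upper_mean']) / numpy.sum(sm_frame['mean_ci_upper'])
    return abs(diff) < tol_pct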