Example no. 1
0
def sample_properties():
	"""Load the Wooldridge 401k sample dataset and log its summary statistics."""
	# Wooldridge 401k data, hosted as a CSV at Charles University.
	url = 'http://samba.fsv.cuni.cz/~cahlik/Backup/Ekonometrie/Data%20Wooldridge%20Stata/401k.csv'
	frame = pd.read_csv(url)
	logging.debug(frame.describe())
Example no. 2
0
	def confidence_interval(self):
		"""Return a two-sided 95% confidence interval for the mean of self.X.

		Uses the sample standard deviation and the Student-t critical value
		with N-1 degrees of freedom (appropriate when the population
		variance is unknown).

		Returns:
			tuple: (lower_bound, upper_bound) for the population mean.
		"""
		N = len(self.X)
		# Sample standard deviation (N-1 denominator).
		S = statistics.stdev(self.X)
		logging.debug(S)
		# Standard error of the mean is S / sqrt(N).
		# BUGFIX: the original divided by numpy.sqrt(self.X) — an
		# elementwise square root of the data — yielding an array
		# instead of the scalar standard error.
		se = S / numpy.sqrt(N)

		# Two-sided 95% critical value from the t distribution.
		pctile = stats.t.ppf(0.975, df=N - 1)

		x_bar = numpy.mean(self.X)

		return (x_bar - pctile * se, x_bar + pctile * se)
Example no. 3
0
def test_pred_interval(show_plot=False):
    """Compare LinModel's prediction interval against statsmodels' wls_prediction_std.

    Generates synthetic regression data, fits both LinModel and an OLS model,
    and asserts that the upper prediction bounds agree to within 0.1%.

    Args:
        show_plot: when True, render diagnostic scatter/interval plots.

    Raises:
        AssertionError: if the overall percent difference between the two
            upper prediction bounds exceeds 0.1%.
    """
    from ml_ext import examples
    (coefs, df) = examples.gen_simplemodel_data(n=50, k=3)
    # BUGFIX: DataFrame.sort was deprecated in pandas 0.17 and removed in
    # 0.20; sort_values is the supported replacement.
    df.sort_values('X1', inplace=True)
    lr = LinModel()
    X = df[df.columns[df.columns != 'y']]
    y = df.y

    lr.fit(X=X, y=y)
    lr.summary()
    df_ci = lr.get_confidence_interval_for_mean(X)
    df_pi = lr.get_prediction_interval(X)

    # Now use statsmodels to compare
    from statsmodels.sandbox.regression.predstd import wls_prediction_std
    import statsmodels.api as sm
    re = sm.OLS(y, X).fit()
    prstd, iv_l, iv_u = wls_prediction_std(re)

    if show_plot:
        (fig, ax) = plt.subplots(nrows=2, ncols=1, figsize=[14, 12])

        cols = sns.color_palette('husl', n_colors=4)
        ax[0].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)

        ax[0].plot(X.X1, df_pi['upper_pred'], label='pred', color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_pi['lower_pred'], color=cols[1], alpha=0.5)
        ax[0].plot(X.X1, df_ci['upper_mean'], color=cols[2], alpha=0.5)
        ax[0].plot(X.X1, df_ci['lower_mean'], label='mean_ci', color=cols[2], alpha=0.5)
        ax[0].scatter(X.X1, df_pi['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[0].legend(loc='best')

        ax[1].scatter(X.X1, y, label='y', color=cols[3], alpha=0.4)
        ax[1].scatter(X.X1, df_ci['y_hat'], label='y_hat', color=cols[0], alpha=0.5)
        ax[1].plot(X.X1, iv_u, label='wls', color=cols[1], alpha=0.5)
        ax[1].plot(X.X1, iv_l, color=cols[1], alpha=0.5)
        ax[1].legend(loc='best')

    # Get the percent difference between the upper bounds and check they
    # agree to within 0.1%.
    overall_diff = 100 * numpy.sum(iv_u - df_pi['upper_pred']) / numpy.sum(iv_u)
    logging.debug("Overall % difference in prediction ranges for upper bound: {}".format(overall_diff))
    # BUGFIX: the original `overall_diff < 0.1` passed trivially whenever
    # LinModel's bound exceeded statsmodels' (negative diff); compare the
    # magnitude instead.
    assert abs(overall_diff) < 0.1