def test_het_breusch_pagan(self):
    res = self.res

    bptest = dict(statistic=0.709924388395087, pvalue=0.701199952134347,
                  parameters=(2,), distr="f")

    bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
    compare_t_est(bp, bptest, decimal=(12, 12))
resid = fsm_results.resid
qq1 = sm.qqplot(resid, line='45', fit=True, dist=stats.t)

y = fsm_df["price"]
y_hat = fsm_results.predict()

# Residuals vs. fitted values
fig2, ax2 = plt.subplots()
ax2.set(xlabel="Predicted price", ylabel="Residuals")
ax2.scatter(x=y_hat, y=y_hat - y, color="blue", alpha=0.2)

lm, lm_p_value, fvalue, f_p_value = het_breuschpagan(y - y_hat, fsm_df[["price"]])
print("Lagrange Multiplier p-value:", lm_p_value)
print("F-statistic p-value:", f_p_value)

preds = fsm_results.predict()

def plot_predictions(y_true, y_hat):
    fig, axis = plt.subplots()
    axis.scatter(y_true, y_hat, label='Model Output', alpha=.5, edgecolor='black')
    # 45-degree reference line: perfect predictions fall on y = x
    y_equalsx = np.linspace(0, y_true.max())
    axis.plot(y_equalsx, y_equalsx, color='red', label='y = x')
    axis.legend()
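# A hedged aside on the call above: statsmodels' het_breuschpagan assumes its
# second argument already contains a constant column, so passing a single raw
# regressor such as fsm_df[["price"]] can distort the test. A minimal sketch of
# the safer pattern, reusing the snippet's fsm_df and fsm_results (both assumed
# to exist in scope):
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

exog_het = sm.add_constant(fsm_df[["price"]])  # constant + regressor
lm_c, lm_p_c, f_c, f_p_c = het_breuschpagan(fsm_results.resid, exog_het)
print("LM p-value:", lm_p_c, "F p-value:", f_p_c)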
def quiz4():
    df = pd.read_csv(utils.PATH.COURSE_FILE(4, 'botswana.tsv', 'week3'), sep='\t')
    print(df.head())
    print(df.info())

    # Q1
    religion = df['religion'].value_counts()
    print('---- Q1:', religion)

    # Q2
    df_nonna = df.dropna()
    print('---- Q2:', df_nonna.shape)

    # Q3
    df['nevermarr'] = df['agefm'].apply(lambda x: 0 if x >= 0 else 1)
    df.drop(['evermarr'], axis=1, inplace=True)
    df['agefm'].fillna(0, inplace=True)
    df['heduc'] = df.apply(lambda row: row['heduc'] if row['nevermarr'] == 0 else -1, axis=1)
    heduc_na = df['heduc'].isnull().value_counts()
    print('---- Q3:', heduc_na)

    # Q4
    df['idlnchld_noans'] = df['idlnchld'].apply(lambda x: 0 if x >= 0 else -1)
    df['idlnchld'].fillna(-1, inplace=True)
    df['heduc_noans'] = df['heduc'].apply(lambda x: 0 if x >= 0 else -1)
    df['heduc'].fillna(-1, inplace=True)
    df['usemeth_noans'] = df['usemeth'].apply(lambda x: 0 if x >= 0 else -1)
    df['usemeth'].fillna(-1, inplace=True)
    df.dropna(inplace=True)
    print(df.shape)
    print('---- Q4:', df.shape[0] * df.shape[1])

    # Q5
    model = smf.ols('ceb ~ age + educ + religion + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + radio + tv + bicycle + '
                    'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model.fit()
    print('---- Q5:\n', fitted.summary())

    # Q6
    print('---- Q6: p=%f' % het_breuschpagan(fitted.resid, fitted.model.exog)[1])

    # Q7
    model1 = smf.ols('ceb ~ age + educ + religion + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + radio + tv + bicycle + '
                     'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model1.fit(cov_type='HC1')
    print('---- Q7:\n', fitted.summary())

    # Q8
    model2 = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + usemeth + agefm + heduc + urban + electric + bicycle + '
                     'nevermarr + idlnchld_noans + heduc_noans + usemeth_noans', data=df)
    fitted = model2.fit()
    print('p=%f' % het_breuschpagan(fitted.resid, fitted.model.exog)[1])
    fitted = model2.fit(cov_type='HC1')
    print("---- Q8: F=%f, p=%f, k1=%f" % model1.fit().compare_f_test(model2.fit()))

    # Q9
    model3 = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + agefm + heduc + urban + electric + bicycle + '
                     'nevermarr + idlnchld_noans + heduc_noans', data=df)
    fitted = model3.fit()
    print("---- Q9: F=%f, p=%f, k1=%f" % model2.fit().compare_f_test(model3.fit()))

    # Q10
    model = smf.ols('ceb ~ age + educ + idlnchld + knowmeth + agefm + heduc + urban + electric + bicycle + '
                    'nevermarr + idlnchld_noans + heduc_noans', data=df)
    fitted = model.fit(cov_type='HC1')
    print('---- Q10:\n', fitted.summary())
    return
def breusch_pagan():
    names = ["Lagrange multiplier statistic", "p-value",
             "f-value", "f p-value"]
    test = het_breuschpagan(res.resid, res.model.exog)
    print(dict(zip(names, test)))
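# A self-contained sketch of the helper above, since `res` is undefined in the
# snippet. The data are synthetic and purely illustrative: the error variance
# grows with |x|, so the test should flag heteroscedasticity.
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = 1.0 + 2.0 * x + rng.normal(scale=1 + np.abs(x))  # non-constant variance
res = sm.OLS(y, sm.add_constant(x)).fit()

names = ["Lagrange multiplier statistic", "p-value",
         "f-value", "f p-value"]
print(dict(zip(names, het_breuschpagan(res.resid, res.model.exog))))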
import os

import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

base_path = os.path.dirname(__file__)
base_path = os.path.join(base_path, 'Aula 9')
df = pd.read_stata(os.path.join(base_path, 'HPRICE1-Kayo.dta'))
print(df.head())
# df.corr()

X = df[['lotsize', 'sqrft', 'bdrms']]
y = df['price']
X2 = sm.add_constant(X)

# model = sm.RLM(y, X2, M=sm.robust.norms.HuberT())
model = sm.OLS(y, X2)
results = model.fit(cov_type='HC0')

# Statsmodels gives R-like statistical output
print(results.summary())

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
bp = het_breuschpagan(results.resid, results.model.exog)
bp_df = pd.DataFrame({'statistic': name, 'value': bp})
print(bp_df)
# define input
X2 = sm.add_constant(X)
# create OLS model
model = sm.OLS(Y, X2)
# fit data
est = model.fit()

# test for heteroscedasticity (want p-values over 0.05)
_, pval, _, f_pval = diag.het_white(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)

_, pval, _, f_pval = diag.het_breuschpagan(est.resid, est.model.exog)
print(pval, f_pval)
print('_' * 100)
# p-values greater than 0.05 suggest there is no heteroscedasticity

# test for autocorrelation (want p-values over 0.05)
lag = min(10, (len(X) // 5))
print("number of lags is {}".format(lag))
ibvalue, pval = diag.acorr_ljungbox(est.resid, lags=lag)
print(min(pval))
print('_' * 100)

# check residuals are normal (visual check: points close to the line)
sm.qqplot(est.resid, line='s')
pylab.show()
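# Version note, shown as a sketch: in statsmodels >= 0.12, acorr_ljungbox
# returns a DataFrame rather than the pair of arrays unpacked above, so on
# recent releases the equivalent autocorrelation check is:
lb = diag.acorr_ljungbox(est.resid, lags=lag)   # DataFrame on recent versions
print(lb["lb_pvalue"].min())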
def test_all(self):

    d = macrodata.load_pandas().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv'].values))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'].values))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp'].values),
                               d['realint'][:-1].values])

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1].values])

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)  #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    #     coefficient   std. error   t-ratio   p-value   95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670],    # ***
        [ 4.37040,  0.208146, 21.00,  2.93e-052,   3.95993,  4.78086],   # ***
        [-0.579253, 0.268009, -2.161, 0.0319,     -1.10777, -0.0507346]  # **
        ])

    #Statistics based on the rho-differenced data:
    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var",   3.113973),
        endog_std=("S.D. dependent var",   18.67447),
        ssr=("Sum squared resid",       22530.90),
        mse_resid_sqrt=("S.E. of regression", 10.66735),
        rsquared=("R-squared",           0.676973),
        rsquared_adj=("Adjusted R-squared", 0.673710),
        fvalue=("F(2, 198)",           221.0475),
        f_pvalue=("P-value(F)",          3.56e-51),
        resid_acf1=("rho",              -0.003481),
        dw=("Durbin-Watson",             1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests
    res = res_g1  #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 6)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_allclose(res.f_pvalue, result_gretl_g1['f_pvalue'][1], rtol=1e-2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, nlags=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests
    res = res_g2  #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 3)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2, 4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2, 4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, nlags=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    '''
    Performing iterative calculation of rho...

                     ITER       RHO        ESS
                       1     -0.10734   22530.9
                       2     -0.10814   22530.9

    Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv
    rho = -0.108136

                     coefficient   std. error   t-ratio    p-value
      -------------------------------------------------------------
      const           -9.50990      0.990456    -9.602    3.65e-018 ***
      ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
      realint_1       -0.579253     0.268009    -2.161    0.0319    **

    Statistics based on the rho-differenced data:

    Mean dependent var   3.113973   S.D. dependent var   18.67447
    Sum squared resid    22530.90   S.E. of regression   10.66735
    R-squared            0.676973   Adjusted R-squared   0.673710
    F(2, 198)            221.0475   P-value(F)           3.56e-51
    rho                 -0.003481   Durbin-Watson        1.993858
    '''

    '''
    RESET test for specification (squares and cubes)
    Test statistic: F = 5.219019,
    with p-value = P(F(2,197) > 5.21902) = 0.00619

    RESET test for specification (squares only)
    Test statistic: F = 7.268492,
    with p-value = P(F(1,198) > 7.26849) = 0.00762

    RESET test for specification (cubes only)
    Test statistic: F = 5.248951,
    with p-value = P(F(1,198) > 5.24895) = 0.023
    '''

    '''
    Test for ARCH of order 4

                 coefficient   std. error   t-ratio   p-value
      --------------------------------------------------------
      alpha(0)   97.0386       20.3234       4.775    3.56e-06  ***
      alpha(1)    0.176114      0.0714698    2.464    0.0146    **
      alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
      alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
      alpha(4)    0.0384531     0.0725763    0.5298   0.5968

      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491
    '''

    '''
    Variance Inflation Factors
    Minimum possible value = 1.0
    Values > 10.0 may indicate a collinearity problem

       ds_l_realgdp    1.002
       realint_1       1.002

    VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
    between variable j and the other independent variables

    Properties of matrix X'X:

     1-norm = 6862.0664
     Determinant = 1.0296049e+009
     Reciprocal condition number = 0.013819244
    '''

    '''
    Test for ARCH of order 4 -
      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491

    Test of common factor restriction -
      Null hypothesis: restriction is acceptable
      Test statistic: F(2, 195) = 0.426391
      with p-value = P(F(2, 195) > 0.426391) = 0.653468

    Test for normality of residual -
      Null hypothesis: error is normally distributed
      Test statistic: Chi-square(2) = 20.2792
      with p-value = 3.94837e-005
    '''

    #no idea what this is
    '''
    Augmented regression for common factor test
    OLS, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv

                       coefficient   std. error   t-ratio    p-value
      ---------------------------------------------------------------
      const            -10.9481      1.35807      -8.062    7.44e-014 ***
      ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
      realint_1         -0.662644    0.334872     -1.979    0.0492    **
      ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
      ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
      realint_2          0.0769695   0.341527      0.2254   0.8219

      Sum of squared residuals = 22432.8

    Test of common factor restriction

      Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
    '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient   std. error   t-ratio   p-value   95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    partable = np.array([
        [-9.48167,  1.17709, -8.055, 7.17e-014, -11.8029, -7.16049],     # ***
        [ 4.37422,  0.328787, 13.30, 2.62e-029,   3.72587,  5.02258],    # ***
        [-0.613997, 0.293619, -2.091, 0.0378,    -1.19300, -0.0349939]   # **
        ])

    result_gretl_g1 = dict(
        endog_mean=("Mean dependent var",   3.257395),
        endog_std=("S.D. dependent var",   18.73915),
        ssr=("Sum squared resid",       22799.68),
        mse_resid_sqrt=("S.E. of regression", 10.70380),
        rsquared=("R-squared",           0.676978),
        rsquared_adj=("Adjusted R-squared", 0.673731),
        fvalue=("F(2, 199)",            90.79971),
        f_pvalue=("P-value(F)",          9.53e-29),
        llf=("Log-likelihood",        -763.9752),
        aic=("Akaike criterion",      1533.950),
        bic=("Schwarz criterion",     1543.875),
        hqic=("Hannan-Quinn",         1537.966),
        resid_acf1=("rho",              -0.107341),
        dw=("Durbin-Watson",             2.213805))

    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <= 0).any(1).sum() = 69  ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"]  #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"]  # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]
    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
    het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0: lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0: lambda s: s})

    lev.dtype.names = names

    res = res_ols  #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:, 0], 5)
    assert_almost_equal(bse_hac, partable[:, 1], 5)
    #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4)  #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6)  #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6)  #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    #f-value is based on cov_hac I guess
    #res2 = res.get_robustcov_results(cov_type='HC1')
    # TODO: fvalue differs from Gretl, trying any of the HCx
    #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6, 5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6, 5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, nlags=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
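# Note on the reference values in the test above, as a sketch: gretl reports
# both the original Breusch-Pagan statistic (het_breusch_pagan) and the Koenker
# studentized variant (het_breusch_pagan_konker). statsmodels' het_breuschpagan
# computes the robust Koenker form, which is why the test compares against the
# *_konker values. Recent statsmodels releases expose the choice through a
# `robust` flag (assumption: statsmodels >= 0.13; res and smsdia reused from
# the test above):
bp_koenker = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=True)
bp_original = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)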
diag.het_goldfeldquandt(modelMR2.resid, modelMR2.model.exog)
diag.het_breuschpagan(modelMR4.resid, modelMR4.model.exog)
diag.het_white(modelMR2.resid, modelMR2.model.exog)
diag.acorr_ljungbox(modelMR2.resid)
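# The bare calls above discard their return values; a labeled sketch instead,
# assuming modelMR2 is a fitted OLS result as in the snippet:
gq_f, gq_p, gq_order = diag.het_goldfeldquandt(modelMR2.resid, modelMR2.model.exog)
bp_lm, bp_lm_p, bp_f, bp_f_p = diag.het_breuschpagan(modelMR2.resid, modelMR2.model.exog)
print("Goldfeld-Quandt p=%.4f, Breusch-Pagan LM p=%.4f" % (gq_p, bp_lm_p))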
import statsmodels.stats.diagnostic as smd

def breusch_pagan_test(dataset, regression_dict, results_dict):
    # regression_dict[key][1:] holds the explanatory-variable names; note that
    # het_breuschpagan expects its exog argument to contain a constant.
    for key in regression_dict.keys():
        BP_statistic = smd.het_breuschpagan(results_dict[key].resid,
                                            dataset[regression_dict[key][1:]])
        print("{}\nBP:\t{:.2f}\nP-Val:\t{:.4f}".format(key, BP_statistic[0],
                                                       BP_statistic[1]))
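# Usage sketch for breusch_pagan_test with hypothetical inputs: each value in
# regression_dict is a column list whose first entry is the response, and
# results_dict maps the same keys to fitted OLS results. All names and data
# here are illustrative.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(1)
data = pd.DataFrame({"y": rng.normal(size=100),
                     "x1": rng.normal(size=100),
                     "x2": rng.normal(size=100)})
regression_dict = {"model_a": ["y", "x1", "x2"]}
results_dict = {"model_a": sm.OLS(data["y"],
                                  sm.add_constant(data[["x1", "x2"]])).fit()}
breusch_pagan_test(data, regression_dict, results_dict)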
# Fit Random Forest Regressor according to best hyperparameters
regressor = RandomForestRegressor(random_state=0, n_estimators=500, max_depth=3,
                                  max_features=4, min_samples_leaf=1,
                                  bootstrap=True)
regressor.fit(X, y)

# Obtain R-squared and adjusted R-squared
r2 = regressor.score(X, y)
n = len(dataset['demand_Maa'])  # number of observations
k = X.shape[1]                  # number of predictors
RAdjusted = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

# Apply 10-fold cross validation
maescores = cross_val_score(regressor, X, y,
                            scoring='neg_mean_absolute_error', cv=10, n_jobs=-1)

# Obtain AIC and BIC values (Gaussian, SSE-based forms)
y_hat = regressor.predict(X)
residuals = y - y_hat
bp = het_breuschpagan(residuals, X)
sse = sum(residuals**2)
AIC = n * np.log(sse / n) + 2 * k
BIC = n * np.log(sse / n) + k * np.log(n)

# Print scores
#print("The Mean Absolute Error of the Random Forest Regressor is " + str(abs(np.mean(maescores))))
print("The R squared of the Random Forest Regressor is " + str(r2))
print("The adjusted R squared of the Random Forest Regressor is " + str(RAdjusted))
print("The AIC score of the Random Forest Regressor is " + str(AIC))
print("The BIC score of the Random Forest Regressor is " + str(BIC))
print("The LM p-value from the Breusch-Pagan test is " + str(bp[1]))

# Plot feature importance
feature_import = pd.DataFrame(data=regressor.feature_importances_,
                              index=['own_price', 'compe_price', 'inc_per_capita',
                                     'pro_exp', 'Annual_Prod_MSeeds'],
                              columns=['values'])
feature_import.sort_values(['values'], ascending=False, inplace=True)
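# A hedged aside: het_breuschpagan was written for linear-model residuals and
# expects a constant column in its exog argument, which the raw feature matrix
# X above lacks. The usual safeguard, reusing X and residuals from the block
# above:
import statsmodels.api as sm
bp_lm, bp_lm_p, bp_f, bp_f_p = het_breuschpagan(residuals, sm.add_constant(X))
print("Breusch-Pagan LM p-value with a constant included:", bp_lm_p)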
    variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
vif

# <h3>3. Homoscedasticity</h3>
#
# When residuals do not have constant variance (they exhibit heteroscedasticity), it is difficult to determine the true standard deviation of the forecast errors, usually resulting in confidence intervals that are too wide or too narrow. For example, if the variance of the residuals is increasing over time, confidence intervals for out-of-sample predictions will be unrealistically narrow.

# In[49]:

from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import het_white

# Breusch-Pagan test (.resids / .fitted_values are linearmodels-style
# accessors; the statsmodels equivalents are .resid / .fittedvalues)
bp_test = het_breuschpagan(res.resids, df[independent_vars].dropna())
labels = ['BP Statistic', 'BP-Test p-value', 'F-Statistic', 'F-Test p-value']
print(pd.Series(dict(zip(labels, bp_test))))

# In[50]:

# visualization of heteroscedasticity
fitted = res.fitted_values
residuals = res.resids
plt.figure(figsize=(12, 8))
sns.regplot(x=fitted, y=residuals)
plt.xlabel('Fitted Values')
plt.xlim([4, 6])
plt.show()

# <center><h2>WEIGHTED REGRESSION</h2></center>
y_pred1 = result1.predict(X1)
err1 = y1 - y_pred1

plt.scatter(y1, y_pred1)
plt.plot(xx, xx, color='k')
plt.ylabel('Predicted')
plt.xlabel('Real')
## Need to find another model: when the price is higher, this one seems to under-estimate

plt.hist(err1, bins=50)
probplot(err1, plot=plt)

## het_breuschpagan returns the LM and F statistics with their p-values
diagnostic.het_breuschpagan(err1, X1)
diagnostic.het_breuschpagan(err1, X1[['bedrooms', 'bathrooms']])

## to delete outliers
cond = (house['price'] < 1000000) & (house['price'] >= 20000)
X2 = house[cond][varlist]
y2 = house[cond]['price']
X2 = sm.add_constant(X2)
y2.plot.kde()

model2 = sm.OLS(y2, X2)
result2 = model2.fit()
result2.summary()
y_pred2 = result2.predict(X2)
#plt.title('Stadium Percentage - Reg(p-value) - Normal Q-Q')
plt.savefig(savepath + 'QQPlot.jpeg')
plt.show()

_, p = shapiro(y_train_res2)
print(p)
if p > 0.05:
    x = "The residuals seem to come from a Gaussian process"
else:
    x = "The normality assumption may not hold"

pvalue = pd.DataFrame(['Stadium Percentage Regression (p-value)', p, x]).T
pvalue = pvalue.rename(columns={0: 'Model', 1: 'p-value', 2: 'Interpretation'})
savedfile = pvalue.to_csv('C:/Users/bcheasty/OneDrive - Athlone Institute Of Technology/Research Project/Data Set Creation/Data/Model/pvalue7.csv', index=False)

# Breusch-Pagan test
bp_test = het_breuschpagan(y_train_res, X_train2)
labels = ['LM Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
bp = dict(zip(labels, bp_test))
bp = pd.DataFrame(bp, index=[0])
bp['Model'] = 'Stadium Percentage - Reg(p-value)'
bp.loc[(bp['F-Test p-value'] < 0.05), 'Interpretation'] = 'The residuals seem to be heteroskedastic'
bp.loc[(bp['F-Test p-value'] > 0.05), 'Interpretation'] = 'The residuals seem to be homoskedastic'
bp = bp[['Model', 'F-Test p-value', 'Interpretation']]
savedfile = bp.to_csv('C:/Users/bcheasty/OneDrive - Athlone Institute Of Technology/Research Project/Data Set Creation/Data/Model/bp7.csv', index=False)

# Scale-Location plot
plot_lm_3 = plt.figure()
plt.scatter(y_train_pred, y_train_res, alpha=0.5, c='steelblue', marker='o',
            edgecolor='white', label='Training data')
sns.regplot(x=y_train_pred, y=y_train_res, scatter=False, lowess=True)  # trend line (assumed arguments)
# Feel free to adjust the chart size
plt.figure(figsize=(15, 7.5))
plt.plot(result['2016-7':'2016-9'].index, result.loc['2016-7':'2016-9', 'simple regression'])
plt.plot(result['2016-7':'2016-9'].index, result.loc['2016-7':'2016-9', 'fama_french'])
plt.plot(result['2016-7':'2016-9'].index, result.loc['2016-7':'2016-9', 'sample'])
plt.legend()
plt.show()

# In[22]:

plt.figure()
fama_model.resid.plot.density()
plt.show()

# In[23]:

print('Residual mean:', np.mean(fama_model.resid))

# In[ ]:

from statsmodels.stats import diagnostic as dia
het = dia.het_breuschpagan(fama_model.resid, fama_df[['MKT', 'SMB', 'HML', 'RMW', 'CMA']][1:])
print('F-test p-value: ', het[-1])
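# In[ ]:

# A labeled variant of the printout above, since het[-1] is specifically the
# F-test p-value; het is reused from the cell above.
labels = ['LM statistic', 'LM p-value', 'F statistic', 'F p-value']
print(dict(zip(labels, het)))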
plt.plot(RESID[k])
plt.ylabel('Residual amplitude', fontsize=13)
plt.xlabel('Week', fontsize=13)

import statsmodels.api as sm
sm.graphics.tsa.plot_acf(RESID[k], lags=25)

# mean absolute residual
np.array(abs(RESID[k])).sum() / len(RESID[k])

from statsmodels.stats.diagnostic import het_breuschpagan, het_white

df_resid = pd.DataFrame(RESID[k], columns=['resid'])
X = np.concatenate((np.ones(30).reshape(-1, 1), x[-30:, :]), axis=1)
white = het_white(list(df_resid.resid), X)
breuschpagan = het_breuschpagan(list(df_resid.resid), X)

import pandas as pd
from statsmodels.formula.api import ols

statecrime_df = sm.datasets.statecrime.load_pandas().data
f = 'violent~hs_grad+poverty+single+urban'
statecrime_model = ols(formula=f, data=statecrime_df).fit()
white_test = het_white(np.array(statecrime_model.resid)[-30:], x[-30:, :])
np.array(statecrime_model.model.exog)[-30:, :]
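# Sketch labeling the White test output above: het_white returns the same
# 4-tuple layout as het_breuschpagan (white_test reused from the snippet).
labels = ["LM statistic", "LM p-value", "F statistic", "F p-value"]
print(dict(zip(labels, white_test)))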
    IndVariable, IndVariable2, IndVariable3, IndVariable4, IndVariable5]],
    DataExcel['demand_Maa'])))
print("Average demand: " + str(np.mean(DataExcel['demand_Maa'])))
print("Percentage MSE: " + str(np.mean(Scores) / np.mean(DataExcel['demand_Maa'])))
print("-----------------------------------------------")
print(SecondRegressor.feature_importances_ * 100)
print("-----------------------------------------------")

r2 = r2_score(y_true=y_actual, y_pred=y_pred)
print("The R squared is: " + str(r2))
AIC, BIC = AICCalc(y_actual, y_pred, 5)
print("The AIC is: " + str(AIC))
print("The BIC is: " + str(BIC))
print("-----------------------------------------------")

# het_breuschpagan returns (LM statistic, LM p-value, F statistic, F p-value)
lm, lm_pvalue, fvalue, f_pvalue = het_breuschpagan(
    Residuals, DataExcel[[
        IndVariable, IndVariable2, IndVariable3, IndVariable4, IndVariable5]])
print("P-value of the test (F form): " + str(f_pvalue))
print("-----------------------------------------------")

# Computing the adjusted R squared
RAdjusted = 1 - ((1 - r2) * (len(DataExcel['demand_Maa']) - 1) /
                 (len(DataExcel['demand_Maa']) - 5 - 1))
print("The adjusted R squared is: " + str(RAdjusted))


def AICCalc(Y_Actual, Y_Predicted, N_Variables):
    # Gaussian, SSE-based information criteria
    res = Y_Actual - Y_Predicted
    SSE = sum(res**2)
    n = len(Y_Actual)
    AIC = n * math.log(SSE / n) + 2 * N_Variables
    BIC = n * math.log(SSE / n) + N_Variables * math.log(n)
    return AIC, BIC
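# Worked check of AICCalc on toy numbers; the values are purely illustrative.
import math
import numpy as np

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_fit = np.array([1.1, 1.9, 3.2, 3.8])
toy_aic, toy_bic = AICCalc(y_true, y_fit, N_Variables=2)
print("toy AIC:", toy_aic, "toy BIC:", toy_bic)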