def test_all(self):
    """Compare GLSAR and OLS+HAC results and diagnostics against Gretl.

    All reference numbers below are hard-coded from Gretl output; the raw
    Gretl listings they were transcribed from are preserved verbatim in the
    no-op triple-quoted strings further down.  Three result sets are checked:

    1. GLSAR with ``rho`` fixed at Gretl's converged Cochrane-Orcutt value,
    2. GLSAR with ``rho`` re-estimated via ``iterative_fit``,
    3. plain OLS with HAC (Bartlett kernel, bandwidth 4) standard errors,
       plus specification, heteroscedasticity, ARCH and influence diagnostics.

    Assertions commented out with #FAIL or #TODO record known disagreements
    with Gretl and are kept deliberately as documentation.
    """
    # Quarterly US macro data; 400 * diff(log(.)) gives annualized percent
    # growth rates.
    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]])

    # differencing drops one observation, so realint is lagged/truncated to
    # keep the arrays aligned
    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]])

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    # GLSAR with rho fixed at Gretl's Cochrane-Orcutt estimate
    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    # GLSAR with rho re-estimated iteratively (same starting value)
    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136  # Gretl's converged Cochrane-Orcutt rho

    # coefficient   std. error   t-ratio   p-value   95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
        [ 4.37040,  0.208146, 21.00,  2.93e-052, 3.95993, 4.78086],   # ***
        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) # **

    #Statistics based on the rho-differenced data:
    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",    221.0475),
        f_pvalue = ("P-value(F)",   3.56e-51),
        resid_acf1 = ("rho",   -0.003481),
        dw = ("Durbin-Watson",   1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]

    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests -- first result set: GLSAR with rho fixed at the Gretl value
    res = res_g1  #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 6)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests -- second result set: GLSAR with rho estimated by iterative_fit
    res = res_g2 #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic  (lower precision than above: rho only agrees to ~3 decimals)
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 3)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    # RESET specification tests against Gretl reference values
    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2,4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2,4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    # The following no-op strings are the raw Gretl listings the reference
    # numbers above were transcribed from; kept for provenance.
    '''
    Performing iterative calculation of rho...

                     ITER       RHO        ESS
                       1     -0.10734   22530.9
                       2     -0.10814   22530.9

    Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv
    rho = -0.108136

                     coefficient   std. error   t-ratio    p-value
      -------------------------------------------------------------
      const           -9.50990      0.990456    -9.602    3.65e-018 ***
      ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
      realint_1       -0.579253     0.268009    -2.161    0.0319    **

    Statistics based on the rho-differenced data:

    Mean dependent var   3.113973   S.D. dependent var   18.67447
    Sum squared resid    22530.90   S.E. of regression   10.66735
    R-squared            0.676973   Adjusted R-squared   0.673710
    F(2, 198)            221.0475   P-value(F)           3.56e-51
    rho                 -0.003481   Durbin-Watson        1.993858
    '''

    '''
    RESET test for specification (squares and cubes)
    Test statistic: F = 5.219019,
    with p-value = P(F(2,197) > 5.21902) = 0.00619

    RESET test for specification (squares only)
    Test statistic: F = 7.268492,
    with p-value = P(F(1,198) > 7.26849) = 0.00762

    RESET test for specification (cubes only)
    Test statistic: F = 5.248951,
    with p-value = P(F(1,198) > 5.24895) = 0.023:
    '''

    '''
    Test for ARCH of order 4

                 coefficient   std. error   t-ratio   p-value
      --------------------------------------------------------
      alpha(0)   97.0386       20.3234       4.775    3.56e-06  ***
      alpha(1)    0.176114      0.0714698    2.464    0.0146    **
      alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
      alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
      alpha(4)    0.0384531     0.0725763    0.5298   0.5968

      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
    '''

    '''
    Variance Inflation Factors

    Minimum possible value = 1.0
    Values > 10.0 may indicate a collinearity problem

       ds_l_realgdp    1.002
          realint_1    1.002

    VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
    between variable j and the other independent variables

    Properties of matrix X'X:

     1-norm = 6862.0664
     Determinant = 1.0296049e+009
     Reciprocal condition number = 0.013819244
    '''

    '''
    Test for ARCH of order 4 -
      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491

    Test of common factor restriction -
      Null hypothesis: restriction is acceptable
      Test statistic: F(2, 195) = 0.426391
      with p-value = P(F(2, 195) > 0.426391) = 0.653468

    Test for normality of residual -
      Null hypothesis: error is normally distributed
      Test statistic: Chi-square(2) = 20.2792
      with p-value = 3.94837e-005:
    '''

    #no idea what this is
    '''
    Augmented regression for common factor test
    OLS, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
      ---------------------------------------------------------------
      const                -10.9481      1.35807      -8.062    7.44e-014 ***
      ds_l_realgdp           4.28893     0.229459     18.69     2.40e-045 ***
      realint_1             -0.662644    0.334872     -1.979    0.0492    **
      ds_l_realinv_1        -0.108892    0.0715042    -1.523    0.1294
      ds_l_realgdp_1         0.660443    0.390372      1.692    0.0923    *
      realint_2              0.0769695   0.341527      0.2254   0.8219

      Sum of squared residuals = 22432.8

    Test of common factor restriction

      Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
    '''

    ################ with OLS, HAC errors
    # Third result set: plain OLS, compared against Gretl's HAC output.
    # partable, result_gretl_g1, arch_4 and the reset_* lists are rebound
    # to the new reference values from here on.

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    partable = np.array([
        [-9.48167,   1.17709,  -8.055, 7.17e-014, -11.8029, -7.16049],  # ***
        [4.37422,    0.328787, 13.30,  2.62e-029, 3.72587, 5.02258],    #***
        [-0.613997,  0.293619, -2.091, 0.0378, -1.19300, -0.0349939]])  # **

    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.257395),
        endog_std = ("S.D. dependent var",   18.73915),
        ssr = ("Sum squared resid",    22799.68),
        mse_resid_sqrt = ("S.E. of regression",   10.70380),
        rsquared = ("R-squared",            0.676978),
        rsquared_adj = ("Adjusted R-squared",   0.673731),
        fvalue = ("F(2, 199)",    90.79971),
        f_pvalue = ("P-value(F)",   9.53e-29),
        llf = ("Log-likelihood",   -763.9752),
        aic = ("Akaike criterion",  1533.950),
        bic = ("Schwarz criterion", 1543.875),
        hqic = ("Hannan-Quinn",     1537.966),
        resid_acf1 = ("rho",   -0.107341),
        dw = ("Durbin-Watson",   2.213805))

    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]

    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
    het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]

    #not available
    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    # Load the leverage/influence reference table exported from Gretl.
    # The identity converter keeps column 0 (the date label) unparsed.
    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0:lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    # NOTE(review): 'f1' is genfromtxt's auto-generated name for the second
    # column; a nan in the last row means a footer line was parsed as data,
    # so re-read with one more footer line skipped.
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0:lambda s: s})

    lev.dtype.names = names

    res = res_ols #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:,0], 5)
    assert_almost_equal(bse_hac, partable[:,1], 5) #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4) #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)

    #f-value is based on cov_hac I guess
    #res2 = res.get_robustcov_results(cov_type='HC1')
    # TODO: fvalue differs from Gretl, trying any of the HCx
    #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6,5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6,5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    # columns 1 and 2 (the slope regressors; 0 is the constant)
    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence))

    #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def Fig_OLS_Checks(): #fs = 10 # font size used across figures #color = str() #OrC = 'open' SampSizes = [5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100] Iterations = 100 fig = plt.figure(figsize=(12, 8)) # MODEL PARAMETERS Rare_MacIntercept_pVals = [] # List to hold coefficient p-values Rare_MacIntercept_Coeffs = [] # List to hold coefficients Rich_MacIntercept_pVals = [] Rich_MacIntercept_Coeffs = [] Dom_MacIntercept_pVals = [] Dom_MacIntercept_Coeffs = [] Even_MacIntercept_pVals = [] Even_MacIntercept_Coeffs = [] Rare_MicIntercept_pVals = [] Rare_MicIntercept_Coeffs = [] Rich_MicIntercept_pVals = [] Rich_MicIntercept_Coeffs = [] Dom_MicIntercept_pVals = [] Dom_MicIntercept_Coeffs = [] Even_MicIntercept_pVals = [] Even_MicIntercept_Coeffs = [] Rare_MacSlope_pVals = [] Rare_MacSlope_Coeffs = [] Rich_MacSlope_pVals = [] Rich_MacSlope_Coeffs = [] Dom_MacSlope_pVals = [] Dom_MacSlope_Coeffs = [] Even_MacSlope_pVals = [] Even_MacSlope_Coeffs = [] Rare_MicSlope_pVals = [] Rare_MicSlope_Coeffs = [] Rich_MicSlope_pVals = [] Rich_MicSlope_Coeffs = [] Dom_MicSlope_pVals = [] Dom_MicSlope_Coeffs = [] Even_MicSlope_pVals = [] Even_MicSlope_Coeffs = [] RareR2List = [] # List to hold model R2 RarepFList = [] # List to hold significance of model R2 RichR2List = [] # List to hold model R2 RichpFList = [] # List to hold significance of model R2 DomR2List = [] # List to hold model R2 DompFList = [] # List to hold significance of model R2 EvenR2List = [] # List to hold model R2 EvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. 
The relationship is linear #RarepLinListHC = [] RarepLinListRainB = [] RarepLinListLM = [] #RichpLinListHC = [] RichpLinListRainB = [] RichpLinListLM = [] #DompLinListHC = [] DompLinListRainB = [] DompLinListLM = [] #EvenpLinListHC = [] EvenpLinListRainB = [] EvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG = [] RarepCorrListF = [] RichpCorrListBG = [] RichpCorrListF = [] DompCorrListBG = [] DompCorrListF = [] EvenpCorrListBG = [] EvenpCorrListF = [] # 6. Homoscedacticity RarepHomoHW = [] RarepHomoHB = [] RichpHomoHW = [] RichpHomoHB = [] DompHomoHW = [] DompHomoHB = [] EvenpHomoHW = [] EvenpHomoHB = [] # 7. Normally distributed residuals (errors) RarepNormListOmni = [] # Omnibus test for normality RarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality RarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance RichpNormListOmni = [] # Omnibus test for normality RichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality RichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance DompNormListOmni = [] # Omnibus test for normality DompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality DompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance DompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance EvenpNormListOmni = [] # Omnibus test for normality EvenpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality 
EvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance EvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance NLIST = [] for SampSize in SampSizes: sRare_MacIntercept_pVals = [] # List to hold coefficient p-values sRare_MacIntercept_Coeffs = [] # List to hold coefficients sRich_MacIntercept_pVals = [] # List to hold coefficient p-values sRich_MacIntercept_Coeffs = [] # List to hold coefficients sDom_MacIntercept_pVals = [] sDom_MacIntercept_Coeffs = [] sEven_MacIntercept_pVals = [] sEven_MacIntercept_Coeffs = [] sRare_MicIntercept_pVals = [] sRare_MicIntercept_Coeffs = [] sRich_MicIntercept_pVals = [] sRich_MicIntercept_Coeffs = [] sDom_MicIntercept_pVals = [] sDom_MicIntercept_Coeffs = [] sEven_MicIntercept_pVals = [] sEven_MicIntercept_Coeffs = [] sRare_MacSlope_pVals = [] sRare_MacSlope_Coeffs = [] sRich_MacSlope_pVals = [] sRich_MacSlope_Coeffs = [] sDom_MacSlope_pVals = [] sDom_MacSlope_Coeffs = [] sEven_MacSlope_pVals = [] sEven_MacSlope_Coeffs = [] sRare_MicSlope_pVals = [] sRare_MicSlope_Coeffs = [] sRich_MicSlope_pVals = [] sRich_MicSlope_Coeffs = [] sDom_MicSlope_pVals = [] sDom_MicSlope_Coeffs = [] sEven_MicSlope_pVals = [] sEven_MicSlope_Coeffs = [] sRareR2List = [] # List to hold model R2 sRarepFList = [] # List to hold significance of model R2 sRichR2List = [] # List to hold model R2 sRichpFList = [] # List to hold significance of model R2 sDomR2List = [] # List to hold model R2 sDompFList = [] # List to hold significance of model R2 sEvenR2List = [] # List to hold model R2 sEvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. 
The relationship is linear #sRarepLinListHC = [] sRarepLinListRainB = [] sRarepLinListLM = [] #sRichpLinListHC = [] sRichpLinListRainB = [] sRichpLinListLM = [] #sDompLinListHC = [] sDompLinListRainB = [] sDompLinListLM = [] #sEvenpLinListHC = [] sEvenpLinListRainB = [] sEvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) sRarepCorrListBG = [] sRarepCorrListF = [] sRichpCorrListBG = [] sRichpCorrListF = [] sDompCorrListBG = [] sDompCorrListF = [] sEvenpCorrListBG = [] sEvenpCorrListF = [] # 6. Homoscedacticity sRarepHomoHW = [] sRarepHomoHB = [] sRichpHomoHW = [] sRichpHomoHB = [] sDompHomoHW = [] sDompHomoHB = [] sEvenpHomoHW = [] sEvenpHomoHB = [] # 7. Normally distributed residuals (errors) sRarepNormListOmni = [] # Omnibus test for normality sRarepNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sRarepNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRarepNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sRichpNormListOmni = [] # Omnibus test for normality sRichpNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sRichpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRichpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sDompNormListOmni = [] # Omnibus test for normality sDompNormListJB = [] # Calculate residual skewness, kurtosis, and do the JB test for normality sDompNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sDompNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance sEvenpNormListOmni = [] # Omnibus test for normality sEvenpNormListJB = [] # Calculate residual skewness, kurtosis, and 
do the JB test for normality sEvenpNormListKS = [] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sEvenpNormListAD = [] # Anderson-Darling test for normal distribution unknown mean and variance for iteration in range(Iterations): Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [[], [], [], [], [], [], []] klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [[], [], [], [], [], [], []] NmaxList, rareSkews, KindList = [[], [], []] NSlist = [] ct = 0 radDATA = [] datasets = [] GoodNames = ['EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA'] # all microbe data is MGRAST mlist = ['micro', 'macro'] for m in mlist: for name in os.listdir(mydir +'data/'+m): if name in GoodNames: pass else: continue path = mydir+'data/'+m+'/'+name+'/'+name+'-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, m, num_lines]) numMac = 0 numMic = 0 radDATA = [] for d in datasets: name, kind, numlines = d lines = [] lines = np.random.choice(range(1, numlines+1), SampSize, replace=True) path = mydir+'data/'+kind+'/'+name+'/'+name+'-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) #print name, kind, numlines, len(radDATA) for data in radDATA: data = data.split() if len(data) == 0: print 'no data' continue name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) Nlist.append(float(np.log(N))) Slist.append(float(np.log(S))) NSlist.append(float(np.log(N/S))) Evarlist.append(float(np.log(float(Evar)))) ESimplist.append(float(np.log(float(ESimp)))) KindList.append(kind) BPlist.append(float(BP)) NmaxList.append(float(np.log(float(BP)*float(N)))) EHeiplist.append(float(EHeip)) # lines for the 
log-modulo transformation of skewnness skew = float(skew) sign = 1 if skew < 0: sign = -1 lms = np.log(np.abs(skew) + 1) lms = lms * sign #if lms > 3: print name, N, S rareSkews.append(float(lms)) if kind == 'macro': numMac += 1 elif kind == 'micro': numMic += 1 ct+=1 #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Rarity'] = list(rareSkews) d['Kind'] = list(KindList) RarityResults = smf.ols('Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Richness'] = list(Slist) d['Kind'] = list(KindList) RichnessResults = smf.ols('Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RichnessResults.summary(), '\n' # Multiple regression for Dominance d = pd.DataFrame({'N': list(Nlist)}) d['Dominance'] = list(NmaxList) d['Kind'] = list(KindList) DomResults = smf.ols('Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model #print DomResults.summary(), '\n' # Multiple regression for Evenness d = pd.DataFrame({'N': list(Nlist)}) d['Evenness'] = list(ESimplist) d['Kind'] = list(KindList) EvenResults = smf.ols('Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' RareResids = RarityResults.resid # residuals of the model RichResids = RichnessResults.resid # residuals of the model DomResids = DomResults.resid # residuals of the model EvenResids = EvenResults.resid # residuals of the model # MODEL RESULTS/FIT RareFpval = RarityResults.f_pvalue Rarer2 = RarityResults.rsquared # coefficient of determination #Adj_r2 = RareResults.rsquared_adj # adjusted RichFpval = RichnessResults.f_pvalue Richr2 = RichnessResults.rsquared # coefficient of determination #Adj_r2 = RichnessResults.rsquared_adj # adjusted DomFpval = DomResults.f_pvalue Domr2 = DomResults.rsquared # coefficient 
of determination #Adj_r2 = DomResults.rsquared_adj # adjusted EvenFpval = EvenResults.f_pvalue Evenr2 = EvenResults.rsquared # coefficient of determination #Adj_r2 = EvenResuls.rsquared_adj # adjusted # MODEL PARAMETERS and p-values Rareparams = RarityResults.params Rareparams = Rareparams.tolist() Rarepvals = RarityResults.pvalues Rarepvals = Rarepvals.tolist() Richparams = RichnessResults.params Richparams = Richparams.tolist() Richpvals = RichnessResults.pvalues Richpvals = Richpvals.tolist() Domparams = DomResults.params Domparams = Domparams.tolist() Dompvals = DomResults.pvalues Dompvals = Dompvals.tolist() Evenparams = EvenResults.params Evenparams = Evenparams.tolist() Evenpvals = EvenResults.pvalues Evenpvals = Evenpvals.tolist() sRare_MacIntercept_pVals.append(Rarepvals[0]) sRare_MacIntercept_Coeffs.append(Rareparams[0]) sRich_MacIntercept_pVals.append(Rarepvals[0]) sRich_MacIntercept_Coeffs.append(Rareparams[0]) sDom_MacIntercept_pVals.append(Dompvals[0]) sDom_MacIntercept_Coeffs.append(Domparams[0]) sEven_MacIntercept_pVals.append(Evenpvals[0]) sEven_MacIntercept_Coeffs.append(Evenparams[0]) sRare_MicIntercept_pVals.append(Rarepvals[1]) if Rarepvals[1] > 0.05: sRare_MicIntercept_Coeffs.append(Rareparams[1]) else: sRare_MicIntercept_Coeffs.append(Rareparams[1]) sRich_MicIntercept_pVals.append(Richpvals[1]) if Richpvals[1] > 0.05: sRich_MicIntercept_Coeffs.append(Richparams[1]) else: sRich_MicIntercept_Coeffs.append(Richparams[1]) sDom_MicIntercept_pVals.append(Dompvals[1]) if Dompvals[1] > 0.05: sDom_MicIntercept_Coeffs.append(Domparams[1]) else: sDom_MicIntercept_Coeffs.append(Domparams[1]) sEven_MicIntercept_pVals.append(Evenpvals[1]) if Evenpvals[1] > 0.05: sEven_MicIntercept_Coeffs.append(Evenparams[1]) else: sEven_MicIntercept_Coeffs.append(Evenparams[1]) sRare_MacSlope_pVals.append(Rarepvals[2]) sRare_MacSlope_Coeffs.append(Rareparams[2]) sRich_MacSlope_pVals.append(Richpvals[2]) sRich_MacSlope_Coeffs.append(Richparams[2]) 
sDom_MacSlope_pVals.append(Dompvals[2]) sDom_MacSlope_Coeffs.append(Domparams[2]) sEven_MacSlope_pVals.append(Evenpvals[2]) sEven_MacSlope_Coeffs.append(Evenparams[2]) sRare_MicSlope_pVals.append(Rarepvals[3]) if Rarepvals[3] > 0.05: sRare_MicSlope_Coeffs.append(Rareparams[3]) else: sRare_MicSlope_Coeffs.append(Rareparams[3]) sRich_MicSlope_pVals.append(Richpvals[3]) if Richpvals[3] > 0.05: sRich_MicSlope_Coeffs.append(Richparams[3]) else: sRich_MicSlope_Coeffs.append(Richparams[3]) sDom_MicSlope_pVals.append(Dompvals[3]) if Dompvals[3] > 0.05: sDom_MicSlope_Coeffs.append(Domparams[3]) else: sDom_MicSlope_Coeffs.append(Domparams[3]) sEven_MicSlope_pVals.append(Evenpvals[3]) if Evenpvals[3] > 0.05: sEven_MicSlope_Coeffs.append(Evenparams[3]) else: sEven_MicSlope_Coeffs.append(Evenparams[3]) sRareR2List.append(Rarer2) sRarepFList.append(RareFpval) sRichR2List.append(Richr2) sRichpFList.append(RichFpval) sDomR2List.append(Domr2) sDompFList.append(DomFpval) sEvenR2List.append(Evenr2) sEvenpFList.append(EvenFpval) # TESTS OF LINEAR REGRESSION ASSUMPTIONS # Error in predictor variables is negligible...Presumably Yes # Variables are measured at the continuous level...Definitely Yes # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sRarepLinListHC.append(HC) #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sDompLinListHC.append(HC) #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sEvenpLinListHC.append(HC) RB = smd.linear_rainbow(RarityResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. 
sRarepLinListRainB.append(RB[1]) RB = smd.linear_rainbow(RichnessResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRichpLinListRainB.append(RB[1]) RB = smd.linear_rainbow(DomResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sDompLinListRainB.append(RB[1]) RB = smd.linear_rainbow(EvenResults) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sEvenpLinListRainB.append(RB[1]) LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog) # Lagrangian multiplier test for linearity sRarepLinListLM.append(LM[1]) LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog) # Lagrangian multiplier test for linearity sRichpLinListLM.append(LM[1]) LM = smd.linear_lm(DomResults.resid, DomResults.model.exog) # Lagrangian multiplier test for linearity sDompLinListLM.append(LM[1]) LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog) # Lagrangian multiplier test for linearity sEvenpLinListLM.append(LM[1]) # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals) BGtest = smd.acorr_breush_godfrey(RarityResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True) sRarepCorrListBG.append(BGtest[1]) sRarepCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey(RichnessResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RichResids, lags=None, boxpierce=True) sRichpCorrListBG.append(BGtest[1]) sRichpCorrListF.append(BGtest[3]) BGtest = 
smd.acorr_breush_godfrey(DomResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True) sDompCorrListBG.append(BGtest[1]) sDompCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey(EvenResults, nlags=None, store=False) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True) sEvenpCorrListBG.append(BGtest[1]) sEvenpCorrListF.append(BGtest[3]) # There are no significant outliers...Need tests or measures/metrics # HOMOSCEDASTICITY # These tests return: # 1. lagrange multiplier statistic, # 2. p-value of lagrange multiplier test, # 3. f-statistic of the hypothesis that the error variance does not depend on x, # 4. p-value for the f-statistic HW = sms.het_white(RareResids, RarityResults.model.exog) sRarepHomoHW.append(HW[3]) HW = sms.het_white(RichResids, RichnessResults.model.exog) sRichpHomoHW.append(HW[3]) HW = sms.het_white(DomResids, DomResults.model.exog) sDompHomoHW.append(HW[3]) HW = sms.het_white(EvenResids, EvenResults.model.exog) sEvenpHomoHW.append(HW[3]) HB = sms.het_breushpagan(RareResids, RarityResults.model.exog) sRarepHomoHB.append(HB[3]) HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog) sRichpHomoHB.append(HB[3]) HB = sms.het_breushpagan(DomResids, DomResults.model.exog) sDompHomoHB.append(HB[3]) HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog) sEvenpHomoHB.append(HB[3]) # 7. 
NORMALITY OF ERROR TERMS O = sms.omni_normtest(RareResids) sRarepNormListOmni.append(O[1]) O = sms.omni_normtest(RichResids) sRichpNormListOmni.append(O[1]) O = sms.omni_normtest(DomResids) sDompNormListOmni.append(O[1]) O = sms.omni_normtest(EvenResids) sEvenpNormListOmni.append(O[1]) JB = sms.jarque_bera(RareResids) sRarepNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(RichResids) sRichpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(DomResids) sDompNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(EvenResids) sEvenpNormListJB.append(JB[1]) # Calculate residual skewness, kurtosis, and do the JB test for normality KS = smd.kstest_normal(RareResids) sRarepNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(RichResids) sRichpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(DomResids) sDompNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(EvenResids) sEvenpNormListKS.append(KS[1]) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance AD = smd.normal_ad(RareResids) sRarepNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(RichResids) sRichpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(DomResids) sDompNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(EvenResids) sEvenpNormListAD.append(AD[1]) # Anderson-Darling test for normal distribution unknown mean and variance 
print 'Sample size:',SampSize, 'iteration:',iteration NLIST.append(SampSize) Rare_MacIntercept_pVals.append(np.mean(sRare_MacIntercept_pVals)) # List to hold coefficient p-values Rare_MacIntercept_Coeffs.append(np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients Rich_MacIntercept_pVals.append(np.mean(sRich_MacIntercept_pVals)) # List to hold coefficient p-values Rich_MacIntercept_Coeffs.append(np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals)) Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs)) Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals)) Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs)) Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals)) Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs)) Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals)) Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs)) Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals)) Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs)) Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals)) Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs)) Rare_MacSlope_pVals.append(np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values Rare_MacSlope_Coeffs.append(np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients Rich_MacSlope_pVals.append(np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values Rich_MacSlope_Coeffs.append(np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals)) Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs)) Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals)) Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs)) Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals)) Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs)) 
Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals)) Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs)) Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals)) Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs)) Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals)) Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs)) RareR2List.append(np.mean(sRareR2List)) RarepFList.append(np.mean(sRarepFList)) RichR2List.append(np.mean(sRichR2List)) RichpFList.append(np.mean(sRichpFList)) DomR2List.append(np.mean(sDomR2List)) DompFList.append(np.mean(sDompFList)) EvenR2List.append(np.mean(sEvenR2List)) EvenpFList.append(np.mean(sEvenpFList)) # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC.append(np.mean(sRarepLinListHC)) RarepLinListRainB.append(np.mean(sRarepLinListRainB)) RarepLinListLM.append(np.mean(sRarepLinListLM)) #RichpLinListHC.append(np.mean(sRichpLinListHC)) RichpLinListRainB.append(np.mean(sRichpLinListRainB)) RichpLinListLM.append(np.mean(sRichpLinListLM)) #DompLinListHC.append(np.mean(sDompLinListHC)) DompLinListRainB.append(np.mean(sDompLinListRainB)) DompLinListLM.append(np.mean(sDompLinListLM)) #EvenpLinListHC.append(np.mean(sEvenpLinListHC)) EvenpLinListRainB.append(np.mean(sEvenpLinListRainB)) EvenpLinListLM.append(np.mean(sEvenpLinListLM)) # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG.append(np.mean(sRarepCorrListBG)) RarepCorrListF.append(np.mean(sRarepCorrListF)) RichpCorrListBG.append(np.mean(sRichpCorrListBG)) RichpCorrListF.append(np.mean(sRichpCorrListF)) DompCorrListBG.append(np.mean(sDompCorrListBG)) DompCorrListF.append(np.mean(sDompCorrListF)) EvenpCorrListBG.append(np.mean(sEvenpCorrListBG)) EvenpCorrListF.append(np.mean(sEvenpCorrListF)) # 6. 
Homoscedacticity RarepHomoHW.append(np.mean(sRarepHomoHW)) RarepHomoHB.append(np.mean(sRarepHomoHB)) RichpHomoHB.append(np.mean(sRichpHomoHB)) RichpHomoHW.append(np.mean(sRichpHomoHW)) DompHomoHW.append(np.mean(sDompHomoHW)) DompHomoHB.append(np.mean(sDompHomoHB)) EvenpHomoHW.append(np.mean(sEvenpHomoHW)) EvenpHomoHB.append(np.mean(sEvenpHomoHB)) # 7. Normally distributed residuals (errors) RarepNormListOmni.append(np.mean(sRarepNormListOmni)) RarepNormListJB.append(np.mean(sRarepNormListJB)) RarepNormListKS.append(np.mean(sRarepNormListKS)) RarepNormListAD.append(np.mean(sRarepNormListAD)) RichpNormListOmni.append(np.mean(sRichpNormListOmni)) RichpNormListJB.append(np.mean(sRichpNormListJB)) RichpNormListKS.append(np.mean(sRichpNormListKS)) RichpNormListAD.append(np.mean(sRichpNormListAD)) DompNormListOmni.append(np.mean(sDompNormListOmni)) DompNormListJB.append(np.mean(sDompNormListJB)) DompNormListKS.append(np.mean(sDompNormListKS)) DompNormListAD.append(np.mean(sDompNormListAD)) EvenpNormListOmni.append(np.mean(sEvenpNormListOmni)) EvenpNormListJB.append(np.mean(sEvenpNormListJB)) EvenpNormListKS.append(np.mean(sEvenpNormListKS)) EvenpNormListAD.append(np.mean(sEvenpNormListAD)) fig.add_subplot(4, 3, 1) plt.xlim(min(SampSizes)-1,max(SampSizes)+10) plt.ylim(0,1) plt.xscale('log') # Rarity R2 vs. Sample Size plt.plot(NLIST,RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 2) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') plt.ylim(0.0, 0.16) # Rarity Coeffs vs. 
Sample Size plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 3) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.ylim(0.0, 0.6) plt.xscale('log') # Rarity p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RarepLinListRainB, c='m') plt.plot(NLIST,RarepLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RarepCorrListBG, c='c') plt.plot(NLIST,RarepCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST,RarepHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RarepHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST,RarepNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RarepNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RarepNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RarepNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 4) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance R2 vs. Sample Size plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 5) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance Coeffs vs. 
Sample Size plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 6) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') #plt.yscale('log') plt.ylim(0, 0.6) # Dominance p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, DompLinListRainB, c='m') plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, DompCorrListBG, c='c') plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, DompHomoHB, c='r',ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-') #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 7) plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Evenness R2 vs. Sample Size plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 8) plt.ylim(-0.25, 0.0) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Evenness Coeffs vs. 
Sample Size plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 9) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') plt.ylim(0.0, 0.3) # Evenness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, EvenpLinListRainB, c='m') plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, EvenpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-') #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3) #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 10) plt.xscale('log') plt.ylim(0,1) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Dominance R2 vs. Sample Size plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.xlabel('Sample size', fontsize=14) plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16) leg = plt.legend(loc=4,prop={'size':14}) leg.draw_frame(False) fig.add_subplot(4, 3, 11) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes)-1, max(SampSizes)+10) # Richness Coeffs vs. 
Sample Size plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=10,prop={'size':8}) leg.draw_frame(False) fig.add_subplot(4, 3, 12) plt.xlim(min(SampSizes)-1, max(SampSizes)+10) plt.xscale('log') # Richness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RichpLinListRainB, c='m') plt.plot(NLIST,RichpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RichpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST,RichpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RichpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST,RichpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RichpNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RichpNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RichpNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=1,prop={'size':8}) leg.draw_frame(False) #plt.tick_params(axis='both', which='major', labelsize=fs-3) plt.subplots_adjust(wspace=0.4, hspace=0.4) plt.savefig(mydir+'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches = "tight") #plt.close() #plt.show() return
def test_all(self):
    """Regression test: GLSAR (Cochrane-Orcutt) and OLS-with-HAC-errors vs. Gretl.

    Fits investment growth on GDP growth and the lagged real interest rate
    using the statsmodels macrodata set, then checks parameters, standard
    errors and a battery of specification/diagnostic tests against reference
    numbers hard-coded from Gretl runs.  The Gretl console output those
    numbers came from is quoted verbatim in the triple-quoted strings below.
    """
    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates (400 * log-diff -> annualized percent growth)
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    # NOTE(review): endogd/exogd are built but not used in the assertions
    # below — presumably kept for a planned heteroscedasticity test; confirm.
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                         prepend=True)

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]], prepend=True)

    # plain OLS fit, reused at the end for the HAC comparison
    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    # GLSAR with rho fixed at the Gretl Cochrane-Orcutt estimate
    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    # same model, but rho re-estimated by iteration
    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    # coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
        [ 4.37040,  0.208146, 21.00,  2.93e-052,   3.95993,  4.78086], # ***
        [-0.579253, 0.268009, -2.161, 0.0319,     -1.10777, -0.0507346]]) # **

    #Statistics based on the rho-differenced data:
    # each value is a ("Gretl label", reference_number) pair
    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    # NOTE(review): reset_3 is defined but never asserted against below
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity (reference values only; vif2 below is not asserted)
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests -------- GLSAR with fixed rho ------------------------------------
    res = res_g1  #with rho from Gretl

    #basic
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 6)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1],
                        significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests -------- GLSAR with iteratively estimated rho --------------------
    res = res_g2 #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic (looser decimals than above: rho differs slightly from Gretl's)
    assert_almost_equal(res.params, partable[:,0], 4)
    assert_almost_equal(res.bse, partable[:,1], 3)
    assert_almost_equal(res.tvalues, partable[:,2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    # RESET specification tests (squares; squares and cubes)
    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2,4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2,4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    # ---- Gretl reference output quoted for the numbers used above ---------
    '''
    Performing iterative calculation of rho...

                     ITER       RHO        ESS
                       1     -0.10734   22530.9
                       2     -0.10814   22530.9

    Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv
    rho = -0.108136

                     coefficient   std. error   t-ratio    p-value
      -------------------------------------------------------------
      const           -9.50990      0.990456    -9.602    3.65e-018 ***
      ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
      realint_1       -0.579253     0.268009    -2.161    0.0319    **

    Statistics based on the rho-differenced data:

    Mean dependent var   3.113973   S.D. dependent var   18.67447
    Sum squared resid    22530.90   S.E. of regression   10.66735
    R-squared            0.676973   Adjusted R-squared   0.673710
    F(2, 198)            221.0475   P-value(F)           3.56e-51
    rho                 -0.003481   Durbin-Watson        1.993858
    '''

    '''
    RESET test for specification (squares and cubes)
    Test statistic: F = 5.219019,
    with p-value = P(F(2,197) > 5.21902) = 0.00619

    RESET test for specification (squares only)
    Test statistic: F = 7.268492,
    with p-value = P(F(1,198) > 7.26849) = 0.00762

    RESET test for specification (cubes only)
    Test statistic: F = 5.248951,
    with p-value = P(F(1,198) > 5.24895) = 0.023:
    '''

    '''
    Test for ARCH of order 4

                 coefficient   std. error   t-ratio   p-value
      --------------------------------------------------------
      alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
      alpha(1)    0.176114      0.0714698    2.464    0.0146   **
      alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
      alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
      alpha(4)    0.0384531     0.0725763    0.5298   0.5968

      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
    '''

    '''
    Variance Inflation Factors

    Minimum possible value = 1.0
    Values > 10.0 may indicate a collinearity problem

       ds_l_realgdp    1.002
          realint_1    1.002

    VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
    between variable j and the other independent variables

    Properties of matrix X'X:

     1-norm = 6862.0664
     Determinant = 1.0296049e+009
     Reciprocal condition number = 0.013819244
    '''

    '''
    Test for ARCH of order 4 -
      Null hypothesis: no ARCH effect is present
      Test statistic: LM = 7.30776
      with p-value = P(Chi-square(4) > 7.30776) = 0.120491

    Test of common factor restriction -
      Null hypothesis: restriction is acceptable
      Test statistic: F(2, 195) = 0.426391
      with p-value = P(F(2, 195) > 0.426391) = 0.653468

    Test for normality of residual -
      Null hypothesis: error is normally distributed
      Test statistic: Chi-square(2) = 20.2792
      with p-value = 3.94837e-005:
    '''

    #no idea what this is
    '''
    Augmented regression for common factor test
    OLS, using observations 1959:3-2009:3 (T = 201)
    Dependent variable: ds_l_realinv

                       coefficient   std. error   t-ratio    p-value
      ---------------------------------------------------------------
      const             -10.9481      1.35807      -8.062    7.44e-014 ***
      ds_l_realgdp        4.28893     0.229459     18.69     2.40e-045 ***
      realint_1          -0.662644    0.334872     -1.979    0.0492    **
      ds_l_realinv_1     -0.108892    0.0715042    -1.523    0.1294
      ds_l_realgdp_1      0.660443    0.390372      1.692    0.0923    *
      realint_2           0.0769695   0.341527      0.2254   0.8219

      Sum of squared residuals = 22432.8

    Test of common factor restriction

      Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
    '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,       0.328787,    13.30,     2.62e-029,  3.72587,  5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378,    -1.19300, -0.0349939]]) # **

    result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.257395),
        endog_std = ("S.D. dependent var",   18.73915),
        ssr = ("Sum squared resid",    22799.68),
        mse_resid_sqrt = ("S.E. of regression",   10.70380),
        rsquared = ("R-squared",            0.676978),
        rsquared_adj = ("Adjusted R-squared",   0.673731),
        fvalue = ("F(2, 199)",            90.79971),
        f_pvalue = ("P-value(F)",           9.53e-29),
        llf = ("Log-likelihood",      -763.9752),
        aic = ("Akaike criterion",     1533.950),
        bic = ("Schwarz criterion",    1543.875),
        hqic = ("Hannan-Quinn",         1537.966),
        resid_acf1 = ("rho",                 -0.107341),
        dw = ("Durbin-Watson",        2.213805))

    # NOTE(review): linear_logs is defined but never asserted against below
    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]
    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breush_pagan = [1.302014, 0.521520, 2, "chi2"] #TODO: not available
    het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    # load the leverage/influence reference table written by Gretl
    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir,
                         'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                        converters={0:lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    # (re-read with one more footer row skipped if the last row parsed as nan)
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                            converters={0:lambda s: s})
    lev.dtype.names = names

    res = res_ols #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:,0], 5)
    assert_almost_equal(bse_hac, partable[:,1], 5) #TODO

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1],
                        decimal=6) #FAIL
    assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1],
                        decimal=6) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
    #f-value is based on cov_hac I guess
    #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6,5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6,5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    # NOTE(review): vif2 is computed but never compared to the vif reference
    vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence))

    #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def Fig_OLS_Checks(): #fs = 10 # font size used across figures #color = str() #OrC = 'open' SampSizes = [ 5, 6, 7, 8, 9, 10, 13, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100 ] Iterations = 100 fig = plt.figure(figsize=(12, 8)) # MODEL PARAMETERS Rare_MacIntercept_pVals = [] # List to hold coefficient p-values Rare_MacIntercept_Coeffs = [] # List to hold coefficients Rich_MacIntercept_pVals = [] Rich_MacIntercept_Coeffs = [] Dom_MacIntercept_pVals = [] Dom_MacIntercept_Coeffs = [] Even_MacIntercept_pVals = [] Even_MacIntercept_Coeffs = [] Rare_MicIntercept_pVals = [] Rare_MicIntercept_Coeffs = [] Rich_MicIntercept_pVals = [] Rich_MicIntercept_Coeffs = [] Dom_MicIntercept_pVals = [] Dom_MicIntercept_Coeffs = [] Even_MicIntercept_pVals = [] Even_MicIntercept_Coeffs = [] Rare_MacSlope_pVals = [] Rare_MacSlope_Coeffs = [] Rich_MacSlope_pVals = [] Rich_MacSlope_Coeffs = [] Dom_MacSlope_pVals = [] Dom_MacSlope_Coeffs = [] Even_MacSlope_pVals = [] Even_MacSlope_Coeffs = [] Rare_MicSlope_pVals = [] Rare_MicSlope_Coeffs = [] Rich_MicSlope_pVals = [] Rich_MicSlope_Coeffs = [] Dom_MicSlope_pVals = [] Dom_MicSlope_Coeffs = [] Even_MicSlope_pVals = [] Even_MicSlope_Coeffs = [] RareR2List = [] # List to hold model R2 RarepFList = [] # List to hold significance of model R2 RichR2List = [] # List to hold model R2 RichpFList = [] # List to hold significance of model R2 DomR2List = [] # List to hold model R2 DompFList = [] # List to hold significance of model R2 EvenR2List = [] # List to hold model R2 EvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. 
The relationship is linear #RarepLinListHC = [] RarepLinListRainB = [] RarepLinListLM = [] #RichpLinListHC = [] RichpLinListRainB = [] RichpLinListLM = [] #DompLinListHC = [] DompLinListRainB = [] DompLinListLM = [] #EvenpLinListHC = [] EvenpLinListRainB = [] EvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) RarepCorrListBG = [] RarepCorrListF = [] RichpCorrListBG = [] RichpCorrListF = [] DompCorrListBG = [] DompCorrListF = [] EvenpCorrListBG = [] EvenpCorrListF = [] # 6. Homoscedacticity RarepHomoHW = [] RarepHomoHB = [] RichpHomoHW = [] RichpHomoHB = [] DompHomoHW = [] DompHomoHB = [] EvenpHomoHW = [] EvenpHomoHB = [] # 7. Normally distributed residuals (errors) RarepNormListOmni = [] # Omnibus test for normality RarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance RichpNormListOmni = [] # Omnibus test for normality RichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality RichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance RichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance DompNormListOmni = [] # Omnibus test for normality DompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality DompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance DompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance EvenpNormListOmni = [] # Omnibus test for normality EvenpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality 
EvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance EvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance NLIST = [] for SampSize in SampSizes: sRare_MacIntercept_pVals = [] # List to hold coefficient p-values sRare_MacIntercept_Coeffs = [] # List to hold coefficients sRich_MacIntercept_pVals = [] # List to hold coefficient p-values sRich_MacIntercept_Coeffs = [] # List to hold coefficients sDom_MacIntercept_pVals = [] sDom_MacIntercept_Coeffs = [] sEven_MacIntercept_pVals = [] sEven_MacIntercept_Coeffs = [] sRare_MicIntercept_pVals = [] sRare_MicIntercept_Coeffs = [] sRich_MicIntercept_pVals = [] sRich_MicIntercept_Coeffs = [] sDom_MicIntercept_pVals = [] sDom_MicIntercept_Coeffs = [] sEven_MicIntercept_pVals = [] sEven_MicIntercept_Coeffs = [] sRare_MacSlope_pVals = [] sRare_MacSlope_Coeffs = [] sRich_MacSlope_pVals = [] sRich_MacSlope_Coeffs = [] sDom_MacSlope_pVals = [] sDom_MacSlope_Coeffs = [] sEven_MacSlope_pVals = [] sEven_MacSlope_Coeffs = [] sRare_MicSlope_pVals = [] sRare_MicSlope_Coeffs = [] sRich_MicSlope_pVals = [] sRich_MicSlope_Coeffs = [] sDom_MicSlope_pVals = [] sDom_MicSlope_Coeffs = [] sEven_MicSlope_pVals = [] sEven_MicSlope_Coeffs = [] sRareR2List = [] # List to hold model R2 sRarepFList = [] # List to hold significance of model R2 sRichR2List = [] # List to hold model R2 sRichpFList = [] # List to hold significance of model R2 sDomR2List = [] # List to hold model R2 sDompFList = [] # List to hold significance of model R2 sEvenR2List = [] # List to hold model R2 sEvenpFList = [] # List to hold significance of model R2 # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. 
The relationship is linear #sRarepLinListHC = [] sRarepLinListRainB = [] sRarepLinListLM = [] #sRichpLinListHC = [] sRichpLinListRainB = [] sRichpLinListLM = [] #sDompLinListHC = [] sDompLinListRainB = [] sDompLinListLM = [] #sEvenpLinListHC = [] sEvenpLinListRainB = [] sEvenpLinListLM = [] # 4. There are no significant outliers...need to find tests or measures # 5. Independence of observations (no serial correlation in residuals) sRarepCorrListBG = [] sRarepCorrListF = [] sRichpCorrListBG = [] sRichpCorrListF = [] sDompCorrListBG = [] sDompCorrListF = [] sEvenpCorrListBG = [] sEvenpCorrListF = [] # 6. Homoscedacticity sRarepHomoHW = [] sRarepHomoHB = [] sRichpHomoHW = [] sRichpHomoHB = [] sDompHomoHW = [] sDompHomoHB = [] sEvenpHomoHW = [] sEvenpHomoHB = [] # 7. Normally distributed residuals (errors) sRarepNormListOmni = [] # Omnibus test for normality sRarepNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRarepNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRarepNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sRichpNormListOmni = [] # Omnibus test for normality sRichpNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sRichpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sRichpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sDompNormListOmni = [] # Omnibus test for normality sDompNormListJB = [ ] # Calculate residual skewness, kurtosis, and do the JB test for normality sDompNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sDompNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance sEvenpNormListOmni = [] # Omnibus test for normality sEvenpNormListJB = [ ] # Calculate residual skewness, 
kurtosis, and do the JB test for normality sEvenpNormListKS = [ ] # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance sEvenpNormListAD = [ ] # Anderson-Darling test for normal distribution unknown mean and variance for iteration in range(Iterations): Nlist, Slist, Evarlist, ESimplist, ENeelist, EHeiplist, EQlist = [ [], [], [], [], [], [], [] ] klist, Shanlist, BPlist, SimpDomlist, SinglesList, tenlist, onelist = [ [], [], [], [], [], [], [] ] NmaxList, rareSkews, KindList = [[], [], []] NSlist = [] ct = 0 radDATA = [] datasets = [] GoodNames = [ 'EMPclosed', 'HMP', 'BIGN', 'TARA', 'BOVINE', 'HUMAN', 'LAUB', 'SED', 'CHU', 'CHINA', 'CATLIN', 'FUNGI', 'HYDRO', 'BBS', 'CBC', 'MCDB', 'GENTRY', 'FIA' ] # all microbe data is MGRAST mlist = ['micro', 'macro'] for m in mlist: for name in os.listdir(mydir + 'data/' + m): if name in GoodNames: pass else: continue path = mydir + 'data/' + m + '/' + name + '/' + name + '-SADMetricData.txt' num_lines = sum(1 for line in open(path)) datasets.append([name, m, num_lines]) numMac = 0 numMic = 0 radDATA = [] for d in datasets: name, kind, numlines = d lines = [] lines = np.random.choice(range(1, numlines + 1), SampSize, replace=True) path = mydir + 'data/' + kind + '/' + name + '/' + name + '-SADMetricData.txt' for line in lines: data = linecache.getline(path, line) radDATA.append(data) #print name, kind, numlines, len(radDATA) for data in radDATA: data = data.split() if len(data) == 0: print 'no data' continue name, kind, N, S, Var, Evar, ESimp, EQ, O, ENee, EPielou, EHeip, BP, SimpDom, Nmax, McN, skew, logskew, chao1, ace, jknife1, jknife2, margalef, menhinick, preston_a, preston_S = data N = float(N) S = float(S) Nlist.append(float(np.log(N))) Slist.append(float(np.log(S))) NSlist.append(float(np.log(N / S))) Evarlist.append(float(np.log(float(Evar)))) ESimplist.append(float(np.log(float(ESimp)))) KindList.append(kind) BPlist.append(float(BP)) NmaxList.append(float(np.log(float(BP) * 
float(N)))) EHeiplist.append(float(EHeip)) # lines for the log-modulo transformation of skewnness skew = float(skew) sign = 1 if skew < 0: sign = -1 lms = np.log(np.abs(skew) + 1) lms = lms * sign #if lms > 3: print name, N, S rareSkews.append(float(lms)) if kind == 'macro': numMac += 1 elif kind == 'micro': numMic += 1 ct += 1 #print 'Sample Size:',SampSize, ' Mic:', numMic,'Mac:', numMac # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Rarity'] = list(rareSkews) d['Kind'] = list(KindList) RarityResults = smf.ols( 'Rarity ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' # Multiple regression for Rarity d = pd.DataFrame({'N': list(Nlist)}) d['Richness'] = list(Slist) d['Kind'] = list(KindList) RichnessResults = smf.ols( 'Richness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RichnessResults.summary(), '\n' # Multiple regression for Dominance d = pd.DataFrame({'N': list(Nlist)}) d['Dominance'] = list(NmaxList) d['Kind'] = list(KindList) DomResults = smf.ols( 'Dominance ~ N * Kind', d).fit() # Fit the dummy variable regression model #print DomResults.summary(), '\n' # Multiple regression for Evenness d = pd.DataFrame({'N': list(Nlist)}) d['Evenness'] = list(ESimplist) d['Kind'] = list(KindList) EvenResults = smf.ols( 'Evenness ~ N * Kind', d).fit() # Fit the dummy variable regression model #print RarityResults.summary(), '\n' RareResids = RarityResults.resid # residuals of the model RichResids = RichnessResults.resid # residuals of the model DomResids = DomResults.resid # residuals of the model EvenResids = EvenResults.resid # residuals of the model # MODEL RESULTS/FIT RareFpval = RarityResults.f_pvalue Rarer2 = RarityResults.rsquared # coefficient of determination #Adj_r2 = RareResults.rsquared_adj # adjusted RichFpval = RichnessResults.f_pvalue Richr2 = RichnessResults.rsquared # coefficient of determination #Adj_r2 = RichnessResults.rsquared_adj # adjusted DomFpval 
= DomResults.f_pvalue Domr2 = DomResults.rsquared # coefficient of determination #Adj_r2 = DomResults.rsquared_adj # adjusted EvenFpval = EvenResults.f_pvalue Evenr2 = EvenResults.rsquared # coefficient of determination #Adj_r2 = EvenResuls.rsquared_adj # adjusted # MODEL PARAMETERS and p-values Rareparams = RarityResults.params Rareparams = Rareparams.tolist() Rarepvals = RarityResults.pvalues Rarepvals = Rarepvals.tolist() Richparams = RichnessResults.params Richparams = Richparams.tolist() Richpvals = RichnessResults.pvalues Richpvals = Richpvals.tolist() Domparams = DomResults.params Domparams = Domparams.tolist() Dompvals = DomResults.pvalues Dompvals = Dompvals.tolist() Evenparams = EvenResults.params Evenparams = Evenparams.tolist() Evenpvals = EvenResults.pvalues Evenpvals = Evenpvals.tolist() sRare_MacIntercept_pVals.append(Rarepvals[0]) sRare_MacIntercept_Coeffs.append(Rareparams[0]) sRich_MacIntercept_pVals.append(Rarepvals[0]) sRich_MacIntercept_Coeffs.append(Rareparams[0]) sDom_MacIntercept_pVals.append(Dompvals[0]) sDom_MacIntercept_Coeffs.append(Domparams[0]) sEven_MacIntercept_pVals.append(Evenpvals[0]) sEven_MacIntercept_Coeffs.append(Evenparams[0]) sRare_MicIntercept_pVals.append(Rarepvals[1]) if Rarepvals[1] > 0.05: sRare_MicIntercept_Coeffs.append(Rareparams[1]) else: sRare_MicIntercept_Coeffs.append(Rareparams[1]) sRich_MicIntercept_pVals.append(Richpvals[1]) if Richpvals[1] > 0.05: sRich_MicIntercept_Coeffs.append(Richparams[1]) else: sRich_MicIntercept_Coeffs.append(Richparams[1]) sDom_MicIntercept_pVals.append(Dompvals[1]) if Dompvals[1] > 0.05: sDom_MicIntercept_Coeffs.append(Domparams[1]) else: sDom_MicIntercept_Coeffs.append(Domparams[1]) sEven_MicIntercept_pVals.append(Evenpvals[1]) if Evenpvals[1] > 0.05: sEven_MicIntercept_Coeffs.append(Evenparams[1]) else: sEven_MicIntercept_Coeffs.append(Evenparams[1]) sRare_MacSlope_pVals.append(Rarepvals[2]) sRare_MacSlope_Coeffs.append(Rareparams[2]) sRich_MacSlope_pVals.append(Richpvals[2]) 
sRich_MacSlope_Coeffs.append(Richparams[2]) sDom_MacSlope_pVals.append(Dompvals[2]) sDom_MacSlope_Coeffs.append(Domparams[2]) sEven_MacSlope_pVals.append(Evenpvals[2]) sEven_MacSlope_Coeffs.append(Evenparams[2]) sRare_MicSlope_pVals.append(Rarepvals[3]) if Rarepvals[3] > 0.05: sRare_MicSlope_Coeffs.append(Rareparams[3]) else: sRare_MicSlope_Coeffs.append(Rareparams[3]) sRich_MicSlope_pVals.append(Richpvals[3]) if Richpvals[3] > 0.05: sRich_MicSlope_Coeffs.append(Richparams[3]) else: sRich_MicSlope_Coeffs.append(Richparams[3]) sDom_MicSlope_pVals.append(Dompvals[3]) if Dompvals[3] > 0.05: sDom_MicSlope_Coeffs.append(Domparams[3]) else: sDom_MicSlope_Coeffs.append(Domparams[3]) sEven_MicSlope_pVals.append(Evenpvals[3]) if Evenpvals[3] > 0.05: sEven_MicSlope_Coeffs.append(Evenparams[3]) else: sEven_MicSlope_Coeffs.append(Evenparams[3]) sRareR2List.append(Rarer2) sRarepFList.append(RareFpval) sRichR2List.append(Richr2) sRichpFList.append(RichFpval) sDomR2List.append(Domr2) sDompFList.append(DomFpval) sEvenR2List.append(Evenr2) sEvenpFList.append(EvenFpval) # TESTS OF LINEAR REGRESSION ASSUMPTIONS # Error in predictor variables is negligible...Presumably Yes # Variables are measured at the continuous level...Definitely Yes # TESTS FOR LINEARITY, i.e., WHETHER THE DATA ARE CORRECTLY MODELED AS LINEAR #HC = smd.linear_harvey_collier(RarityResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sRarepLinListHC.append(HC) #HC = smd.linear_harvey_collier(DomResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sDompLinListHC.append(HC) #HC = smd.linear_harvey_collier(EvenResults) # Harvey Collier test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. #sEvenpLinListHC.append(HC) RB = smd.linear_rainbow( RarityResults ) # Rainbow test for linearity. 
The Null hypothesis is that the regression is correctly modeled as linear. sRarepLinListRainB.append(RB[1]) RB = smd.linear_rainbow( RichnessResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sRichpLinListRainB.append(RB[1]) RB = smd.linear_rainbow( DomResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sDompLinListRainB.append(RB[1]) RB = smd.linear_rainbow( EvenResults ) # Rainbow test for linearity. The Null hypothesis is that the regression is correctly modeled as linear. sEvenpLinListRainB.append(RB[1]) LM = smd.linear_lm(RarityResults.resid, RarityResults.model.exog ) # Lagrangian multiplier test for linearity sRarepLinListLM.append(LM[1]) LM = smd.linear_lm(RichnessResults.resid, RichnessResults.model.exog ) # Lagrangian multiplier test for linearity sRichpLinListLM.append(LM[1]) LM = smd.linear_lm(DomResults.resid, DomResults.model.exog ) # Lagrangian multiplier test for linearity sDompLinListLM.append(LM[1]) LM = smd.linear_lm(EvenResults.resid, EvenResults.model.exog ) # Lagrangian multiplier test for linearity sEvenpLinListLM.append(LM[1]) # INDEPENDENCE OF OBSERVATIONS (no serial correlation in residuals) BGtest = smd.acorr_breush_godfrey( RarityResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RareResids, lags=None, boxpierce=True) sRarepCorrListBG.append(BGtest[1]) sRarepCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( RichnessResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(RichResids, lags=None, 
boxpierce=True) sRichpCorrListBG.append(BGtest[1]) sRichpCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( DomResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(DomResids, lags=None, boxpierce=True) sDompCorrListBG.append(BGtest[1]) sDompCorrListF.append(BGtest[3]) BGtest = smd.acorr_breush_godfrey( EvenResults, nlags=None, store=False ) # Breusch Godfrey Lagrange Multiplier tests for residual autocorrelation # Lagrange multiplier test statistic, p-value for Lagrange multiplier test, fstatistic for F test, pvalue for F test #BGtest = smd.acorr_ljungbox(EvenResids, lags=None, boxpierce=True) sEvenpCorrListBG.append(BGtest[1]) sEvenpCorrListF.append(BGtest[3]) # There are no significant outliers...Need tests or measures/metrics # HOMOSCEDASTICITY # These tests return: # 1. lagrange multiplier statistic, # 2. p-value of lagrange multiplier test, # 3. f-statistic of the hypothesis that the error variance does not depend on x, # 4. p-value for the f-statistic HW = sms.het_white(RareResids, RarityResults.model.exog) sRarepHomoHW.append(HW[3]) HW = sms.het_white(RichResids, RichnessResults.model.exog) sRichpHomoHW.append(HW[3]) HW = sms.het_white(DomResids, DomResults.model.exog) sDompHomoHW.append(HW[3]) HW = sms.het_white(EvenResids, EvenResults.model.exog) sEvenpHomoHW.append(HW[3]) HB = sms.het_breushpagan(RareResids, RarityResults.model.exog) sRarepHomoHB.append(HB[3]) HB = sms.het_breushpagan(RichResids, RichnessResults.model.exog) sRichpHomoHB.append(HB[3]) HB = sms.het_breushpagan(DomResids, DomResults.model.exog) sDompHomoHB.append(HB[3]) HB = sms.het_breushpagan(EvenResids, EvenResults.model.exog) sEvenpHomoHB.append(HB[3]) # 7. 
NORMALITY OF ERROR TERMS O = sms.omni_normtest(RareResids) sRarepNormListOmni.append(O[1]) O = sms.omni_normtest(RichResids) sRichpNormListOmni.append(O[1]) O = sms.omni_normtest(DomResids) sDompNormListOmni.append(O[1]) O = sms.omni_normtest(EvenResids) sEvenpNormListOmni.append(O[1]) JB = sms.jarque_bera(RareResids) sRarepNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(RichResids) sRichpNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(DomResids) sDompNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality JB = sms.jarque_bera(EvenResids) sEvenpNormListJB.append( JB[1] ) # Calculate residual skewness, kurtosis, and do the JB test for normality KS = smd.kstest_normal(RareResids) sRarepNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(RichResids) sRichpNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(DomResids) sDompNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance KS = smd.kstest_normal(EvenResids) sEvenpNormListKS.append( KS[1] ) # Lillifors test for normality, Kolmogorov Smirnov test with estimated mean and variance AD = smd.normal_ad(RareResids) sRarepNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(RichResids) sRichpNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(DomResids) sDompNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution unknown mean and variance AD = smd.normal_ad(EvenResids) sEvenpNormListAD.append( AD[1] ) # Anderson-Darling test for normal distribution 
unknown mean and variance print 'Sample size:', SampSize, 'iteration:', iteration NLIST.append(SampSize) Rare_MacIntercept_pVals.append(np.mean( sRare_MacIntercept_pVals)) # List to hold coefficient p-values Rare_MacIntercept_Coeffs.append( np.mean(sRare_MacIntercept_Coeffs)) # List to hold coefficients Rich_MacIntercept_pVals.append(np.mean( sRich_MacIntercept_pVals)) # List to hold coefficient p-values Rich_MacIntercept_Coeffs.append( np.mean(sRich_MacIntercept_Coeffs)) # List to hold coefficients Dom_MacIntercept_pVals.append(np.mean(sDom_MacIntercept_pVals)) Dom_MacIntercept_Coeffs.append(np.mean(sDom_MacIntercept_Coeffs)) Even_MacIntercept_pVals.append(np.mean(sEven_MacIntercept_pVals)) Even_MacIntercept_Coeffs.append(np.mean(sEven_MacIntercept_Coeffs)) Rare_MicIntercept_pVals.append(np.mean(sRare_MicIntercept_pVals)) Rare_MicIntercept_Coeffs.append(np.mean(sRare_MicIntercept_Coeffs)) Rich_MicIntercept_pVals.append(np.mean(sRich_MicIntercept_pVals)) Rich_MicIntercept_Coeffs.append(np.mean(sRich_MicIntercept_Coeffs)) Dom_MicIntercept_pVals.append(np.mean(sDom_MicIntercept_pVals)) Dom_MicIntercept_Coeffs.append(np.mean(sDom_MicIntercept_Coeffs)) Even_MicIntercept_pVals.append(np.mean(sEven_MicIntercept_pVals)) Even_MicIntercept_Coeffs.append(np.mean(sEven_MicIntercept_Coeffs)) Rare_MacSlope_pVals.append( np.mean(sRare_MacSlope_pVals)) # List to hold coefficient p-values Rare_MacSlope_Coeffs.append( np.mean(sRare_MacSlope_Coeffs)) # List to hold coefficients Rich_MacSlope_pVals.append( np.mean(sRich_MacSlope_pVals)) # List to hold coefficient p-values Rich_MacSlope_Coeffs.append( np.mean(sRich_MacSlope_Coeffs)) # List to hold coefficients Dom_MacSlope_pVals.append(np.mean(sDom_MacSlope_pVals)) Dom_MacSlope_Coeffs.append(np.mean(sDom_MacSlope_Coeffs)) Even_MacSlope_pVals.append(np.mean(sEven_MacSlope_pVals)) Even_MacSlope_Coeffs.append(np.mean(sEven_MacSlope_Coeffs)) Rare_MicSlope_pVals.append(np.mean(sRare_MicSlope_pVals)) 
Rare_MicSlope_Coeffs.append(np.mean(sRare_MicSlope_Coeffs)) Rich_MicSlope_pVals.append(np.mean(sRich_MicSlope_pVals)) Rich_MicSlope_Coeffs.append(np.mean(sRich_MicSlope_Coeffs)) Dom_MicSlope_pVals.append(np.mean(sDom_MicSlope_pVals)) Dom_MicSlope_Coeffs.append(np.mean(sDom_MicSlope_Coeffs)) Even_MicSlope_pVals.append(np.mean(sEven_MicSlope_pVals)) Even_MicSlope_Coeffs.append(np.mean(sEven_MicSlope_Coeffs)) RareR2List.append(np.mean(sRareR2List)) RarepFList.append(np.mean(sRarepFList)) RichR2List.append(np.mean(sRichR2List)) RichpFList.append(np.mean(sRichpFList)) DomR2List.append(np.mean(sDomR2List)) DompFList.append(np.mean(sDompFList)) EvenR2List.append(np.mean(sEvenR2List)) EvenpFList.append(np.mean(sEvenpFList)) # ASSUMPTIONS OF LINEAR REGRESSION # 1. Error in predictor variables is negligible...presumably yes # 2. Variables are measured at the continuous level...yes # 3. The relationship is linear #RarepLinListHC.append(np.mean(sRarepLinListHC)) RarepLinListRainB.append(np.mean(sRarepLinListRainB)) RarepLinListLM.append(np.mean(sRarepLinListLM)) #RichpLinListHC.append(np.mean(sRichpLinListHC)) RichpLinListRainB.append(np.mean(sRichpLinListRainB)) RichpLinListLM.append(np.mean(sRichpLinListLM)) #DompLinListHC.append(np.mean(sDompLinListHC)) DompLinListRainB.append(np.mean(sDompLinListRainB)) DompLinListLM.append(np.mean(sDompLinListLM)) #EvenpLinListHC.append(np.mean(sEvenpLinListHC)) EvenpLinListRainB.append(np.mean(sEvenpLinListRainB)) EvenpLinListLM.append(np.mean(sEvenpLinListLM)) # 4. There are no significant outliers...need to find tests or measures # 5. 
Independence of observations (no serial correlation in residuals) RarepCorrListBG.append(np.mean(sRarepCorrListBG)) RarepCorrListF.append(np.mean(sRarepCorrListF)) RichpCorrListBG.append(np.mean(sRichpCorrListBG)) RichpCorrListF.append(np.mean(sRichpCorrListF)) DompCorrListBG.append(np.mean(sDompCorrListBG)) DompCorrListF.append(np.mean(sDompCorrListF)) EvenpCorrListBG.append(np.mean(sEvenpCorrListBG)) EvenpCorrListF.append(np.mean(sEvenpCorrListF)) # 6. Homoscedacticity RarepHomoHW.append(np.mean(sRarepHomoHW)) RarepHomoHB.append(np.mean(sRarepHomoHB)) RichpHomoHB.append(np.mean(sRichpHomoHB)) RichpHomoHW.append(np.mean(sRichpHomoHW)) DompHomoHW.append(np.mean(sDompHomoHW)) DompHomoHB.append(np.mean(sDompHomoHB)) EvenpHomoHW.append(np.mean(sEvenpHomoHW)) EvenpHomoHB.append(np.mean(sEvenpHomoHB)) # 7. Normally distributed residuals (errors) RarepNormListOmni.append(np.mean(sRarepNormListOmni)) RarepNormListJB.append(np.mean(sRarepNormListJB)) RarepNormListKS.append(np.mean(sRarepNormListKS)) RarepNormListAD.append(np.mean(sRarepNormListAD)) RichpNormListOmni.append(np.mean(sRichpNormListOmni)) RichpNormListJB.append(np.mean(sRichpNormListJB)) RichpNormListKS.append(np.mean(sRichpNormListKS)) RichpNormListAD.append(np.mean(sRichpNormListAD)) DompNormListOmni.append(np.mean(sDompNormListOmni)) DompNormListJB.append(np.mean(sDompNormListJB)) DompNormListKS.append(np.mean(sDompNormListKS)) DompNormListAD.append(np.mean(sDompNormListAD)) EvenpNormListOmni.append(np.mean(sEvenpNormListOmni)) EvenpNormListJB.append(np.mean(sEvenpNormListJB)) EvenpNormListKS.append(np.mean(sEvenpNormListKS)) EvenpNormListAD.append(np.mean(sEvenpNormListAD)) fig.add_subplot(4, 3, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.ylim(0, 1) plt.xscale('log') # Rarity R2 vs. 
Sample Size plt.plot(NLIST, RareR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.6, 'Rarity', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 2) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') plt.ylim(0.0, 0.16) # Rarity Coeffs vs. Sample Size plt.plot(NLIST, Rare_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rare_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RareIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 3) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.ylim(0.0, 0.6) plt.xscale('log') # Rarity p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RarepLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RarepLinListRainB, c='m') plt.plot(NLIST, RarepLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RarepCorrListBG, c='c') plt.plot(NLIST, RarepCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, RarepHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RarepHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, RarepNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RarepNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RarepNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RarepNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 4) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance R2 vs. 
Sample Size plt.plot(NLIST, DomR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.text(1.01, 0.82, 'Dominance', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 5) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance Coeffs vs. Sample Size plt.plot(NLIST, Dom_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Dom_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, DomIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 6) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') #plt.yscale('log') plt.ylim(0, 0.6) # Dominance p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(DompLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, DompLinListRainB, c='m') plt.plot(NLIST, DompLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, DompCorrListBG, c='c') plt.plot(NLIST, DompCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, DompHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, DompHomoHB, c='r',ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, DompNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, DompNormListJB, c='Lime', ls='-') #plt.plot(NLIST, DompNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST, DompNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 7) plt.text(1.01, 0.7, 'Evenness', rotation='vertical', fontsize=16) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Evenness R2 vs. 
Sample Size plt.plot(NLIST, EvenR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 8) plt.ylim(-0.25, 0.0) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Evenness Coeffs vs. Sample Size plt.plot(NLIST, Even_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Even_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, EvenIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 9) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') plt.ylim(0.0, 0.3) # Evenness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(EvenpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST, EvenpLinListRainB, c='m') plt.plot(NLIST, EvenpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST, EvenpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, EvenpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST, EvenpHomoHB, c='r', ls='-') # 7. Normally distributed residuals (errors) plt.plot(NLIST, EvenpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST, EvenpNormListJB, c='Lime', alpha=0.9, ls='-') #plt.plot(NLIST, EvenpNormListKS, c='Lime', alpha=0.9, ls='--', lw=3) #plt.plot(NLIST, EvenpNormListAD, c='Lime', alpha=0.9, ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 10) plt.xscale('log') plt.ylim(0, 1) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Dominance R2 vs. 
Sample Size plt.plot(NLIST, RichR2List, c='0.2', ls='--', lw=2, label=r'$R^2$') plt.ylabel(r'$R^2$', fontsize=14) plt.xlabel('Sample size', fontsize=14) plt.text(1.01, 0.82, 'Richness', rotation='vertical', fontsize=16) leg = plt.legend(loc=4, prop={'size': 14}) leg.draw_frame(False) fig.add_subplot(4, 3, 11) plt.ylim(-0.2, 1.2) plt.xscale('log') plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) # Richness Coeffs vs. Sample Size plt.plot(NLIST, Rich_MicSlope_Coeffs, c='r', lw=2, label='Microbe') plt.plot(NLIST, Rich_MacSlope_Coeffs, c='b', lw=2, label='Macrobe') #plt.plot(NLIST, RichIntCoeffList, c='g', label='Interaction') plt.ylabel('Coefficient') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=10, prop={'size': 8}) leg.draw_frame(False) fig.add_subplot(4, 3, 12) plt.xlim(min(SampSizes) - 1, max(SampSizes) + 10) plt.xscale('log') # Richness p-vals vs. Sample Size # 3. The relationship is linear #plt.plot(RichpLinListHC, NLIST, c='m', alpha=0.8) #plt.plot(NLIST,RichpLinListRainB, c='m') plt.plot(NLIST, RichpLinListLM, c='m', ls='-', label='linearity') # 5. Independence of observations (no serial correlation in residuals) #plt.plot(NLIST,RichpCorrListBG, c='c') plt.plot(NLIST, EvenpCorrListF, c='c', ls='-', label='autocorrelation') # 6. Homoscedacticity plt.plot(NLIST, RichpHomoHW, c='orange', ls='-', label='homoscedasticity') #plt.plot(NLIST,RichpHomoHB, c='r', ls='-') # 7. 
Normally distributed residuals (errors) plt.plot(NLIST, RichpNormListOmni, c='Lime', ls='-', label='normality') #plt.plot(NLIST,RichpNormListJB, c='Lime', ls='-') #plt.plot(NLIST,RichpNormListKS, c='Lime', ls='--', lw=3) #plt.plot(NLIST,RichpNormListAD, c='Lime', ls='--') plt.plot([1, 100], [0.05, 0.05], c='0.2', ls='--') plt.ylabel('p-value') plt.xlabel('Sample size', fontsize=14) leg = plt.legend(loc=1, prop={'size': 8}) leg.draw_frame(False) #plt.tick_params(axis='both', which='major', labelsize=fs-3) plt.subplots_adjust(wspace=0.4, hspace=0.4) plt.savefig(mydir + 'figs/appendix/SampleSize/SampleSizeEffects.png', dpi=600, bbox_inches="tight") #plt.close() #plt.show() return