def evaluate(self, use="rmse", ad=True, check_VIF=False, exclude=True):
    use = use.lower()
    self.eval = use
    if use == "r2":
        metric = abs(self.r2) - 1.0
    elif use == "r2a":
        metric = abs(self.results.rsquared_adj) - 1.0
    elif use == "rmse":
        metric = self.rmse
    elif use == "press":
        r = smo.OLSInfluence(self.results)
        metric = r.ess_press
    elif use == "aic":
        metric = self.aic
    elif use == "caic":
        k = self.data.shape[1] - 1
        n = self.results.nobs
        metric = self.aic + ((2*(k*k) + 2*k)/(n - k - 1))
    elif use == "bic":
        metric = self.bic
    else:
        metric = self.mse
    if ad:
        if self.anderson_p < 0.05:
            if exclude:
                metric = float("inf")
            else:
                metric = 10000
    if check_VIF:
        if not self.evaluate_VIF():
            if exclude:
                metric = float("inf")
            else:
                metric = 10000      # Keeps the model on the candidate list while ranking better models ahead of it.
    return metric
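
# A minimal, self-contained sketch (separate from the class above, with its own
# imports) of where the "press" value and the "caic" correction term used in
# evaluate() come from: OLSInfluence.ess_press is the leave-one-out PRESS
# statistic, and the extra term added to the AIC is the small-sample correction
# with k predictors and n observations.
import numpy as np
import statsmodels.api as sm
import statsmodels.stats.outliers_influence as smo

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(50, 2)))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=50)
res = sm.OLS(y, X).fit()

press = smo.OLSInfluence(res).ess_press              # PRESS statistic
k = X.shape[1] - 1                                    # number of predictors
n = res.nobs
caic_like = res.aic + (2 * k * k + 2 * k) / (n - k - 1)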
    def test_influence(self):
        res = self.res

        #this test is slow
        infl = oi.OLSInfluence(res)

        try:
            import json
        except ImportError:
            raise SkipTest

        fpath = os.path.join(cur_dir, "results/influence_lsdiag_R.json")
        with open(fpath) as fp:
            lsdiag = json.load(fp)

        #basic
        assert_almost_equal(lsdiag['cov.scaled'],
                            res.cov_params().ravel(),
                            decimal=14)
        assert_almost_equal(lsdiag['cov.unscaled'],
                            res.normalized_cov_params.ravel(),
                            decimal=14)

        c0, c1 = infl.cooks_distance  #TODO: what's c1
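        # (c1 is the companion array of p-values that cooks_distance returns
        #  alongside the distances.)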

        assert_almost_equal(c0, lsdiag['cooks'], decimal=14)
        assert_almost_equal(infl.hat_matrix_diag, lsdiag['hat'], decimal=14)
        assert_almost_equal(infl.resid_studentized_internal,
                            lsdiag['std.res'],
                            decimal=14)

        #slow:
        #infl._get_all_obs()  #slow, nobs estimation loop, called implicitly
        dffits, dffth = infl.dffits
        assert_almost_equal(dffits, lsdiag['dfits'], decimal=14)
        assert_almost_equal(infl.resid_studentized_external,
                            lsdiag['stud.res'],
                            decimal=14)

        import pandas
        fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
        infl_r = pandas.read_csv(fn, index_col=0)
        conv = lambda s: 1 if s == 'TRUE' else 0
        fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv")
        #not used yet:
        #infl_bool_r  = pandas.read_csv(fn, index_col=0,
        #                                converters=dict(zip(range(7),[conv]*7)))
        infl_r2 = np.asarray(infl_r)
        assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
        assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
        #duplicates
        assert_almost_equal(dffits, infl_r2[:, 3], decimal=14)
        assert_almost_equal(c0, infl_r2[:, 5], decimal=14)
        assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=14)

        #Note: for dffits, R uses a threshold around 0.36, mine: dffits[1]=0.24373
        #TODO: finish and check thresholds and pvalues
def armonic(t, m, f, merr):
    ws = pd.DataFrame({
        'x': m,
        'y1': np.sin(2 * np.pi * t * f),
        'y2': np.cos(2 * np.pi * t * f),
        'y3': np.sin(4 * np.pi * t * f),
        'y4': np.cos(4 * np.pi * t * f),
        'y5': np.sin(6 * np.pi * t * f),
        'y6': np.cos(6 * np.pi * t * f),
        'y7': np.sin(8 * np.pi * t * f),
        'y8': np.cos(8 * np.pi * t * f)
    })
    weights = pd.Series(merr)
    wls_fit = sm.wls('x ~ y1+y2+y3+y4+y5+y6+y7+y8-1',
                     data=ws,
                     weights=1 / weights).fit()
    pred = wls_fit.predict()
    r = m - pred
    A = np.zeros(4)
    PH = np.zeros(4)
    A[0] = np.sqrt(wls_fit.params[0]**2 + wls_fit.params[1]**2)
    A[1] = np.sqrt(wls_fit.params[2]**2 + wls_fit.params[3]**2)
    A[2] = np.sqrt(wls_fit.params[4]**2 + wls_fit.params[5]**2)
    A[3] = np.sqrt(wls_fit.params[6]**2 + wls_fit.params[7]**2)
    PH[0] = np.arctan2(wls_fit.params[1], wls_fit.params[0]) - (
        1 * np.arctan2(wls_fit.params[1], wls_fit.params[0]))
    PH[1] = np.arctan2(wls_fit.params[3], wls_fit.params[2]) - (
        2 * np.arctan2(wls_fit.params[1], wls_fit.params[0]))
    PH[2] = np.arctan2(wls_fit.params[5], wls_fit.params[4]) - (
        3 * np.arctan2(wls_fit.params[1], wls_fit.params[0]))
    PH[3] = np.arctan2(wls_fit.params[7], wls_fit.params[6]) - (
        4 * np.arctan2(wls_fit.params[1], wls_fit.params[0]))
    influence = inf.OLSInfluence(wls_fit)
    dffits = influence.dffits
    cook = influence.cooks_distance
    leverage = influence.hat_matrix_diag
    inf1 = np.where(dffits[0] > dffits[1])
    inf2 = np.where(cook[1] < 0.05)
    inffin = np.concatenate((inf1, inf2), axis=1)
    return pred, r, A, PH, inffin
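
# A small, self-contained sketch (using a plain OLS fit rather than the WLS fit
# above) of the influence flags used at the end of armonic(): OLSInfluence.dffits
# returns (values, suggested threshold) and cooks_distance returns
# (distances, p-values).
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

rng = np.random.default_rng(1)
x = rng.normal(size=80)
y = 2.0 * x + rng.normal(size=80)
res = sm.OLS(y, sm.add_constant(x)).fit()

infl = OLSInfluence(res)
dffits_vals, dffits_threshold = infl.dffits
cooks_d, cooks_pval = infl.cooks_distance
flagged = np.where((np.abs(dffits_vals) > dffits_threshold) | (cooks_pval < 0.05))[0]
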
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]])

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]])

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params


        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
                        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
                        [ 4.37040,  0.208146, 21.00,  2.93e-052,  3.95993, 4.78086], # ***
                        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))


        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 6)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
        assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2 #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 3)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
        assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO



        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2,4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2,4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)



        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''

        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''

        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''

        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''


        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,      0.328787,    13.30,     2.62e-029, 3.72587, 5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378, -1.19300, -0.0349939]]) # **

        result_gretl_g1 = dict(
                    endog_mean = ("Mean dependent var",   3.257395),
                    endog_std = ("S.D. dependent var",   18.73915),
                    ssr = ("Sum squared resid",    22799.68),
                    mse_resid_sqrt = ("S.E. of regression",   10.70380),
                    rsquared = ("R-squared",            0.676978),
                    rsquared_adj = ("Adjusted R-squared",   0.673731),
                    fvalue = ("F(2, 199)",            90.79971),
                    f_pvalue = ("P-value(F)",           9.53e-29),
                    llf = ("Log-likelihood",      -763.9752),
                    aic = ("Akaike criterion",     1533.950),
                    bic = ("Schwarz criterion",    1543.875),
                    hqic = ("Hannan-Quinn",         1537.966),
                    resid_acf1 = ("rho",                 -0.107341),
                    dw = ("Durbin-Watson",        2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier  = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breusch_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
        het_breusch_pagan_konker = [0.709924, 0.701200, 2, "chi2"]


        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1,
                            converters={0:lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2,
                                converters={0:lambda s: s})

        lev.dtype.names = names

        res = res_ols #for easier copying

        cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac =  sw.se_cov(cov_hac)

        assert_almost_equal(res.params, partable[:,0], 5)
        assert_almost_equal(bse_hac, partable[:,1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4) #not in gretl
        assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
        assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        #f-value is based on cov_hac I guess
        #res2 = res.get_robustcov_results(cov_type='HC1')
        # TODO: fvalue differs from Gretl, trying any of the HCx
        #assert_almost_equal(res2.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO


        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6,5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6,5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breuschpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breusch_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breusch_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def test_influence_wrapped():
    from pandas import DataFrame

    d = macrodata.load_pandas().data
    #growth rates
    gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
    gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
    lint = d['realint'][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    #order is important
    exog = DataFrame(data, columns=['const', 'lrealgdp', 'lint'])

    res = OLS(gs_l_realinv, exog).fit()

    #basic
    # already tested
    #assert_almost_equal(lsdiag['cov.scaled'],
    #                    res.cov_params().values.ravel(), decimal=14)
    #assert_almost_equal(lsdiag['cov.unscaled'],
    #                    res.normalized_cov_params.values.ravel(), decimal=14)

    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    #this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, "r") as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  #TODO: what's c1, it's pvalues? -ss

    #NOTE: we get a hard-coded 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag['cooks'], 14)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag['hat']), 14)
    assert_almost_equal(infl.resid_studentized_internal, lsdiag['std.res'], 14)

    #slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag['dfits'], 14)
    assert_almost_equal(infl.resid_studentized_external, lsdiag['stud.res'],
                        14)

    import pandas
    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pandas.read_csv(fn, index_col=0)
    conv = lambda s: 1 if s == 'TRUE' else 0
    fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv")
    #not used yet:
    #infl_bool_r  = pandas.read_csv(fn, index_col=0,
    #                                converters=dict(zip(lrange(7),[conv]*7)))
    infl_r2 = np.asarray(infl_r)
    #TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
# ### Outliers and Influential cases

# #### references
#
# https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.OLSInfluence.html#statsmodels.stats.outliers_influence.OLSInfluence
#
# https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html
#
# https://stackoverflow.com/questions/46304514/access-standardized-residuals-cooks-values-hatvalues-leverage-etc-easily-i
#
# https://www.geeksforgeeks.org/reduce-in-python/
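#
# The individual diagnostics referenced in the links above can also be pulled
# straight from OLSInfluence instead of the full summary frame (a sketch using
# the fitted model m02 from the cells below):
#
#     infl = sms.OLSInfluence(m02)
#     infl.hat_matrix_diag              # leverage (hat values)
#     infl.resid_studentized_internal   # standardized residuals
#     infl.cooks_distance[0]            # Cook's distances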

# In[71]:

summary_frame = sms.OLSInfluence(m02).summary_frame()
print(summary_frame.head())

# In[72]:

summary_frame = summary_frame[[
    'cooks_d', 'standard_resid', 'student_resid', 'hat_diag'
]]
print(summary_frame.head())

# In[73]:

resid = pd.DataFrame(df['sales'] - m02.fittedvalues)
resid.columns = ['residual']

# In[74]:
        plt.legend(loc='upper left')
        ax2 = fig.add_subplot(3, 1, 3)
        plt.plot(resid_studentized, 'o', label='studentized_resid')
        plt.plot(dffits, 'o', label='DFFITS')
        leg = plt.legend(loc='lower left', fancybox=True)
        leg.get_frame().set_alpha(0.5)  #, fontsize='small')
        ltext = leg.get_texts()  # all the text.Text instance in the legend
        plt.setp(ltext, fontsize='small')  # the legend text fontsize

    print(oi.reset_ramsey(res, degree=3))

    #note, constant in last column
    for i in range(1):
        print(oi.variance_inflation_factor(res.model.exog, i))

    infl = oi.OLSInfluence(res_ols)
    print(infl.resid_studentized_external)
    print(infl.resid_studentized_internal)
    print(infl.summary_table())
    print(oi.summary_table(res, alpha=0.05)[0])
'''
>>> res.resid
array([  4.28571429,   4.        ,   0.57142857,  -3.64285714,
        -4.71428571,   1.92857143,  10.        ,  -6.35714286,
       -11.        ,  -1.42857143,   1.71428571,   4.64285714])
>>> infl.hat_matrix_diag
array([ 0.10084034,  0.11764706,  0.28571429,  0.20168067,  0.10084034,
        0.16806723,  0.11764706,  0.08403361,  0.11764706,  0.28571429,
        0.33613445,  0.08403361])
'''
def chapter_3():
    """
    Notes for Linear Regression
    
    - coefficients -> give average change in Y with a one-unit increase in X
    
    - confidence interval -> B1_hat +- 2 * SE(B1_hat)
        - 95% chance the interval contains true value of B
        - SE(B1_hat)^2 -> var(e) / sum((x_i - x_bar)^2)
    
    - t-statistic
        - t = (B1_hat - 0)/ SE(B1_hat)
        
    -  test for synergy (additive assumption)
        - effect of each predictor on response is independent of other predictors
        - include interaction term -> x1 * x2 
            - if interaction term has small p value, then not additive (synergy exists)
            - if results in substantial increase in r2, then not additive (synergy exists)

    - relationship exists
        - p value < 0.0005 or < 0.0001
        - F statistic substantially greater than 1

    - strength of relationship
        - RSE -> estimates standard deviation of response from regression line
        - R squared -> % variability in response explained by predictors
        - percent error -> 100 * residual_standard_error / ys.mean()  
        
    - accuracy of prediction
        - prediction interval (individual response)
        - confidence interval (average response)
        
    - non-linearity
        - residual plots (fitted values vs. studentized/standardized residuals)
        - if residual plots are not random, transform with log(x), sqrt(x), or x2
        
    - correlation of error terms
        - will underestimate p value and narrow confidence/prediction intervals
        
    - heteroscedasticity (funnel shape of residual plot)
        - non-constant variances in the errors
        - if exists, transform the response with log(y) or sqrt(y)
    
    - co-linearity of features
        - variance inflation factor (VIF) -> 1 / (1 - R_j^2), where R_j^2 regresses feature j on the other features
        - correlation matrix
        - reduces t-statistic and increases standard error
        
    - outliers
        - leverage -> high impact on RSE and/or regression line
        - look at studentized residuals (observations with |value| > 3 are potential outliers)
        - influence (leverage) plots
    """
    #3.8 -> Simple Linear Regression on Auto data set
    dat = pd.read_csv("Auto.csv")
    dat = dat.replace("?", np.nan).dropna()

    # add constant to x values to ensure mean of residuals = 0
    xs = sm.add_constant(dat["horsepower"].astype(float))
    ys = dat["mpg"].astype(float)

    model = sm.OLS(ys, xs).fit()

    intercept, slope = model.params
    r2 = model.rsquared

    # variance inflation factor -> illustrates the formula VIF = 1 / (1 - R^2)
    # min(VIF) = 1.0, if VIF > 5 or 10, features are most likely correlated
    # (with a single predictor there is no co-linearity to measure, so this is illustrative only)
    vif = 1 / (1 - r2)
    f_stat = model.fvalue
    p_value = model.pvalues[1]

    # create new line with the coefficients
    fit = [slope * x + intercept for x in xs["horsepower"]]
    print("Simple OLS: %s" % model.summary())

    prediction = model.predict()
    residuals = ys.astype(float) - prediction
    standardized_residuals = (residuals - residuals.mean()) / residuals.std()
    #residual_standard_error = results.rmse
    #percent_error = 100 * residual_standard_error / ys.mean()
    """
    Plot
    """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)
    ax.scatter(xs["horsepower"],
               ys,
               label="r2=%f; f=%f; p=%f" % (r2, f_stat, p_value))
    ax.plot(xs["horsepower"],
            fit,
            color="r",
            label="f(x) = %f * x + %f" % (slope, intercept))

    # plot fitted values vs residuals to check for non-linearity
    ax2.scatter(model.fittedvalues, residuals, color="r")
    ax2.axhline(0, color="k")
    ax2.set_xlabel("fitted values")
    ax2.set_ylabel("residuals")

    # show leverage to identify observations that may have
    # more effect on the regression than other observations
    sm.graphics.influence_plot(model, ax=ax3)

    # show fitted values vs studentized residuals
    outlier_influence = outliers_influence.OLSInfluence(model).summary_frame()
    ax4.scatter(model.fittedvalues, outlier_influence["student_resid"])
    ax4.axhline(0, color="k")
    ax4.set_xlabel("fitted values")
    ax4.set_ylabel("studentized residuals")

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()

    #3.9 -> Multiple Linear Regression on Auto data set
    xs = dat[[
        "cylinders", "displacement", "horsepower", "weight", "acceleration",
        "year", "origin"
    ]].astype(float)

    # plot correlation matrix to check co-linearity
    # co-linearity reduces the t-statistic (power) of the test
    # and also increases standard error
    print("Correlations: %s" % xs.corr())
    grid = sns.PairGrid(xs)
    grid = grid.map(plt.scatter)
    plt.show()

    # pandas' pd.ols has been removed; fit the multiple regression with
    # statsmodels instead (adding an intercept, as pd.ols did by default)
    model = sm.OLS(ys, sm.add_constant(xs)).fit()
    print("Multiple OLS: %s" % model.summary())

    # compute variance inflation factor (VIF) for each feature to check for co-linearity:
    # VIF_j = 1 / (1 - R_j^2), with R_j^2 from regressing feature j on the other features
    exog = sm.add_constant(xs)
    vif = [outliers_influence.variance_inflation_factor(exog.values, i)
           for i in range(1, exog.shape[1])]
    print("VIFs: %s" % vif)
    """
    Looking at the p-values associated with each predictor’s t-statistic, 
    we see that displacement, weight, year, and origin 
    have a statistically significant relationship, 
    while cylinders, horsepower, and acceleration do not.
    """
    print("Coefficients: %s" % results.beta)
    """
    The regression coefficient for year, 0.7508, 
    suggests that for every one year, mpg increases by the coefficient. 
    In other words, cars become more fuel efficient every year by almost 1 mpg / year.
    """
    residuals = model.resid
    standardized_residuals = (residuals - residuals.mean()) / residuals.std()
    residual_standard_error = np.sqrt(model.mse_resid)
    percent_error = 100 * residual_standard_error / ys.mean()
    """
    Plot
    """
    f = plt.figure()
    ax = f.add_subplot(221)
    ax2 = f.add_subplot(223)
    ax3 = f.add_subplot(222)
    ax4 = f.add_subplot(224)

    ax.scatter(model.fittedvalues, residuals)
    ax.axhline(0, color="k")
    ax.set_xlabel("y fitted values")
    ax.set_ylabel("residuals")

    ax2.scatter(model.fittedvalues,
                standardized_residuals,
                label='percent error=%f' % percent_error)
    ax2.axhline(0, color="k")
    ax2.set_xlabel("y fitted values")
    ax2.set_ylabel("standardized residuals")

    sm.graphics.influence_plot(model, ax=ax3)

    for _ax in [ax, ax2, ax3, ax4]:
        _ax.legend(loc="best")
    plt.show()
def nagadan(
        target, npaths, duration,
        base, conductivity, porosity, thickness,
        wells, observations,
        xmin=np.nan, xmax=np.nan, ymin=np.nan, ymax=np.nan,
        buffer=100, spacing=10, umbra=10,
        confined=True, tol=1, maxstep=10):
    """
    The entry-point for the NagadanPy project.

    Arguments
    ---------
    target : int
        The index identifying the target well in the wells.
        That is, the well for which we will compute a stochastic
        capture zone. This uses python's 0-based indexing.

    npaths : int
        The number of paths (starting points for the backtraces)
        to generate uniformly around the target well. 0 < npaths.

    duration : float
        The duration of the capture zone [d]. For example, a 10-year
        capture zone would have a duration = 10*365.25. 0 < duration.

    base : float
        The base elevation of the aquifer [m].

    conductivity : float
        The hydraulic conductivity of the aquifer [m/d]. 0 < conductivity.

    porosity : float
        The porosity of the aquifer []. 0 < porosity < 1.

    thickness : float
        The thickness of the aquifer [m]. 0 < thickness.

    wells : list
        The list of well tuples. Each well tuple has four components.
            xw : float
                The x-coordinate of the well [m].

            yw : float
                The y-coordinate of the well [m].

            rw : float
                The radius of the well [m]. 0 < rw.

            qw : float
                The discharge of the well [m^3/d].

    observations : list of observation tuples.
        An observation tuple contains four values: (x, y, z_ev, z_std), where
            x : float
                The x-coordinate of the observation [m].

            y : float
                The y-coordinate of the observation [m].

            z_ev : float
                The expected value of the observed static water level elevation [m].

            z_std : float
                The standard deviation of the observed static water level elevation [m].

    buffer : float, optional
        The buffer distance [m] around each well. If an obs falls
        within buffer of any well, it is removed. Default is 100 [m].

    spacing : float, optional
        The spacing of the rows and the columns [m] in the square
        ProbabilityField grids. Default is 10 [m].

    umbra : float, optional
        The vector-to-raster range [m] when mapping a particle path
        onto the ProbabilityField grids. If a grid node is within
        umbra of a particle path, it is marked as visited. Default is 10 [m].

    confined : boolean, optional
        True if it is safe to assume that the aquifer is confined
        throughout the domain of interest, False otherwise. This is a
        speed kludge. Default is True.

    tol : float, optional
        The tolerance [m] for the local error when solving the
        backtrace differential equation. This is an inherent
        parameter for an adaptive Runge-Kutta method. Default is 1.

    maxstep : float, optional
        The maximum allowed step in space [m] when solving the
        backtrace differential equation. This is a maximum space
        step and NOT a maximum time step. Default is 10.

    Returns
    -------
    None.

    Notes
    -----
    o   Most of the time-consuming work is orchestrated by the
        compute_capturezone function.
    """

    # Validate the arguments.
    assert(isinstance(target, int) and 0 <= target < len(wells))
    assert(isinstance(npaths, int) and 0 < npaths)
    assert((isinstance(duration, int) or isinstance(duration, float)) and 0 < duration)

    assert(isinstance(base, int) or isinstance(base, float))
    assert((isinstance(conductivity, int) or isinstance(conductivity, float)) and 0 < conductivity)
    assert(isinstance(porosity, float) and 0 < porosity < 1)
    assert((isinstance(thickness, int) or isinstance(thickness, float)) and 0 < thickness)

    assert(isinstance(wells, list) and len(wells) >= 1)
    for we in wells:
        assert(len(we) == 4 and
               (isinstance(we[0], int) or isinstance(we[0], float)) and
               (isinstance(we[1], int) or isinstance(we[1], float)) and
               (isinstance(we[2], int) or isinstance(we[2], float)) and 0 < we[2] and
               (isinstance(we[3], int) or isinstance(we[3], float)))

    assert(isinstance(observations, list) and len(observations) > 6)
    for ob in observations:
        assert(len(ob) == 4 and
               (isinstance(ob[0], int) or isinstance(ob[0], float)) and
               (isinstance(ob[1], int) or isinstance(ob[1], float)) and
               (isinstance(ob[2], int) or isinstance(ob[2], float)) and
               (isinstance(ob[3], int) or isinstance(ob[3], float)) and 0 <= ob[3])

    assert((isinstance(buffer, int) or isinstance(buffer, float)) and 0 < buffer)
    assert((isinstance(spacing, int) or isinstance(spacing, float)) and 0 < spacing)
    assert((isinstance(umbra, int) or isinstance(umbra, float)) and 0 < umbra)

    assert(isinstance(confined, bool))
    assert((isinstance(tol, int) or isinstance(tol, float)) and 0 < tol)
    assert((isinstance(maxstep, int) or isinstance(maxstep, float)) and 0 < maxstep)

    # Initialize the stopwatch.
    start_time = time.time()

    # Log the run information.
    log_the_run(
        target, npaths, duration,
        base, conductivity, porosity, thickness,
        wells, observations,
        buffer, spacing, umbra,
        confined, tol, maxstep)

    # Filter out all of the observations that are too close to any
    # pumping well, and average the duplicate observations.
    obs = filter_obs(observations, wells, buffer)
    nobs = len(obs)
    assert(nobs > 6)

    # Log summary statistics on the wells and the active observations.
    buf = summary_statistics(wells, ['Easting', 'Northing', 'Radius', 'Discharge'], 
        ['12.2f', '12.2f', '12.3f', '12.2f'], 'Wells')
    log.info('\n')
    log.info(buf.getvalue())

    buf = summary_statistics(obs, ['Easting', 'Northing', 'Head', 'Std'], 
        ['12.2f', '12.2f', '10.2f', '10.2f'], 'Active Observations')
    log.info('\n')
    log.info(buf.getvalue())

    # Set the target.
    xtarget, ytarget, rtarget = wells[target][0:3]

    # Create the model
    mo = Model(base, conductivity, porosity, thickness, wells)

    # General influence statistics
    WA, Wb = mo.construct_fit(obs, xtarget, ytarget)

    ols_model = sm.OLS(Wb, WA, hasconst=True)
    ols_results = ols_model.fit()
    ols_influence = smso.OLSInfluence(ols_results)

    log.info('\n')
    log.info(ols_results.summary(
        xname = ['A', 'B', 'C', 'D', 'E', 'F'], yname = 'scaled potential'))
    log.info('\n')
    log.info(ols_influence.summary_frame())
    
    # Compute the exhaustive leave-one-out and leave-two-out boomerang analyses.
    kldiv_one, kldiv_two, kldiv_three = compute_boomerang(WA, Wb)

    kldiv_one.sort(reverse=True)
    kldiv_two.sort(reverse=True)
    kldiv_three.sort(reverse=True)

    most_influential_singleton = kldiv_one[0][1]
    most_influential_pair = [kldiv_two[0][1], kldiv_two[0][2]]
    most_influential_triple = [kldiv_three[0][1], kldiv_three[0][2], kldiv_three[0][3]]

    log.info('\n')
    log.info('Top 5 of the Leave-one-out analysis:')
    for i in range(min(len(kldiv_one), 5)):
        log.info('    {0}'.format(kldiv_one[i]))

    log.info('\n')
    log.info('Top 5 of the Leave-two-out analysis:')
    for i in range(min(len(kldiv_two), 5)):
        log.info('    {0}'.format(kldiv_two[i]))

    log.info('\n')
    log.info('Top 5 of the Leave-three-out analysis:')
    for i in range(min(len(kldiv_three), 5)):
        log.info('    {0}'.format(kldiv_three[i]))

    # Define the local backtracing velocity function.
    if confined:
        def feval(xy):
            Vx, Vy = mo.compute_velocity_confined(xy[0], xy[1])
            return np.array([-Vx, -Vy])
    else:
        def feval(xy):
            Vx, Vy = mo.compute_velocity(xy[0], xy[1])
            return np.array([-Vx, -Vy])

    # Compute the four capture zones around the target well --- 
    # Using all of the obs.
    mo.fit_regional_flow(obs, xtarget, ytarget)
    pf0 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf0, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential singleton.
    obs1 = np.delete(obs, most_influential_singleton, 0)
    mo.fit_regional_flow(obs1, xtarget, ytarget)
    pf1 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf1, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential pair.
    obs2 = np.delete(obs, most_influential_pair, 0)
    mo.fit_regional_flow(obs2, xtarget, ytarget)
    pf2 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf2, umbra, 1.0, tol, maxstep, feval)

    # Using all of the obs except the most influential triple.
    obs3 = np.delete(obs, most_influential_triple, 0)
    mo.fit_regional_flow(obs3, xtarget, ytarget)
    pf3 = ProbabilityField(spacing, spacing, xtarget, ytarget)
    compute_capturezone(
        xtarget, ytarget, rtarget, npaths, duration,
        pf3, umbra, 1.0, tol, maxstep, feval)

    # Compute the capture zone statistics.
    Xmin = min([pf0.xmin, pf1.xmin, pf2.xmin, pf3.xmin])
    Xmax = max([pf0.xmax, pf1.xmax, pf2.xmax, pf3.xmax])
    Ymin = min([pf0.ymin, pf1.ymin, pf2.ymin, pf3.ymin])
    Ymax = max([pf0.ymax, pf1.ymax, pf2.ymax, pf3.ymax])

    pf0.expand(Xmin, Xmax, Ymin, Ymax)
    pf1.expand(Xmin, Xmax, Ymin, Ymax)
    pf2.expand(Xmin, Xmax, Ymin, Ymax)
    pf3.expand(Xmin, Xmax, Ymin, Ymax)

    area0 = sum(sum(pf0.pgrid > 0)) * spacing**2
    area1 = sum(sum(pf1.pgrid > 0)) * spacing**2
    area2 = sum(sum(pf2.pgrid > 0)) * spacing**2
    area3 = sum(sum(pf3.pgrid > 0)) * spacing**2

    area01 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0))) * spacing**2
    area012 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0) & (pf2.pgrid > 0))) * spacing**2
    area0123 = sum(sum((pf0.pgrid > 0) & (pf1.pgrid > 0) & (pf2.pgrid > 0) & (pf3.pgrid > 0))) * spacing**2

    area02 = sum(sum((pf0.pgrid > 0) & (pf2.pgrid > 0))) * spacing**2
    area03 = sum(sum((pf0.pgrid > 0) & (pf3.pgrid > 0))) * spacing**2

    log.info('\n')
    log.info('CAPTURE ZONE STATISTICS:')
    log.info('    0 = capture zone using all observations.')
    log.info('    1 = capture zone without most influential singleton.')
    log.info('    2 = capture zone without most influential pair.')
    log.info('    3 = capture zone without most influential triple.')
    log.info('')
    log.info('    area(0)              = {0:.2f}'.format(area0))
    log.info('    area(1)              = {0:.2f}'.format(area1))
    log.info('    area(2)              = {0:.2f}'.format(area2))
    log.info('    area(3)              = {0:.2f}'.format(area3))    
    log.info('')
    log.info('    area(0 & 1)          = {0:.2f}'.format(area01))
    log.info('    area(0 & 1 & 2)      = {0:.2f}'.format(area012))    
    log.info('    area(0 & 1 & 2 & 3)  = {0:.2f}'.format(area0123))        
    log.info('')
    log.info('    area(0 & !1)         = {0:.2f} ({1:.2f}%)'.format(area0 - area01, (area0-area01)/area0 * 100))
    log.info('    area(1 & !0)         = {0:.2f} ({1:.2f}%)'.format(area1 - area01, (area1-area01)/area1 * 100))    
    log.info('')
    log.info('    area(0 & !2)         = {0:.2f} ({1:.2f}%)'.format(area0 - area02, (area0-area02)/area0 * 100))
    log.info('    area(2 & !0)         = {0:.2f} ({1:.2f}%)'.format(area2 - area02, (area2-area02)/area2 * 100))    
    log.info('')
    log.info('    area(0 & !3)         = {0:.2f} ({1:.2f}%)'.format(area0 - area03, (area0-area03)/area0 * 100))
    log.info('    area(3 & !0)         = {0:.2f} ({1:.2f}%)'.format(area3 - area03, (area3-area03)/area3 * 100))    
    log.info('')

    elapsedtime = time.time() - start_time
    log.info('Computational elapsed time = %.4f seconds' % elapsedtime)
    log.info('')

    # -----------------------------------------------------
    # GRAPHICAL OUTPUT STARTS HERE
    # -----------------------------------------------------

    # ---------------------------------
    # PLOT: studentized residuals at the observation locations. 
    # ---------------------------------
    plt.figure()
    plt.axis('equal')

    plot_locations(target, wells, obs)

    resid = ols_influence.resid_studentized
    max_resid = max(abs(resid))

    xob = np.array([ob[0] for ob in obs])
    yob = np.array([ob[1] for ob in obs])

    a = 40 + (40 * abs(resid)/max_resid)**2
    plt.scatter(xob[resid>0], yob[resid>0], s=a[resid>0], c='b', alpha=0.5)
    plt.scatter(xob[resid<0], yob[resid<0], s=a[resid<0], c='r', alpha=0.5)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Studentized Residuals', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: studentized residuals
    # ---------------------------------
    plt.figure()
    resid = ols_influence.resid_studentized

    # Bar plot for the studentized residuals.
    plt.subplot(1, 2, 1)
    plt.bar(range(nobs), resid)

    threshold = 2
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
    plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)    

    plt.xlabel('Observation index')
    plt.ylabel('Studentized Residuals')
    plt.title('Studentized Residuals', fontsize=14)
    plt.grid(True)

    # Normal probability plot for the studentized residuals.
    plt.subplot(1, 2, 2)
    scipy.stats.probplot(resid, fit=True, plot=plt)
    plt.ylabel('Studentized Residuals')
    plt.title('Normal Probability Plot for Studentized Residuals', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: locations of observation and wells, overlaying the head contours.
    # ---------------------------------
    plt.figure()
    plt.axis('equal')

    plot_locations(target, wells, obs)

    i = most_influential_singleton
    plt.plot(obs[i][0], obs[i][1], 's', markeredgecolor='k',
             fillstyle='none', markersize=10)

    for i in most_influential_pair:
        plt.plot(obs[i][0], obs[i][1], 'D', markeredgecolor='k',
                 fillstyle='none', markersize=13)

    for i in most_influential_triple:
        plt.plot(obs[i][0], obs[i][1], 'o', markeredgecolor='k',
                 fillstyle='none', markersize=16)

    nrows = 100
    ncols = 100
    xmin, xmax, ymin, ymax = plt.axis()
    contour_head(mo, xmin, xmax, ymin, ymax, nrows, ncols)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Locations', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: sorted KL divergence
    # ---------------------------------
    plt.figure()

    # leave-one-out analysis.
    plt.subplot(1, 3, 1)
    plt.scatter(range(len(kldiv_one)), [p[0] for p in kldiv_one])

    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-One-Out', fontsize=14)
    plt.grid(True)

    # leave-two-out analysis.
    plt.subplot(1, 3, 2)
    plt.scatter(range(len(kldiv_two)), [p[0] for p in kldiv_two])

    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-Two-Out', fontsize=14)
    plt.grid(True)

    # leave-three-out analysis.
    plt.subplot(1, 3, 3)
    plt.scatter(range(len(kldiv_three)), [p[0] for p in kldiv_three])

    plt.xlabel('Sort Order')
    plt.ylabel('KL Divergence [bits]')
    plt.title('Leave-Three-Out', fontsize=14)
    plt.grid(True)

    # ---------------------------------
    # PLOT: capture zones 
    # ---------------------------------
    plt.figure()

    # With all data.
    plt.subplot(2, 2, 1)
    plt.axis('equal')

    X = np.linspace(pf0.xmin, pf0.xmax, pf0.ncols)
    Y = np.linspace(pf0.ymin, pf0.ymax, pf0.nrows)
    Z = pf0.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])

    plot_locations(target, wells, obs)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('With All Data', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Used to plot the shadow on capture zones.
    [XX, YY] = np.meshgrid(X, Y)
    XX = np.reshape(XX[Z > 0.0], -1)
    YY = np.reshape(YY[Z > 0.0], -1)

    # Without the most influential singleton.
    plt.subplot(2, 2, 2)
    plt.axis('equal')

    X = np.linspace(pf1.xmin, pf1.xmax, pf1.ncols)
    Y = np.linspace(pf1.ymin, pf1.ymax, pf1.nrows)
    Z = pf1.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])

    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Singleton', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Without the most influential pair.
    plt.subplot(2, 2, 3)
    plt.axis('equal')

    X = np.linspace(pf2.xmin, pf2.xmax, pf2.ncols)
    Y = np.linspace(pf2.ymin, pf2.ymax, pf2.nrows)
    Z = pf2.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])

    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Pair', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # Without the most influential triple.
    plt.subplot(2, 2, 4)
    plt.axis('equal')

    X = np.linspace(pf3.xmin, pf3.xmax, pf3.ncols)
    Y = np.linspace(pf3.ymin, pf3.ymax, pf3.nrows)
    Z = pf3.pgrid
    plt.contourf(X, Y, Z, [0.0, 0.5, 1.0], cmap='tab10')
    plt.contour(X, Y, Z, [0.0, 0.5, 1.0], colors=['black'])

    plt.scatter(XX, YY, marker='.')
    plot_locations(target, wells, obs)

    plt.xlabel('UTM Easting [m]')
    plt.ylabel('UTM Northing [m]')
    plt.title('Without Most Influential Triple', fontsize=14)
    plt.grid(True)
    plt.axis([Xmin, Xmax, Ymin, Ymax])

    # ---------------------------------
    # PLOT: DFBETAS
    # ---------------------------------
    plt.figure()
    dfbetas = ols_influence.dfbetas

    for i in range(6):
        plt.subplot(3, 2, i+1)
        plt.bar(range(nobs), dfbetas[:, i])

        threshold = 2/np.sqrt(nobs)
        left, right = plt.xlim()
        plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
        plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)        

        plt.xlabel('Observation index')
        plt.ylabel('DFBETAS')
        plt.title('DFBETAS {0}'.format(chr(65+i)), fontsize=10)
        plt.grid(True)

    plt.tight_layout()

    # ---------------------------------
    # PLOT: the influential data diagnostics 
    # ---------------------------------
    plt.figure()

    # Leverage (diagonal of the Hat matrix) bar plot.
    plt.subplot(1, 2, 1)
    leverage = ols_influence.hat_matrix_diag
    plt.bar(range(nobs), leverage)

    threshold = 2*6/nobs
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)

    plt.xlabel('Observation index')
    plt.ylabel('Leverage')
    plt.title('Leverage', fontsize=14)
    plt.grid(True)

    # DFFITS bar plot.
    plt.subplot(1, 2 , 2)
    dffits, *_ = ols_influence.dffits
    plt.bar(range(nobs), dffits)

    threshold = 2*np.sqrt(6/nobs)
    left, right = plt.xlim()
    plt.plot([left, right], [threshold, threshold], 'r', linewidth=3)
    plt.plot([left, right], [-threshold, -threshold], 'r', linewidth=3)    

    plt.xlabel('Observation index')
    plt.ylabel('DFFITS')
    plt.title('DFFITS', fontsize=14)
    plt.grid(True)
    
    #------------------------
    plt.show()
def linear_regression_analysis(linear_regression):
    """ Compute and plot a complete analysis of a linear regression computed with Stats Models.
    Args:
         linear_regression (Stats Models Results): the result obtained  with Stats Models.

    """

    # Data
    resid = linear_regression.resid_pearson.copy()
    resid_index = linear_regression.resid.index
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)

    p = exog.shape[1]  # Number of features
    n = len(resid)  # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##############################################################################
    # Statistical tests                                                          #
    ##############################################################################

    # Homoscedasticity - Breusch-Pagan test
    ##########################################

    names = [
        'Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'
    ]
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    ###################################

    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##############################################################################
    # Distribution checks                                                        #
    ##############################################################################

    # Histogram of the residuals
    ##########################
    data = resid
    data_filter = data[(data > -5) & (data < 5)]
    len_data = len(data)
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    fig, ax = plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot / normal probability plot)
    #############################################################
    data = pd.Series(resid).sort_values()
    len_data = len(data)

    normal = pd.Series(np.random.normal(size=len_data)).sort_values()
    fig, ax = plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4, 4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    # Plot
    plt.show()
def OLSinfluence(X, y):
    ols_results = ols(X, y)
    test_class = smo.OLSInfluence(ols_results)
    test = test_class.summary_frame()
    return test.head()
# #### # making some plots for assumption checking

# In[11]:

prediction = pd.DataFrame(m01.fittedvalues)
prediction.columns = ['predicted']
prediction['standarized_prediction'] = (
    prediction['predicted'] -
    prediction['predicted'].mean()) / prediction['predicted'].std()
prediction.head()

# In[12]:

import statsmodels.stats.outliers_influence as sms
summary_frame = sms.OLSInfluence(m01).summary_frame()
summary_frame = pd.merge(summary_frame,
                         prediction,
                         how='inner',
                         left_index=True,
                         right_index=True)
summary_frame.head()

# In[13]:

_ = sns.scatterplot(y='standard_resid',
                    x='standarized_prediction',
                    data=summary_frame)
_ = plt.axhline(y=0)

# #### # This graph can be used for testing homogeneity of variance. We encountered this kind of plot previously; essentially, if it has a funnel shape then we’re in trouble. The plot we have shows points that are equally spread for the three groups, which implies that variances are similar across groups (which was also the conclusion reached by Levene’s test).
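
# The Levene's test mentioned above can be run directly with scipy; a
# self-contained sketch with synthetic groups (the real analysis would pass the
# per-group outcome values instead):
import numpy as np
from scipy import stats

_rng = np.random.default_rng(2)
_g1, _g2, _g3 = (_rng.normal(0.0, 1.0, 30) for _ in range(3))
_w, _p = stats.levene(_g1, _g2, _g3)   # p > 0.05: no evidence of unequal variances
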
def linear_regression_analysis(linear_regression):
    """ Compute and plot a complete analysis of a linear regression computed with Stats Models.
    Args:
         linear_regression (Stats Models Results): the result obtained  with Stats Models.

    """

    # Data
    resid = linear_regression.resid_pearson.copy()
    resid_index = linear_regression.resid.index
    exog = linear_regression.model.exog
    endog = linear_regression.model.endog
    fitted_values = linear_regression.fittedvalues
    influences = outliers_influence.OLSInfluence(linear_regression)

    p = exog.shape[1] # Number of features
    n = len(resid) # Number of individuals

    # Parameters
    color1 = "#3498db"
    color2 = "#e74c3c"

    ##############################################################################
    # Statistical tests                                                          #
    ##############################################################################

    # Homoscedasticity - Breusch-Pagan test
    ##########################################

    names = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    breusch_pagan = sm.stats.diagnostic.het_breuschpagan(resid, exog)
    print(lzip(names, breusch_pagan))

    # Normality test - Shapiro-Wilk
    ###################################

    print(f"Shapiro pvalue : {st.shapiro(resid)[1]}")

    ##############################################################################
    # Distribution checks                                                        #
    ##############################################################################

    # Histogram of the residuals
    ##########################
    data = resid
    data_filter = data[(data > -5) & (data < 5)]
    len_data = len(data)
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    fig, ax = plt.subplots()
    plt.hist(data_filter, bins=20, color=color1)
    plt.xlabel("Residual values")
    plt.ylabel("Number of residuals")
    plt.title(f"Histogramme des résidus de -5 à 5 ({ratio:.2%})")

    # Normal distribution vs residuals (QQ plot / normal probability plot)
    #############################################################
    data = pd.Series(resid).sort_values()
    len_data = len(data)

    normal = pd.Series(np.random.normal(size=len_data)).sort_values()
    fig, ax = plt.subplots()
    plt.scatter(data, normal, c=color1)
    plt.plot((-4,4), (-4, 4), c=color2)
    plt.xlabel("Residuals")
    plt.ylabel("Normal distribution")
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title("Residuals vs Normal (QQ Plot)")

    #  Fitted vs Residuals
    ######################
    data = resid
    fig, ax = plt.subplots()
    plt.scatter(fitted_values, data, alpha=0.5, c=color1)
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.title("Fitted vs Residuals")

    # Actual vs Predicted plot
    fig, ax = plt.subplots()
    plt.scatter(endog, fitted_values, c=color1, alpha=0.5)
    plt.plot(endog, endog, c=color2)
    plt.xlabel("Actual values")
    plt.ylabel("Fitted values")
    plt.title("Acutal vs Predict")

    ##############################################################################
    # Outlier analysis                                                           #
    ##############################################################################

    # Leverage (hii, diagonal of the hat matrix)
    ################################################

    # Atypical observations (far from the mean of the observations)

    # Compute the proportion
    data = influences.hat_matrix_diag
    seuil = 2*p/n
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil), c="#d35400")
    plt.ylabel("Leverage values (hii)")
    plt.title(f"Leviers avec seuil à 2*p/n ({ratio:.2%})")

    # Studentized residuals
    #####################

    # Observations poorly represented by the model

    # Compute the proportion
    data = influences.resid_studentized_internal
    len_data = len(data)
    data_filter = data[data <= 2]
    data_filter = data_filter[data_filter >= -2]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (2, 2), c="#d35400")
    plt.plot((0, len_data), (-2, -2), c="#d35400")
    plt.ylabel("Studentized Residuals")
    plt.title(f"Résidus studentisés avec seuil à 2 et -2 ({ratio:.2%})")

    # Cook's distances
    ###################

    # Outliers whose removal strongly influences the model

    # Compute the proportion
    data = influences.cooks_distance[0]
    seuil = 4/(n-p)
    len_data = len(data)
    data_filter = data[data <= seuil]
    len_data_filter = len(data_filter)
    ratio = len_data_filter / len_data

    # Plot
    fig, ax = plt.subplots()
    plt.plot(data)
    plt.plot((0, len_data), (seuil, seuil))
    plt.ylabel("Cook Distance")
    plt.title(f"Distances de Cook avec seuil à 4/(n-p) ({ratio:.2%})")

    # Plot
    plt.show()