Example #1
def _partial_regression(endog, exog_i, exog_others):
    """Partial regression.

    regress endog on exog_i conditional on exog_others

    uses OLS

    Parameters
    ----------
    endog : array_like
    exog_i : array_like
    exog_others : array_like

    Returns
    -------
    res1c : OLS results instance

    (res1a, res1b) : tuple of OLS results instances
         results from regression of endog on exog_others and of exog_i on
         exog_others

    """
    #FIXME: This function doesn't appear to be used.
    res1a = OLS(endog, exog_others).fit()
    res1b = OLS(exog_i, exog_others).fit()
    res1c = OLS(res1a.resid, res1b.resid).fit()

    return res1c, (res1a, res1b)
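
A minimal usage sketch (not from the original source): it assumes the `_partial_regression` helper above is in scope together with the statsmodels `OLS` name it relies on.

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
nobs = 200
exog_others = sm.add_constant(np.random.randn(nobs, 2))    # controls plus a constant
exog_i = np.random.randn(nobs) + exog_others[:, 1]         # regressor of interest, correlated with a control
endog = 1.0 + 2.0 * exog_i + exog_others[:, 1] + np.random.randn(nobs)

res1c, (res1a, res1b) = _partial_regression(endog, exog_i, exog_others)
print(res1c.params)   # slope on the residualized regressor, close to 2.0
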
Example #2
def reset_ramsey(res, degree=5):
    '''Ramsey's RESET specification test for linear models

    This is a general specification test for additional non-linear effects
    in a model.


    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 to degree of the fitted values. Then it performs
    an F-test of whether these additional terms are significant.

    If the p-value of the F-test is below a threshold, e.g. 0.1, then this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.


    References
    ----------
    http://en.wikipedia.org/wiki/Ramsey_RESET_test

    '''
    order = degree + 1
    k_vars = res.model.exog.shape[1]
    #vander without constant and x:
    y_fitted_vander = np.vander(res.fittedvalues,
                                order)[:, :-2]  #drop constant and linear term
    exog = np.column_stack((res.model.exog, y_fitted_vander))
    res_aux = OLS(res.model.endog, exog).fit()
    #r_matrix = np.eye(degree, exog.shape[1], k_vars)
    r_matrix = np.eye(degree - 1, exog.shape[1], k_vars)
    #df1 = degree - 1
    #df2 = exog.shape[0] - degree - res.df_model  (without constant)
    return res_aux.f_test(r_matrix)  #, r_matrix, res_aux
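
As a hedged usage sketch (not in the original source), fit a deliberately misspecified linear model to data with a quadratic term and run `reset_ramsey` on the result; the function above and statsmodels are assumed to be in scope.

import numpy as np
import statsmodels.api as sm

np.random.seed(1)
x = np.random.uniform(0, 4, size=300)
y = 1.0 + 2.0 * x + 0.5 * x**2 + np.random.randn(300)   # true relation is quadratic

res_lin = sm.OLS(y, sm.add_constant(x)).fit()            # linear specification only
ftest = reset_ramsey(res_lin, degree=3)                  # function defined above
print(ftest)                                             # p-value should be very small here
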
Example #3
def reset_ramsey(res, degree=5):
    '''Ramsey's RESET specification test for linear models

    This is a general specification test for additional non-linear effects
    in a model.


    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 to degree of the fitted values. Then it performs
    an F-test of whether these additional terms are significant.

    If the p-value of the F-test is below a threshold, e.g. 0.1, then this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.


    References
    ----------
    http://en.wikipedia.org/wiki/Ramsey_RESET_test

    '''
    order = degree + 1
    k_vars = res.model.exog.shape[1]
    #vander without constant and x:
    y_fitted_vander = np.vander(res.fittedvalues, order)[:, :-2] #drop constant and linear term
    exog = np.column_stack((res.model.exog, y_fitted_vander))
    res_aux = OLS(res.model.endog, exog).fit()
    #r_matrix = np.eye(degree, exog.shape[1], k_vars)
    r_matrix = np.eye(degree-1, exog.shape[1], k_vars)
    #df1 = degree - 1
    #df2 = exog.shape[0] - degree - res.df_model  (without constant)
    return res_aux.f_test(r_matrix) #, r_matrix, res_aux
Example #4
def notyet_atst():
    d = macrodata.load().data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint], prepend=True)
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data

    params = np.array([-272.3986041341653, 0.1779455206941112,
                       0.2149432424658157])
    cov_hac_4 = np.array([1321.569466333051, -0.2318836566017612,
                37.01280466875694, -0.2318836566017614, 4.602339488102263e-05,
                -0.0104687835998635, 37.012804668757, -0.0104687835998635,
                21.16037144168061]).reshape(3,3, order='F')
    cov_hac_10 = np.array([2027.356101193361, -0.3507514463299015,
        54.81079621448568, -0.350751446329901, 6.953380432635583e-05,
        -0.01268990195095196, 54.81079621448564, -0.01268990195095195,
        22.92512402151113]).reshape(3,3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47,
                          pvalue=2.154956842194898e-33, distr='f')

    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')


    harvey_collier = dict(stat=2.28042114041313, df=199,
                          pvalue=0.02364236161988260, distr='t')
    #hc = harvtest(fm, order.by=ggdp , data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783, df=199,
                          pvalue=0.4531244858006127, distr='t')
Example #5
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R = np.array([[0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0],
                   [0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0],
                   [0, 0, 0, 0, 0, 1, 0]])
     q = np.array([0, 0, 0, 1, 0])
     cls.Ftest1 = res1.f_test(R, q)
Example #6
def coint(y1, y2, regression="c"):
    """
    This is a simple cointegration test. Uses unit-root test on residuals to
    test for cointegrated relationship

    See Hamilton (1994) 19.2

    Parameters
    ----------
    y1 : array_like, 1d
        first element in cointegrating vector
    y2 : array_like
        remaining elements in cointegrating vector
    regression : str {'c'}
        Included in regression
        * 'c' : Constant

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 % levels.

    Notes
    -----
    The null hypothesis is that there is no cointegration; the alternative
    hypothesis is that there is a cointegrating relationship. If the pvalue is
    small, below a critical size, then we can reject the hypothesis that there
    is no cointegrating relationship.

    P-values are obtained through regression surface approximation from
    MacKinnon 1994.

    References
    ----------
    MacKinnon, J.G. 1994.  "Approximate asymptotic distribution functions for
        unit-root and cointegration tests.  `Journal of Business and Economic
        Statistics` 12, 167-76.

    """
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("regression option %s not understood" % regression)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if regression == 'c':
        y2 = add_constant(y2)
    st1_resid = OLS(y1, y2).fit().resid  #stage one residuals
    lgresid_cons = add_constant(st1_resid[0:-1])
    uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit()
    coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0]
    pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None)
    crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1))
    return coint_t, pvalue, crit_value
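
Current statsmodels exposes an equivalent Engle-Granger style test as `statsmodels.tsa.stattools.coint`; a hedged sketch with two simulated cointegrated series:

import numpy as np
from statsmodels.tsa.stattools import coint

np.random.seed(2)
nobs = 500
x = np.cumsum(np.random.randn(nobs))          # random walk
y = 0.8 * x + np.random.randn(nobs)           # cointegrated with x (stationary error)

coint_t, pvalue, crit_value = coint(y, x)
print(coint_t, pvalue)                        # small p-value -> reject "no cointegration"
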
Example #7
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R = np.array([[0,1,1,0,0,0,0],
           [0,1,0,1,0,0,0],
           [0,1,0,0,0,0,0],
           [0,0,0,0,1,0,0],
           [0,0,0,0,0,1,0]])
     q = np.array([0,0,0,1,0])
     cls.Ftest1 = res1.f_test(R,q)
Example #8
    def setupClass(cls):
        from results.results_regression import Longley
        data = longley.load()
        data.exog = add_constant(data.exog)
        res1 = OLS(data.endog, data.exog).fit()
        res2 = Longley()
        res2.wresid = res1.wresid  # workaround hack
        cls.res1 = res1
        cls.res2 = res2

        res_qr = OLS(data.endog, data.exog).fit(method="qr")
        cls.res_qr = res_qr
Example #9
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.
    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = list(zip(ax.get_xlim(), ax.get_ylim()))  # list() so it can be indexed under Python 3
        end_pts[0] = max(end_pts[0])
        end_pts[1] = min(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        return  # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([.25, .75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
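
A hedged usage sketch (not from the original source) that builds a normal QQ plot by hand and adds a quartile reference line with `qqline`; the function above and its module-level imports (np, stats, OLS) are assumed available.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

np.random.seed(3)
sample = np.sort(np.random.standard_t(5, size=200))
probs = (np.arange(1, 201) - 0.5) / 200            # plotting positions
theoretical = stats.norm.ppf(probs)                # theoretical normal quantiles

fig, ax = plt.subplots()
ax.plot(theoretical, sample, 'bo')
qqline(ax, 'q', x=theoretical, y=sample, dist=stats.norm)   # function defined above
ax.set_xlabel('theoretical quantiles')
ax.set_ylabel('sample quantiles')
plt.show()
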
Example #10
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled
          by the standard deviation of the given sample and have the mean
          added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.
    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = list(zip(ax.get_xlim(), ax.get_ylim()))  # list() so it can be indexed under Python 3
        end_pts[0] = max(end_pts[0])
        end_pts[1] = min(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        return # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x,y,fmt)
    elif line == 's':
        m,b = y.std(), y.mean()
        ref_line = x*m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([.25,.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m*theoretical_quartiles[0]
        ax.plot(x, m*x + b, fmt)
Example #11
 def setupClass(cls):
     #        if skipR:
     #            raise SkipTest, "Rpy not installed"
     #        try:
     #            r.library('car')
     #        except RPyRException:
     #            raise SkipTest, "car library not installed for R"
     R = np.zeros(7)
     R[4:6] = [1, -1]
     #        self.R = R
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     cls.Ttest1 = res1.t_test(R)
Example #12
    def setupClass(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed"
#        try:
#            r.library('car')
#        except RPyRException:
#            raise SkipTest, "car library not installed for R"
        R = np.zeros(7)
        R[4:6] = [1,-1]
#        self.R = R
        data = longley.load()
        data.exog = add_constant(data.exog)
        res1 = OLS(data.endog, data.exog).fit()
        cls.Ttest1 = res1.t_test(R)
Example #13
    def __init__(self):
        super(self.__class__, self).__init__()  #initialize DGP

        nobs = self.nobs
        y_true, x, exog = self.y_true, self.x, self.exog

        np.random.seed(8765993)
        sigma_noise = 0.1
        y = y_true + sigma_noise * np.random.randn(nobs)

        m = AdditiveModel(x)
        m.fit(y)
        res_gam = m.results  #TODO: currently attached to class

        res_ols = OLS(y, exog).fit()

        #Note: there still are some naming inconsistencies
        self.res1 = res1 = Dummy()  #for gam model
        #res2 = Dummy() #for benchmark
        self.res2 = res2 = res_ols  #reuse existing ols results, will add additional

        res1.y_pred = res_gam.predict(x)
        res2.y_pred = res_ols.model.predict(res_ols.params, exog)
        res1.y_predshort = res_gam.predict(x[:10])

        slopes = [i for ss in m.smoothers for i in ss.params[1:]]

        const = res_gam.alpha + sum([ss.params[1] for ss in m.smoothers])
        #print const, slopes
        res1.params = np.array([const] + slopes)
Example #14
    def fitbygroups(self):
        '''Fit OLS regression for each group separately.

        Returns
        -------
        results are attached

        olsbygroup : dictionary of result instance
            the returned regression results for each group
        sigmabygroup : array (ngroups,) (this should be called sigma2group ??? check)
            mse_resid for each group
        weights : array (nobs,)
            standard deviation of group extended to the original observations. This can
            be used as weights in WLS for group-wise heteroscedasticity.



        '''
        olsbygroup = {}
        sigmabygroup = []
        for gi, group in enumerate(
                self.unique):  #np.arange(len(self.unique))):
            groupmask = self.groupsint == gi  #group index
            res = OLS(self.endog[groupmask], self.exog[groupmask]).fit()
            olsbygroup[group] = res
            sigmabygroup.append(res.mse_resid)
        self.olsbygroup = olsbygroup
        self.sigmabygroup = np.array(sigmabygroup)
        self.weights = np.sqrt(
            self.sigmabygroup[self.groupsint])  #TODO:chk sqrt
Example #15
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     ols_res = OLS(data.endog, data.exog).fit()
     gls_res = GLS(data.endog, data.exog).fit()
     cls.res1 = gls_res
     cls.res2 = ols_res
Example #16
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Number of lags for which the pacf is returned.  Lag 0 is included
        and is always equal to 1.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags + 1 elements (including lag 0)

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    xlags, x0 = lagmat(x, nlags, original='sep')
    #xlags = sm.add_constant(lagmat(x, nlags), prepend=True)
    xlags = add_constant(xlags, prepend=True)
    pacf = [1.]
    for k in range(1, nlags + 1):
        res = OLS(x0[k:], xlags[k:, :k + 1]).fit()
        #np.take(xlags[k:], range(1,k+1)+[-1],

        pacf.append(res.params[-1])
    return np.array(pacf)
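
The same OLS-based partial autocorrelation is available in current statsmodels through `statsmodels.tsa.stattools.pacf` with `method='ols'`; a hedged sketch on a simulated AR(1) series:

import numpy as np
from statsmodels.tsa.stattools import pacf

np.random.seed(4)
nobs = 1000
x = np.zeros(nobs)
for t in range(1, nobs):
    x[t] = 0.6 * x[t - 1] + np.random.randn()     # AR(1) with coefficient 0.6

p = pacf(x, nlags=10, method='ols')
print(p[1])     # close to 0.6; higher-order partial autocorrelations close to 0
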
Example #17
 def setupClass(cls):
     np.random.seed(54321)
     cls.endog_n_ = np.random.uniform(0, 20, size=30)
     cls.endog_n_one = cls.endog_n_[:, None]
     cls.exog_n_ = np.random.uniform(0, 20, size=30)
     cls.exog_n_one = cls.exog_n_[:, None]
     cls.degen_exog = cls.exog_n_one[:-1]
     cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one)
     cls.mod1.df_model += 1
     #cls.mod1.df_resid -= 1
     cls.res1 = cls.mod1.fit()
     # Note that these are created for every subclass..
     # A little extra overhead probably
     cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one)
     cls.mod2.df_model += 1
     cls.res2 = cls.mod2.fit()
Example #18
    def spec_hausman(self, dof=None):
        '''Hausman's specification test


        See Also
        --------
        spec_hausman : generic function for Hausman's specification test

        '''
        #use normalized cov_params for OLS

        endog, exog = self.model.endog, self.model.exog  # assumption: data taken from the enclosing model
        resols = OLS(endog, exog).fit()
        normalized_cov_params_ols = resols.model.normalized_cov_params
        se2 = resols.mse_resid

        params_diff = self._results.params - resols.params

        cov_diff = np.linalg.pinv(self.xhatprod) - normalized_cov_params_ols
        #TODO: the following is very inefficient, solves problem (svd) twice
        #use linalg.lstsq or svd directly
        #cov_diff will very often be in-definite (singular)
        if not dof:
            dof = tools.rank(cov_diff)
        cov_diffpinv = np.linalg.pinv(cov_diff)
        H = np.dot(params_diff, np.dot(cov_diffpinv, params_diff))/se2
        pval = stats.chi2.sf(H, dof)

        return H, pval, dof
Example #19
def test_cov_cluster_2groups():
    #comparing cluster robust standard errors to Petersen
    #requires Petersen's test_data
    #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt
    import os
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir, "test_data.txt")
    pet = np.genfromtxt(fpath)
    endog = pet[:, -1]
    group = pet[:, 0].astype(int)
    time = pet[:, 1].astype(int)
    exog = add_constant(pet[:, 2], prepend=True)
    res = OLS(endog, exog).fit()

    cov01, covg, covt = sw.cov_cluster_2groups(res, group, group2=time)

    #Reference number from Petersen
    #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.htm

    bse_petw = [0.0284, 0.0284]
    bse_pet0 = [0.0670, 0.0506]
    bse_pet1 = [0.0234, 0.0334]  #year
    bse_pet01 = [0.0651, 0.0536]  #firm and year
    bse_0 = sw.se_cov(covg)
    bse_1 = sw.se_cov(covt)
    bse_01 = sw.se_cov(cov01)
    #print res.HC0_se, bse_petw - res.HC0_se
    #print bse_0, bse_0 - bse_pet0
    #print bse_1, bse_1 - bse_pet1
    #print bse_01, bse_01 - bse_pet01
    assert_almost_equal(bse_petw, res.HC0_se, decimal=4)
    assert_almost_equal(bse_0, bse_pet0, decimal=4)
    assert_almost_equal(bse_1, bse_pet1, decimal=4)
    assert_almost_equal(bse_01, bse_pet01, decimal=4)
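
Without Petersen's test file, the clustering idea can still be sketched with simulated panel data; the modern fit-time option `cov_type='cluster'` (assumed here instead of the sandwich helpers above) gives one-way cluster-robust standard errors.

import numpy as np
import statsmodels.api as sm

np.random.seed(5)
n_firms, n_years = 50, 10
firm = np.repeat(np.arange(n_firms), n_years)
firm_effect = np.random.randn(n_firms)[firm]          # within-firm error correlation
x = np.random.randn(n_firms * n_years)
y = 1.0 + 0.5 * x + firm_effect + np.random.randn(n_firms * n_years)

res = sm.OLS(y, sm.add_constant(x)).fit(cov_type='cluster', cov_kwds={'groups': firm})
print(res.bse)    # typically larger than the default non-robust standard errors
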
Example #20
def test_hac_simple():

    from gwstatsmodels.datasets import macrodata
    d2 = macrodata.load().data
    g_gdp = 400 * np.diff(np.log(d2['realgdp']))
    g_inv = 400 * np.diff(np.log(d2['realinv']))
    exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=True)
    res_olsg = OLS(g_inv, exogg).fit()

    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=TRUE)
    #Lag truncation parameter chosen: 4
    #                     (Intercept)                   ggdp                  lint
    cov1_r = [
        [1.40643899878678802, -0.3180328707083329709, -0.060621111216488610],
        [-0.31803287070833292, 0.1097308348999818661, 0.000395311760301478],
        [-0.06062111121648865, 0.0003953117603014895, 0.087511528912470993]
    ]

    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=FALSE)
    #Lag truncation parameter chosen: 4
    #                    (Intercept)                  ggdp                  lint
    cov2_r = [
        [1.3855512908840137, -0.313309610252268500, -0.059720797683570477],
        [-0.3133096102522685, 0.108101169035130618, 0.000389440793564339],
        [-0.0597207976835705, 0.000389440793564336, 0.086211852740503622]
    ]

    cov1, se1 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=True)
    cov2, se2 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=False)
    assert_almost_equal(cov1, cov1_r, decimal=14)
    assert_almost_equal(cov2, cov2_r, decimal=14)
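
The covariance computed by `sw.cov_hac_simple` corresponds to the fit-time option `cov_type='HAC'` in current statsmodels; a hedged sketch with autocorrelated errors:

import numpy as np
import statsmodels.api as sm

np.random.seed(6)
nobs = 300
x = np.random.randn(nobs)
e = np.zeros(nobs)
for t in range(1, nobs):
    e[t] = 0.5 * e[t - 1] + np.random.randn()     # AR(1) errors
y = 1.0 + 2.0 * x + e

res = sm.OLS(y, sm.add_constant(x)).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
print(res.bse)                                    # Newey-West standard errors with 4 lags
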
Example #21
    def iterative_fit(self, maxiter=3):
        """
        Perform an iterative two-step procedure to estimate a WLS model.

        The model is assumed to have heteroscedastic errors.
        The variance is estimated by OLS regression of the link transformed
        squared residuals on Z, i.e.::

           link(sigma_i) = x_i*gamma.

        Parameters
        ----------
        maxiter : integer, optional
            the number of iterations

        Notes
        -----
        maxiter=1: returns the estimate based on the given weights
        maxiter=2: performs a second estimation with the updated weights,
                   this is 2-step estimation
        maxiter>2: iteratively estimate and update the weights

        TODO: possible extension stop iteration if change in parameter
            estimates is smaller than x_tol

        Repeated calls to fit_iterative will do one redundant pinv_wexog
        calculation. Calling fit_iterative(maxiter) once does not do any
        redundant recalculations (whitening or calculating pinv_wexog).

        """

        import collections
        self.history = collections.defaultdict(list)  #not really necessary
        res_resid = None  #if maxiter < 2 no updating
        for i in range(maxiter):
            #pinv_wexog is cached
            if hasattr(self, 'pinv_wexog'):
                del self.pinv_wexog
            #self.initialize()
            #print 'wls self',
            results = self.fit()
            self.history['self_params'].append(results.params)
            if not i == maxiter - 1:  #skip for last iteration, could break instead
                #print 'ols',
                self.results_old = results  #for debugging
                #estimate heteroscedasticity
                res_resid = OLS(self.link(results.resid**2),
                                self.exog_var).fit()
                self.history['ols_params'].append(res_resid.params)
                #update weights
                self.weights = 1. / self.linkinv(res_resid.fittedvalues)
                self.weights /= self.weights.max()  #not required
                self.weights[self.weights < 1e-14] = 1e-14  #clip
                #print 'in iter', i, self.weights.var() #debug, do weights change
                self.initialize()

        #note results is the wrapper, results._results is the results instance
        results._results.results_residual_regression = res_resid
        return results
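
The two-step idea in `iterative_fit` can be sketched directly: fit OLS, regress the log of the squared residuals on the variance regressors, turn the fitted values into weights, and refit with WLS. A hedged sketch, not the original implementation:

import numpy as np
import statsmodels.api as sm

np.random.seed(7)
nobs = 400
x = np.random.uniform(1, 5, size=nobs)
sigma = 0.5 * x                                   # heteroscedastic: sd grows with x
y = 1.0 + 2.0 * x + sigma * np.random.randn(nobs)

exog = sm.add_constant(x)
res_ols = sm.OLS(y, exog).fit()

# step 2: model the log squared residuals (log link) on the same regressors
res_var = sm.OLS(np.log(res_ols.resid ** 2), exog).fit()
weights = 1.0 / np.exp(res_var.fittedvalues)      # inverse of the fitted variances
res_wls = sm.WLS(y, exog, weights=weights).fit()
print(res_ols.bse, res_wls.bse)
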
Example #22
 def fit(self):
     res_pooled = self.fit_ols()  #get starting estimate
     sigma_i = self.get_within_cov(res_pooled.resid)
     self.cholsigmainv_i = np.linalg.cholesky(np.linalg.pinv(sigma_i)).T
     wendog = self.whiten_groups(self.endog, self.cholsigmainv_i)
     wexog = self.whiten_groups(self.exog, self.cholsigmainv_i)
     print(wendog.shape, wexog.shape)
     self.res1 = OLS(wendog, wexog).fit()
     return self.res1
Example #23
    def test_recursive_residuals(self):

        reccumres_standardize = np.array([-2.151, -3.748, -3.114, -3.096,
        -1.865, -2.230, -1.194, -3.500, -3.638, -4.447, -4.602, -4.631, -3.999,
        -4.830, -5.429, -5.435, -6.554, -8.093, -8.567, -7.532, -7.079, -8.468,
        -9.320, -12.256, -11.932, -11.454, -11.690, -11.318, -12.665, -12.842,
        -11.693, -10.803, -12.113, -12.109, -13.002, -11.897, -10.787, -10.159,
        -9.038, -9.007, -8.634, -7.552, -7.153, -6.447, -5.183, -3.794, -3.511,
        -3.979, -3.236, -3.793, -3.699, -5.056, -5.724, -4.888, -4.309, -3.688,
        -3.918, -3.735, -3.452, -2.086, -6.520, -7.959, -6.760, -6.855, -6.032,
        -4.405, -4.123, -4.075, -3.235, -3.115, -3.131, -2.986, -1.813, -4.824,
        -4.424, -4.796, -4.000, -3.390, -4.485, -4.669, -4.560, -3.834, -5.507,
        -3.792, -2.427, -1.756, -0.354, 1.150, 0.586, 0.643, 1.773, -0.830,
        -0.388, 0.517, 0.819, 2.240, 3.791, 3.187, 3.409, 2.431, 0.668, 0.957,
        -0.928, 0.327, -0.285, -0.625, -2.316, -1.986, -0.744, -1.396, -1.728,
        -0.646, -2.602, -2.741, -2.289, -2.897, -1.934, -2.532, -3.175, -2.806,
        -3.099, -2.658, -2.487, -2.515, -2.224, -2.416, -1.141, 0.650, -0.947,
        0.725, 0.439, 0.885, 2.419, 2.642, 2.745, 3.506, 4.491, 5.377, 4.624,
        5.523, 6.488, 6.097, 5.390, 6.299, 6.656, 6.735, 8.151, 7.260, 7.846,
        8.771, 8.400, 8.717, 9.916, 9.008, 8.910, 8.294, 8.982, 8.540, 8.395,
        7.782, 7.794, 8.142, 8.362, 8.400, 7.850, 7.643, 8.228, 6.408, 7.218,
        7.699, 7.895, 8.725, 8.938, 8.781, 8.350, 9.136, 9.056, 10.365, 10.495,
        10.704, 10.784, 10.275, 10.389, 11.586, 11.033, 11.335, 11.661, 10.522,
        10.392, 10.521, 10.126, 9.428, 9.734, 8.954, 9.949, 10.595, 8.016,
        6.636, 6.975])

        rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
        assert_equal(np.round(rr[5][1:], 3), reccumres_standardize) #extra zero in front
        #assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
        assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize),3)
        assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)

        #regression number, visually checked with graph from gretl
        ub0 = np.array([ 13.37318571,  13.50758959,  13.64199346,  13.77639734,
                        13.91080121])
        ub1 = np.array([ 39.44753774,  39.58194162,  39.7163455 ,  39.85074937,
                        39.98515325])
        lb, ub = rr[6]
        assert_almost_equal(ub[:5], ub0, decimal=7)
        assert_almost_equal(lb[:5], -ub0, decimal=7)
        assert_almost_equal(ub[-5:], ub1, decimal=7)
        assert_almost_equal(lb[-5:], -ub1, decimal=7)

        #test a few values with explicit OLS
        endog = self.res.model.endog
        exog = self.res.model.exog
        params = []
        ypred = []
        for i in range(3,10):
            resi = OLS(endog[:i], exog[:i]).fit()
            ypred.append(resi.model.predict(resi.params, exog[i]))
            params.append(resi.params)
        assert_almost_equal(rr[2][3:10], ypred, decimal=12)
        assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
        assert_almost_equal(rr[1][2:9], params, decimal=12)
Example #24
 def fitpooled(self):
     '''fit the pooled model, which assumes there are no differences across groups
     '''
     if self.het:
         if not hasattr(self, 'weights'):
             self.fitbygroups()
         weights = self.weights
         res = WLS(self.endog, self.exog, weights=weights).fit()
     else:
         res = OLS(self.endog, self.exog).fit()
     self.lspooled = res
Example #25
    def __init__(self):
        super(self.__class__, self).__init__()  #initialize DGP

        y, x, exog = self.y, self.x, self.exog

        #use order = 2 in regression
        pmod = smoothers.PolySmoother(2, x)
        pmod.fit(y)  #no return

        self.res_ps = pmod
        self.res2 = OLS(y, exog[:, :2 + 1]).fit()
Example #26
def plot_partregress_ax(endog, exog_i, exog_others, varname='',
                        title_fontsize=None, ax=None):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray
       endogenous or response variable
    exog_i : ndarray
        exogenous, explanatory variable
    exog_others : ndarray
        other exogenous, explanatory variables, the effect of these variables
        will be removed by OLS regression
    varname : str
        name of the variable used in the title
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_partregress : Plot partial regression for a set of regressors.

    """
    fig, ax = utils.create_mpl_ax(ax)

    res1a = OLS(endog, exog_others).fit()
    res1b = OLS(exog_i, exog_others).fit()
    ax.plot(res1b.resid, res1a.resid, 'o')
    res1c = OLS(res1a.resid, res1b.resid).fit()
    ax.plot(res1b.resid, res1c.fittedvalues, '-', color='k')
    ax.set_title('Partial Regression plot %s' % varname,
                 fontsize=title_fontsize)# + namestr)

    return fig
Example #27
    def fit(self):
        alpha0 = 0.1  #startvalue
        func = self.fit_conditional
        fitres = optimize.fmin(func, alpha0)

        # fit_conditional only returns ssr for now
        alpha = fitres[0]
        y = self.ar1filter(self.endog, alpha)
        x = self.ar1filter(self.exog, alpha)
        reso = OLS(y, x).fit()

        return fitres, reso
Example #28
    def __init__(self):
        super(self.__class__, self).__init__()  #initialize DGP

        y, x, exog = self.y, self.x, self.exog

        #use order = 3 in regression
        pmod = smoothers.PolySmoother(3, x)
        #pmod.fit(y)  #no return
        pmod.smooth(y)  #no return, use alias for fit

        self.res_ps = pmod
        self.res2 = OLS(y, exog[:, :3 + 1]).fit()
Example #29
    def setupClass(cls):
        from results.results_regression import LongleyGls

        data = longley.load()
        exog = add_constant(np.column_stack((data.exog[:, 1],
                                             data.exog[:, 4])))
        tmp_results = OLS(data.endog, exog).fit()
        rho = np.corrcoef(tmp_results.resid[1:],
                          tmp_results.resid[:-1])[0][1]  # by assumption
        order = toeplitz(np.arange(16))
        sigma = rho**order
        GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
        cls.res1 = GLS_results
        cls.res2 = LongleyGls()
Example #30
    def _ols_xnoti(self, drop_idx, endog_idx='endog', store=True):
        '''regression results from LOVO auxiliary regression with cache


        The result instances are stored, which could use a large amount of
        memory if the datasets are large. There are too many combinations to
        store them all, except for small problems.

        Parameters
        ----------
        drop_idx : int
            index of exog that is dropped from the regression
        endog_idx : 'endog' or int
            If 'endog', then the endogenous variable of the result instance
            is regressed on the exogenous variables, excluding the one at
            drop_idx. If endog_idx is an integer, then the exog with that
            index is regressed with OLS on all other exogenous variables.
            (The latter is the auxiliary regression for the variance inflation
            factor.)

        this needs more thought, memory versus speed
        not yet used in any other parts, not sufficiently tested
        '''
        #reverse the structure, access store, if fail calculate ?
        #this creates keys in store even if store = false ! bug
        if endog_idx == 'endog':
            stored = self.aux_regression_endog
            if drop_idx in stored:   # dict membership, hasattr does not work here
                return stored[drop_idx]
            x_i = self.results.model.endog

        else:
            #nested dictionary
            try:
                return self.aux_regression_exog[endog_idx][drop_idx]
            except KeyError:
                pass

            stored = self.aux_regression_exog.setdefault(endog_idx, {})

            x_i = self.exog[:, endog_idx]

        k_vars = self.exog.shape[1]
        mask = np.arange(k_vars) != drop_idx
        x_noti = self.exog[:, mask]
        res = OLS(x_i, x_noti).fit()
        if store:
            stored[drop_idx] = res

        return res
Example #31
    def __init__(self):
        d = macrodata.load().data
        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
        lint = d['realint'][:-1]
        tbilrate = d['tbilrate'][:-1]

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
        exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)
        exogg3 = add_constant(np.c_[gs_l_realgdp], prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()

        res_ols3 = OLS(endogg, exogg3).fit()

        self.res = res_ols
        self.res2 = res_ols2
        self.res3 = res_ols3
        self.endog = self.res.model.endog
        self.exog = self.res.model.exog
Example #32
    def fit(self, lambd=1.):
        #maybe iterate
        #preliminary estimate
        res_gls = GLS(self.endog, self.exog, sigma=self.sigma).fit()
        res_resid = OLS(res_gls.resid**2, self.exog_var).fit()
        #or  log-link
        #res_resid = OLS(np.log(res_gls.resid**2), self.exog_var).fit()
        #here I could use whiten and current instance instead of delegating
        #but this is easier
        #see pattern of GLSAR, calls self.initialize and self.fit
        res_wls = WLS(self.endog,
                      self.exog,
                      weights=1. / res_resid.fittedvalues).fit()

        res_wls._results.results_residual_regression = res_resid
        return res_wls
Example #33
def variance_inflation_factor(exog, exog_idx):
    '''variance inflation factor, VIF, for one exogenous variable

    The variance inflation factor is a measure of the increase in the
    variance of the parameter estimates when an additional variable, given by
    exog_idx, is added to the linear regression. It is a measure of
    multicollinearity of the design matrix, exog.

    One recommendation is that if VIF is greater than 5, then the explanatory
    variable given by exog_idx is highly collinear with the other explanatory
    variables, and the parameter estimates will have large standard errors
    because of this.

    Parameters
    ----------
    exog : ndarray, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns
    -------
    vif : float
        variance inflation factor

    Notes
    -----
    This function does not save the auxiliary regression.

    See Also
    --------
    xxx : class for regression diagnostics  TODO: doesn't exist yet

    References
    ----------
    http://en.wikipedia.org/wiki/Variance_inflation_factor

    '''
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    r_squared_i = OLS(x_i, x_noti).fit().rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
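
The same diagnostic ships in current statsmodels as `statsmodels.stats.outliers_influence.variance_inflation_factor`; a hedged sketch with two nearly collinear columns:

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

np.random.seed(8)
nobs = 200
x1 = np.random.randn(nobs)
x2 = x1 + 0.1 * np.random.randn(nobs)     # nearly collinear with x1
x3 = np.random.randn(nobs)
exog = sm.add_constant(np.column_stack((x1, x2, x3)))

vifs = [variance_inflation_factor(exog, i) for i in range(1, exog.shape[1])]
print(vifs)     # first two VIFs should be large, the third close to 1
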
Example #34

        #using GMM and IV2SLS classes
        #----------------------------

        mod = IVGMM(endog, exog, instrument, nmoms=instrument.shape[1])
        res = mod.fit()
        modgmmols = IVGMM(endog, exog, exog, nmoms=exog.shape[1])
        resgmmols = modgmmols.fit()
        #the next is the same as IV2SLS, (Z'Z)^{-1} as weighting matrix
        modgmmiv = IVGMM(endog, exog, instrument, nmoms=instrument.shape[1]) #same as mod
        resgmmiv = modgmmiv.fitgmm(np.ones(exog.shape[1], float),
                        weights=np.linalg.inv(np.dot(instrument.T, instrument)))
        modls = IV2SLS(endog, exog, instrument)
        resls = modls.fit()
        modols = OLS(endog, exog)
        resols = modols.fit()

        print('\nIV case')
        print('params')
        print('IV2SLS', resls.params)
        print('GMMIV ', resgmmiv)  # .params
        print('GMM   ', res.params)
        print('diff  ', res.params - resls.params)
        print('OLS   ', resols.params)
        print('GMMOLS', resgmmols.params)

        print('\nbse')
        print('IV2SLS', resls.bse)
        print('GMM   ', mod.bse)   #bse currently only attached to model not results
        print('diff  ', mod.bse - resls.bse)
Example #35
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R2 = [[0,1,-1,0,0,0,0],[0, 0, 0, 0, 1, -1, 0]]
     cls.Ftest1 = res1.f_test(R2)
Example #36
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                             prepend=True)

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]],
                             prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)  #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params

        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
            [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670],  # ***
            [4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086],  # ***
            [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]
        ])  #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.113973),
                               endog_std=("S.D. dependent var", 18.67447),
                               ssr=("Sum squared resid", 22530.90),
                               mse_resid_sqrt=("S.E. of regression", 10.66735),
                               rsquared=("R-squared", 0.676973),
                               rsquared_adj=("Adjusted R-squared", 0.673710),
                               fvalue=("F(2, 198)", 221.0475),
                               f_pvalue=("P-value(F)", 3.56e-51),
                               resid_acf1=("rho", -0.003481),
                               dw=("Durbin-Watson", 1.993858))

        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:, 0], 4)
        assert_almost_equal(res.bse, partable[:, 1], 6)
        assert_almost_equal(res.tvalues, partable[:, 2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        assert_almost_equal(res.fvalue,
                            result_gretl_g1['fvalue'][1],
                            decimal=4)
        assert_approx_equal(res.f_pvalue,
                            result_gretl_g1['f_pvalue'][1],
                            significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2  #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:, 0], 4)
        assert_almost_equal(res.bse, partable[:, 1], 3)
        assert_almost_equal(res.tvalues, partable[:, 2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        assert_almost_equal(res.fvalue,
                            result_gretl_g1['fvalue'][1],
                            decimal=0)
        assert_almost_equal(res.f_pvalue,
                            result_gretl_g1['f_pvalue'][1],
                            decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2, 4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2, 4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)
        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''
        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''
        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''
        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''

        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
            [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049],  # ***
            [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258],  #***
            [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939]
        ])  # **

        result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.257395),
                               endog_std=("S.D. dependent var", 18.73915),
                               ssr=("Sum squared resid", 22799.68),
                               mse_resid_sqrt=("S.E. of regression", 10.70380),
                               rsquared=("R-squared", 0.676978),
                               rsquared_adj=("Adjusted R-squared", 0.673731),
                               fvalue=("F(2, 199)", 90.79971),
                               f_pvalue=("P-value(F)", 9.53e-29),
                               llf=("Log-likelihood", -763.9752),
                               aic=("Akaike criterion", 1533.950),
                               bic=("Schwarz criterion", 1543.875),
                               hqic=("Hannan-Quinn", 1537.966),
                               resid_acf1=("rho", -0.107341),
                               dw=("Durbin-Watson", 2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier = [0.494432, 0.621549, 198,
                                "t"]  #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196,
                     "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"]  # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breush_pagan = [1.302014, 0.521520, 2,
                            "chi2"]  #TODO: not available
        het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        cur_dir = os.path.abspath(os.path.dirname(__file__))
        fpath = os.path.join(cur_dir,
                             'results/leverage_influence_ols_nostars.txt')
        lev = np.genfromtxt(fpath,
                            skip_header=3,
                            skip_footer=1,
                            converters={0: lambda s: s})
        #either numpy 1.6 or python 3.2 changed behavior
        if np.isnan(lev[-1]['f1']):
            lev = np.genfromtxt(fpath,
                                skip_header=3,
                                skip_footer=2,
                                converters={0: lambda s: s})

        lev.dtype.names = names

        res = res_ols  #for easier copying

        cov_hac, bse_hac = sw.cov_hac_simple(res,
                                             nlags=4,
                                             use_correction=False)

        assert_almost_equal(res.params, partable[:, 0], 5)
        assert_almost_equal(bse_hac, partable[:, 1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        assert_almost_equal(res.rsquared,
                            result_gretl_g1['rsquared'][1],
                            decimal=6)  #FAIL
        assert_almost_equal(res.rsquared_adj,
                            result_gretl_g1['rsquared_adj'][1],
                            decimal=6)  #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid),
                            result_gretl_g1['mse_resid_sqrt'][1],
                            decimal=5)
        #f-value is based on cov_hac I guess
        #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6, 5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6, 5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [
            oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]
        ]

        infl = oi.OLSInfluence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
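
The leverage/DFFITS comparison above relies on `OLSInfluence`; a hedged, self-contained sketch of the same diagnostics on simulated data:

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence

np.random.seed(10)
x = np.random.randn(100)
y = 1.0 + 2.0 * x + np.random.randn(100)
res = sm.OLS(y, sm.add_constant(x)).fit()

infl = OLSInfluence(res)
print(infl.hat_matrix_diag[:5])   # leverage of the first observations
print(infl.dffits[0][:5])         # dffits returns (values, threshold)
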
Example #37
def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    '''four tests for granger causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent, based on the F test, which
    is identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d, (nobs,2)
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    verbose : bool
        print results if true

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        teststatistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted
        model and the restriction (contrast) matrix for the parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    'params_ftest', 'ssr_ftest' are based on F test

    'ssr_chi2test', 'lrtest' are based on chi-square test

    '''
    from scipy import stats # lazy import

    resli = {}

    for mlg in range(1, maxlag+1):
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mlg)
        mxlg = mlg #+ 1 # Note number of lags starting at zero in lagmat

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        if addconst:
            dtaown = add_constant(dta[:,1:mxlg+1])
            dtajoint = add_constant(dta[:,1:])
        else:
            raise ValueError('Not Implemented')
            dtaown = dta[:,1:mxlg]
            dtajoint = dta[:,1:]

        #run ols on both models without and with lags of second variable
        res2down = OLS(dta[:,0], dtaown).fit()
        res2djoint = OLS(dta[:,0], dtajoint).fit()

        #print results
        #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = (res2down.ssr-res2djoint.ssr)/res2djoint.ssr/(mxlg)*res2djoint.df_resid
        if verbose:
            print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs*(res2down.ssr-res2djoint.ssr)/res2djoint.ssr
        if verbose:
            print('ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg))
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        #likelihood ratio test pvalue:
        lr = -2*(res2down.llf-res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, stats.chi2.sf(lr, mxlg), mxlg))
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)), np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
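
A hedged usage sketch (not from the original source) with the statsmodels version of `grangercausalitytests`, using two simulated series in which the second column leads the first:

import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

np.random.seed(9)
nobs = 500
x2 = np.random.randn(nobs)
x1 = np.zeros(nobs)
for t in range(2, nobs):
    x1[t] = 0.5 * x1[t - 1] + 0.8 * x2[t - 2] + np.random.randn()

data = np.column_stack((x1, x2))
res = grangercausalitytests(data, maxlag=3)
print(res[2][0]['ssr_ftest'])    # F test at lag 2 should strongly reject "no Granger causality"
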