Example #1
0
def notyet_atst():
    d = macrodata.load().data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint],prepend=True)
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data

    params = np.array([-272.3986041341653, 0.1779455206941112,
                       0.2149432424658157])
    cov_hac_4 = np.array([1321.569466333051, -0.2318836566017612,
                37.01280466875694, -0.2318836566017614, 4.602339488102263e-05,
                -0.0104687835998635, 37.012804668757, -0.0104687835998635,
                21.16037144168061]).reshape(3,3, order='F')
    cov_hac_10 = np.array([2027.356101193361, -0.3507514463299015,
        54.81079621448568, -0.350751446329901, 6.953380432635583e-05,
        -0.01268990195095196, 54.81079621448564, -0.01268990195095195,
        22.92512402151113]).reshape(3,3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47,
                          pvalue=2.154956842194898e-33, distr='f')

    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')


    harvey_collier = dict(stat=2.28042114041313, df=199,
                          pvalue=0.02364236161988260, distr='t')
    #hc = harvtest(fm, order.by=ggdp , data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783, df=199,
                          pvalue=0.4531244858006127, distr='t')
Example #2
0
def coint(y1, y2, regression="c"):
    """
    This is a simple cointegration test. It uses a unit-root test on the
    residuals to test for a cointegrated relationship.

    See Hamilton (1994) 19.2

    Parameters
    ----------
    y1 : array_like, 1d
        first element in cointegrating vector
    y2 : array_like
        remaining elements in cointegrating vector
    regression : str {'c'}
        Included in the regression:
        * 'c' : Constant

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 % levels.


    Notes
    -----
    The null hypothesis is that there is no cointegration; the alternative is
    that there is a cointegrating relationship. If the p-value is small, below
    a critical size, then we can reject the hypothesis that there is no
    cointegrating relationship.

    P-values are obtained through regression surface approximation from
    MacKinnon 1994.

    References
    ----------
    MacKinnon, J.G. 1994.  "Approximate asymptotic distribution functions for
        unit-root and cointegration tests."  `Journal of Business and Economic
        Statistics` 12, 167-76.

    """
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("regression option %s not understood" % regression)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if regression == 'c':
        y2 = add_constant(y2)
    st1_resid = OLS(y1, y2).fit().resid  #stage one residuals
    lgresid_cons = add_constant(st1_resid[0:-1])
    uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit()
    coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0]
    pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None)
    crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1))
    return coint_t, pvalue, crit_value
Example #3
0
def coint(y1, y2, regression="c"):
    """
    This is a simple cointegration test. It uses a unit-root test on the
    residuals to test for a cointegrated relationship.

    See Hamilton (1994) 19.2

    Parameters
    ----------
    y1 : array_like, 1d
        first element in cointegrating vector
    y2 : array_like
        remaining elements in cointegrating vector
    regression : str {'c'}
        Included in the regression:
        * 'c' : Constant

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 % levels.

    Notes
    -----
    The null hypothesis is that there is no cointegration; the alternative is
    that there is a cointegrating relationship. If the p-value is small, below
    a critical size, then we can reject the hypothesis that there is no
    cointegrating relationship.

    P-values are obtained through regression surface approximation from
    MacKinnon 1994.

    References
    ----------
    MacKinnon, J.G. 1994.  "Approximate asymptotic distribution functions for
        unit-root and cointegration tests."  `Journal of Business and Economic
        Statistics` 12, 167-76.

    """
    regression = regression.lower()
    if regression not in ['c','nc','ct','ctt']:
        raise ValueError("regression option %s not understood" % regression)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if regression == 'c':
        y2 = add_constant(y2)
    st1_resid = OLS(y1, y2).fit().resid #stage one residuals
    lgresid_cons = add_constant(st1_resid[0:-1])
    uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit()
    coint_t = (uroot_reg.params[0]-1)/uroot_reg.bse[0]
    pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None)
    crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1))
    return coint_t, pvalue, crit_value
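
The two coint listings above implement the Engle-Granger two-step procedure: regress y1 on y2 plus a constant, then run a Dickey-Fuller-style unit-root regression on the stage-one residuals. Below is a minimal numpy-only sketch of those two stages on simulated data; the seed, series, and variable names are illustrative, and the MacKinnon p-value and critical-value lookups are omitted.

import numpy as np

rng = np.random.RandomState(0)
n = 500
x = np.cumsum(rng.normal(size=n))        # common stochastic trend
y = 2.0 + 0.5 * x + rng.normal(size=n)   # cointegrated with x by construction

# stage 1: OLS of y on [const, x]; keep the residuals
X = np.column_stack((np.ones(n), x))
beta = np.linalg.lstsq(X, y)[0]
resid = y - X.dot(beta)

# stage 2: regress resid[t] on [const, resid[t-1]] and form the
# Dickey-Fuller-style t-statistic for (rho - 1)
Z = np.column_stack((np.ones(n - 1), resid[:-1]))
gamma = np.linalg.lstsq(Z, resid[1:])[0]
u = resid[1:] - Z.dot(gamma)
s2 = u.dot(u) / (len(u) - Z.shape[1])
cov = s2 * np.linalg.inv(Z.T.dot(Z))
coint_t = (gamma[1] - 1.0) / np.sqrt(cov[1, 1])
print(coint_t)  # strongly negative values reject "no cointegration"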
Example #4
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     ols_res = OLS(data.endog, data.exog).fit()
     gls_res = GLS(data.endog, data.exog).fit()
     cls.res1 = gls_res
     cls.res2 = ols_res
Example #5
0
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Number of lags for which pacf is returned.  Lag 0 is not returned.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, maxlag+1 elements

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    xlags, x0 = lagmat(x, nlags, original='sep')
    #xlags = sm.add_constant(lagmat(x, nlags), prepend=True)
    xlags = add_constant(xlags, prepend=True)
    pacf = [1.]
    for k in range(1, nlags + 1):
        res = OLS(x0[k:], xlags[k:, :k + 1]).fit()
        #np.take(xlags[k:], range(1,k+1)+[-1],

        pacf.append(res.params[-1])
    return np.array(pacf)
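
pacf_ols computes the lag-k partial autocorrelation as the last coefficient of an OLS regression of the series on a constant and its first k lags. Here is a self-contained numpy sketch of that idea for a single lag, checked against a simulated AR(1); the helper name pacf_at_lag and the seed are illustrative.

import numpy as np

rng = np.random.RandomState(12345)
n, phi = 1000, 0.6
x = np.zeros(n)
for t in range(1, n):
    x[t] = phi * x[t - 1] + rng.normal()

def pacf_at_lag(x, k):
    # regress x[t] on a constant and x[t-1], ..., x[t-k];
    # the coefficient on x[t-k] is the lag-k partial autocorrelation
    lags = [x[k - j: len(x) - j] for j in range(1, k + 1)]
    X = np.column_stack([np.ones(len(x) - k)] + lags)
    return np.linalg.lstsq(X, x[k:])[0][-1]

print(pacf_at_lag(x, 1))  # close to phi = 0.6
print(pacf_at_lag(x, 2))  # close to 0 for an AR(1)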
Example #6
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     ols_res = OLS(data.endog, data.exog).fit()
     gls_res = GLS(data.endog, data.exog).fit()
     cls.res1 = gls_res
     cls.res2 = ols_res
Example #7
0
def test_cov_cluster_2groups():
    #comparing cluster robust standard errors to Peterson
    #requires Petersen's test_data
    #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt
    import os
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir,"test_data.txt")
    pet = np.genfromtxt(fpath)
    endog = pet[:,-1]
    group = pet[:,0].astype(int)
    time = pet[:,1].astype(int)
    exog = add_constant(pet[:,2], prepend=True)
    res = OLS(endog, exog).fit()

    cov01, covg, covt = sw.cov_cluster_2groups(res, group, group2=time)

    #Reference number from Petersen
    #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.htm

    bse_petw = [0.0284, 0.0284]
    bse_pet0 = [0.0670, 0.0506]
    bse_pet1 = [0.0234, 0.0334]  #year
    bse_pet01 = [0.0651, 0.0536]  #firm and year
    bse_0 = sw.se_cov(covg)
    bse_1 = sw.se_cov(covt)
    bse_01 = sw.se_cov(cov01)
    #print res.HC0_se, bse_petw - res.HC0_se
    #print bse_0, bse_0 - bse_pet0
    #print bse_1, bse_1 - bse_pet1
    #print bse_01, bse_01 - bse_pet01
    assert_almost_equal(bse_petw, res.HC0_se, decimal=4)
    assert_almost_equal(bse_0, bse_pet0, decimal=4)
    assert_almost_equal(bse_1, bse_pet1, decimal=4)
    assert_almost_equal(bse_01, bse_pet01, decimal=4)
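
For reference, a one-way cluster-robust covariance is the sandwich (X'X)^-1 S (X'X)^-1, where S sums the score vectors within each cluster; Petersen's two-way numbers above combine the firm and time one-way covariances with a correction for double counting. A minimal numpy sketch of the one-way case, assuming you already have the design matrix and OLS residuals; cluster_cov is an illustrative name, not the sw API.

import numpy as np

def cluster_cov(X, resid, groups):
    # sum the score vectors x_i * u_i within each cluster before
    # forming the middle ("meat") of the sandwich estimator
    XtX_inv = np.linalg.inv(X.T.dot(X))
    scores = X * resid[:, None]
    k = X.shape[1]
    meat = np.zeros((k, k))
    for g in np.unique(groups):
        s = scores[groups == g].sum(axis=0)
        meat += np.outer(s, s)
    return XtX_inv.dot(meat).dot(XtX_inv)

# standard errors: np.sqrt(np.diag(cluster_cov(X, resid, groups)))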
Example #8
0
def test_hac_simple():

    from scikits.statsmodels.datasets import macrodata
    d2 = macrodata.load().data
    g_gdp = 400*np.diff(np.log(d2['realgdp']))
    g_inv = 400*np.diff(np.log(d2['realinv']))
    exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]],prepend=True)
    res_olsg = OLS(g_inv, exogg).fit()



    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=TRUE)
    #Lag truncation parameter chosen: 4
    #                     (Intercept)                   ggdp                  lint
    cov1_r = [[  1.40643899878678802, -0.3180328707083329709, -0.060621111216488610],
             [ -0.31803287070833292,  0.1097308348999818661,  0.000395311760301478],
             [ -0.06062111121648865,  0.0003953117603014895,  0.087511528912470993]]

    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=FALSE)
    #Lag truncation parameter chosen: 4
    #                    (Intercept)                  ggdp                  lint
    cov2_r = [[ 1.3855512908840137, -0.313309610252268500, -0.059720797683570477],
             [ -0.3133096102522685,  0.108101169035130618,  0.000389440793564339],
             [ -0.0597207976835705,  0.000389440793564336,  0.086211852740503622]]

    cov1, se1 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=True)
    cov2, se2 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=False)
    assert_almost_equal(cov1, cov1_r, decimal=14)
    assert_almost_equal(cov2, cov2_r, decimal=14)
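
cov_hac_simple is a Newey-West estimator: a sandwich whose middle term adds Bartlett-weighted autocovariances of the scores up to nlags. A numpy-only sketch of that computation, without the use_correction small-sample factor; hac_cov is an illustrative name.

import numpy as np

def hac_cov(X, resid, nlags):
    # Newey-West covariance of the OLS parameters:
    # (X'X)^-1 S (X'X)^-1, with S built from Bartlett-weighted
    # autocovariances of the scores x_t * u_t
    scores = X * resid[:, None]
    S = scores.T.dot(scores)
    for lag in range(1, nlags + 1):
        w = 1.0 - lag / (nlags + 1.0)
        gamma = scores[lag:].T.dot(scores[:-lag])
        S += w * (gamma + gamma.T)
    XtX_inv = np.linalg.inv(X.T.dot(X))
    return XtX_inv.dot(S).dot(XtX_inv)

# standard errors: np.sqrt(np.diag(hac_cov(X, resid, nlags=4)))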
Example #9
0
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Number of lags for which pacf is returned.  Lag 0 is not returned.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, maxlag+1 elements

    Notes
    -----
    This solves a separate OLS estimation for each desired lag.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    xlags, x0 = lagmat(x, nlags, original='sep')
    #xlags = sm.add_constant(lagmat(x, nlags), prepend=True)
    xlags = add_constant(xlags, prepend=True)
    pacf = [1.]
    for k in range(1, nlags+1):
        res = OLS(x0[k:], xlags[k:,:k+1]).fit()
         #np.take(xlags[k:], range(1,k+1)+[-1],

        pacf.append(res.params[-1])
    return np.array(pacf)
Example #10
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R = np.array([[0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0],
                   [0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0],
                   [0, 0, 0, 0, 0, 1, 0]])
     q = np.array([0, 0, 0, 1, 0])
     cls.Ftest1 = res1.f_test(R, q)
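
In the setup above, each row of R together with the corresponding entry of q encodes one linear restriction R b = q on the Longley coefficients, and f_test computes the Wald F statistic for the joint restriction. A numpy sketch of that statistic under the homoscedastic OLS formula; f_stat is an illustrative name.

import numpy as np

def f_stat(X, y, R, q):
    # Wald F test of the joint linear restriction R b = q:
    # F = (Rb - q)' [s2 R (X'X)^-1 R']^-1 (Rb - q) / J
    n, k = X.shape
    b = np.linalg.lstsq(X, y)[0]
    u = y - X.dot(b)
    s2 = u.dot(u) / (n - k)
    Rb_q = R.dot(b) - q
    middle = s2 * R.dot(np.linalg.inv(X.T.dot(X))).dot(R.T)
    return Rb_q.dot(np.linalg.solve(middle, Rb_q)) / R.shape[0]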
Example #11
0
def test_prefect_pred():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
                    delimiter=",", skip_header=1)
    y = iris[:,-1]
    X = iris[:,:-1]
    X = X[y != 2]
    y = y[y != 2]
    X = add_constant(X, prepend=True)
    glm = GLM(y, X, family=sm.families.Binomial())
    assert_raises(PerfectSeparationError, glm.fit)
Example #12
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R = np.array([[0,1,1,0,0,0,0],
           [0,1,0,1,0,0,0],
           [0,1,0,0,0,0,0],
           [0,0,0,0,1,0,0],
           [0,0,0,0,0,1,0]])
     q = np.array([0,0,0,1,0])
     cls.Ftest1 = res1.f_test(R,q)
Example #13
0
    def setupClass(cls):
        from results.results_regression import Longley
        data = longley.load()
        data.exog = add_constant(data.exog)
        res1 = OLS(data.endog, data.exog).fit()
        res2 = Longley()
        res2.wresid = res1.wresid  # workaround hack
        cls.res1 = res1
        cls.res2 = res2

        res_qr = OLS(data.endog, data.exog).fit(method="qr")
        cls.res_qr = res_qr
Example #14
0
    def setupClass(cls):
        from results.results_regression import Longley
        data = longley.load()
        data.exog = add_constant(data.exog)
        res1 = OLS(data.endog, data.exog).fit()
        res2 = Longley()
        res2.wresid = res1.wresid # workaround hack
        cls.res1 = res1
        cls.res2 = res2

        res_qr = OLS(data.endog, data.exog).fit(method="qr")
        cls.res_qr = res_qr
Example #15
0
 def setupClass(cls):
     from results.results_glm import Cpunish
     from scikits.statsmodels.datasets.cpunish import load
     data = load()
     data.exog[:,3] = np.log(data.exog[:,3])
     data.exog = add_constant(data.exog)
     exposure = [100] * len(data.endog)
     cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(),
                 exposure=exposure).fit()
     cls.res1.params[-1] += np.log(100) # add exposure back in to param
                                         # to make the results the same
     cls.res2 = Cpunish()
Example #16
0
def test_prefect_pred():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris = np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
                         delimiter=",",
                         skip_header=1)
    y = iris[:, -1]
    X = iris[:, :-1]
    X = X[y != 2]
    y = y[y != 2]
    X = add_constant(X, prepend=True)
    glm = GLM(y, X, family=sm.families.Binomial())
    assert_raises(PerfectSeparationError, glm.fit)
Example #17
0
    def setupClass(cls):
        from results.results_glm import Cpunish
        from scikits.statsmodels.datasets.cpunish import load

        data = load()
        data.exog[:, 3] = np.log(data.exog[:, 3])
        data.exog = add_constant(data.exog)
        exposure = [100] * len(data.endog)
        cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(), exposure=exposure).fit()
        cls.res1.params[-1] += np.log(100)  # add exposure back in to param
        # to make the results the same
        cls.res2 = Cpunish()
Example #18
0
    def setupClass(cls):
        from results.results_regression import LongleyGls

        data = longley.load()
        exog = add_constant(np.column_stack((data.exog[:, 1], data.exog[:, 4])))
        tmp_results = OLS(data.endog, exog).fit()
        rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][1]  # by assumption
        order = toeplitz(np.arange(16))
        sigma = rho ** order
        GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
        cls.res1 = GLS_results
        cls.res2 = LongleyGls()
Example #19
0
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.
        '45' - 45-degree line
        's'  - standardized line, the expected order statistics are scaled by the
               standard deviation of the given sample and have the mean added to them
        'r'  - A regression line is fit
        'q'  - A line is fit through the quartiles.
        None - By default no reference line is added to the plot.
    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = zip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = max(end_pts[0])
        end_pts[1] = min(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        return # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x,y,fmt)
    elif line == 's':
        m,b = y.std(), y.mean()
        ref_line = x*m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([.25,.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m*theoretical_quartiles[0]
        ax.plot(x, m*x + b, fmt)
Example #20
0
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.
        '45' - 45-degree line
        's'  - standardized line, the expected order statistics are scaled by the
               standard deviation of the given sample and have the mean added to them
        'r'  - A regression line is fit
        'q'  - A line is fit through the quartiles.
        None - By default no reference line is added to the plot.
    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = zip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = max(end_pts[0])
        end_pts[1] = min(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        return  # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([.25, .75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
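
A short usage sketch for qqline as defined above, drawing a quartile line over a normal Q-Q plot. It assumes matplotlib and scipy are available; the plotting-position formula (i - 0.5)/n is one common convention.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.RandomState(42)
sample = np.sort(rng.standard_t(5, size=200))   # heavy-tailed data
probs = (np.arange(1, 201) - 0.5) / 200.0
theoretical = stats.norm.ppf(probs)             # reference quantiles

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(theoretical, sample, 'bo')
qqline(ax, 'q', x=theoretical, y=sample, dist=stats.norm)
plt.show()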
Example #21
0
    def __init__(self):
        d = macrodata.load().data
        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
        lint = d['realint'][:-1]
        tbilrate = d['tbilrate'][:-1]

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
        exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)
        exogg3 = add_constant(np.c_[gs_l_realgdp], prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()

        res_ols3 = OLS(endogg, exogg3).fit()

        self.res = res_ols
        self.res2 = res_ols2
        self.res3 = res_ols3
        self.endog = self.res.model.endog
        self.exog = self.res.model.exog
Example #22
0
    def __init__(self):
        '''
        Tests Poisson family with canonical log link.

        Test results were obtained by R.
        '''
        from results.results_glm import Cpunish
        from scikits.statsmodels.datasets.cpunish import load
        self.data = load()
        self.data.exog[:,3] = np.log(self.data.exog[:,3])
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Poisson()).fit()
        self.res2 = Cpunish()
Example #23
0
    def setupClass(cls):
        from results.results_regression import LongleyGls

        data = longley.load()
        exog = add_constant(np.column_stack(\
                (data.exog[:,1],data.exog[:,4])))
        tmp_results = OLS(data.endog, exog).fit()
        rho = np.corrcoef(tmp_results.resid[1:],
                          tmp_results.resid[:-1])[0][1]  # by assumption
        order = toeplitz(np.arange(16))
        sigma = rho**order
        GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
        cls.res1 = GLS_results
        cls.res2 = LongleyGls()
Example #24
0
 def setupClass(cls):
     #        if skipR:
     #            raise SkipTest, "Rpy not installed"
     #        try:
     #            r.library('car')
     #        except RPyRException:
     #            raise SkipTest, "car library not installed for R"
     R = np.zeros(7)
     R[4:6] = [1, -1]
     #        self.R = R
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     cls.Ttest1 = res1.t_test(R)
Example #25
0
    def __init__(self):
        '''
        Tests Poisson family with canonical log link.

        Test results were obtained by R.
        '''
        from results.results_glm import Cpunish
        from scikits.statsmodels.datasets.cpunish import load
        self.data = load()
        self.data.exog[:,3] = np.log(self.data.exog[:,3])
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                    family=sm.families.Poisson()).fit()
        self.res2 = Cpunish()
Example #26
0
    def setupClass(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed"
#        try:
#            r.library('car')
#        except RPyRException:
#            raise SkipTest, "car library not installed for R"
        R = np.zeros(7)
        R[4:6] = [1,-1]
#        self.R = R
        data = longley.load()
        data.exog = add_constant(data.exog)
        res1 = OLS(data.endog, data.exog).fit()
        cls.Ttest1 = res1.t_test(R)
Example #27
0
    def __init__(self):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        self.decimal_resids = DECIMAL_3
        self.decimal_params = DECIMAL_2
        self.decimal_bic = DECIMAL_0
        self.decimal_bse = DECIMAL_3

        from scikits.statsmodels.datasets.longley import load
        self.data = load()
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                        family=sm.families.Gaussian()).fit()
        from results.results_glm import Longley
        self.res2 = Longley()
Example #28
0
    def __init__(self):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        self.decimal_resids = DECIMAL_3
        self.decimal_params = DECIMAL_2
        self.decimal_bic = DECIMAL_0
        self.decimal_bse = DECIMAL_3

        from scikits.statsmodels.datasets.longley import load
        self.data = load()
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                        family=sm.families.Gaussian()).fit()
        from results.results_glm import Longley
        self.res2 = Longley()
Example #29
0
    def __init__(self):
        '''
        Test Binomial family with canonical logit link using star98 dataset.
        '''
        self.decimal_resids = DECIMAL_1
        self.decimal_bic = DECIMAL_2

        from scikits.statsmodels.datasets.star98 import load
        from results.results_glm import Star98
        data = load()
        data.exog = add_constant(data.exog)
        self.res1 = GLM(data.endog, data.exog, \
        family=sm.families.Binomial()).fit()
        #NOTE: if you want to replicate with RModel
        #res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
        #        family=r.binomial, weights=trials)

        self.res2 = Star98()
Example #30
0
    def __init__(self):
        '''
        Test Binomial family with canonical logit link using star98 dataset.
        '''
        self.decimal_resids = DECIMAL_1
        self.decimal_bic = DECIMAL_2

        from scikits.statsmodels.datasets.star98 import load
        from results.results_glm import Star98
        data = load()
        data.exog = add_constant(data.exog)
        self.res1 = GLM(data.endog, data.exog, \
        family=sm.families.Binomial()).fit()
        #NOTE: if you want to replicate with RModel
        #res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
        #        family=r.binomial, weights=trials)

        self.res2 = Star98()
Example #31
0
    def __init__(self):
        '''
        Tests Gamma family with canonical inverse link (power -1)
        '''
        # Test Precisions
        self.decimal_aic_R = -1  #TODO: off by about 1, we are right with Stata
        self.decimal_resids = DECIMAL_2

        from scikits.statsmodels.datasets.scotland import load
        from results.results_glm import Scotvote
        data = load()
        data.exog = add_constant(data.exog)
        res1 = GLM(data.endog, data.exog, \
                    family=sm.families.Gamma()).fit()
        self.res1 = res1
        #        res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
        res2 = Scotvote()
        res2.aic_R += 2  # R doesn't count degree of freedom for scale with gamma
        self.res2 = res2
Example #32
0
    def __init__(self):
        '''
        Tests Gamma family with canonical inverse link (power -1)
        '''
        # Test Precisions
        self.decimal_aic_R = -1 #TODO: off by about 1, we are right with Stata
        self.decimal_resids = DECIMAL_2

        from scikits.statsmodels.datasets.scotland import load
        from results.results_glm import Scotvote
        data = load()
        data.exog = add_constant(data.exog)
        res1 = GLM(data.endog, data.exog, \
                    family=sm.families.Gamma()).fit()
        self.res1 = res1
#        res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
        res2 = Scotvote()
        res2.aic_R += 2 # R doesn't count degree of freedom for scale with gamma
        self.res2 = res2
Example #33
0
    def __init__(self):
        '''
        Test Negative Binomial family with canonical log link
        '''
        # Test Precision
        self.decimal_resid = DECIMAL_1
        self.decimal_params = DECIMAL_3
        self.decimal_resids = -1 # 1 % mismatch at 0
        self.decimal_fittedvalues = DECIMAL_1

        from scikits.statsmodels.datasets.committee import load
        self.data = load()
        self.data.exog[:,2] = np.log(self.data.exog[:,2])
        interaction = self.data.exog[:,2]*self.data.exog[:,1]
        self.data.exog = np.column_stack((self.data.exog,interaction))
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                family=sm.families.NegativeBinomial()).fit()
        from results.results_glm import Committee
        res2 = Committee()
        res2.aic_R += 2 # They don't count a degree of freedom for the scale
        self.res2 = res2
Example #34
0
    def __init__(self):
        '''
        Test Negative Binomial family with canonical log link
        '''
        # Test Precision
        self.decimal_resid = DECIMAL_1
        self.decimal_params = DECIMAL_3
        self.decimal_resids = -1 # 1 % mismatch at 0
        self.decimal_fittedvalues = DECIMAL_1

        from scikits.statsmodels.datasets.committee import load
        self.data = load()
        self.data.exog[:,2] = np.log(self.data.exog[:,2])
        interaction = self.data.exog[:,2]*self.data.exog[:,1]
        self.data.exog = np.column_stack((self.data.exog,interaction))
        self.data.exog = add_constant(self.data.exog)
        self.res1 = GLM(self.data.endog, self.data.exog,
                family=sm.families.NegativeBinomial()).fit()
        from results.results_glm import Committee
        res2 = Committee()
        res2.aic_R += 2 # They don't count a degree of freedom for the scale
        self.res2 = res2
Example #35
0
    nsample = 300  #different pattern last graph with 100 or 200 or 500
    sig = 0.5

    np.random.seed(9876789) #9876543)
    X = np.random.randn(nsample, 3)
    X = np.column_stack((np.ones((nsample,1)), X))
    beta = [1, 0.5, -0.5, 1.]
    y_true2 = np.dot(X, beta)


    x1 = np.linspace(0, 1, nsample)
    gamma = np.array([1, 3.])
    #with slope 3 instead of two, I get negative weights, Not correct
    #   - was misspecified, but the negative weights are still possible with identity link
    #gamma /= gamma.sum()   #normalize assuming x1.max is 1
    z_true = add_constant(x1, prepend=True)

    winv = np.dot(z_true, gamma)
    het_params = sig**2 * np.array([1, 3.])  # for squared
    sig2_het = sig**2 * winv

    weights_dgp = 1/winv
    weights_dgp /= weights_dgp.max()  #should be already normalized - NOT check normalization
    #y2[:nsample*6/10] = y_true2[:nsample*6/10] + sig*1. * np.random.normal(size=nsample*6/10)
    z0 = np.zeros(nsample)
    z0[(nsample * 5)//10:] = 1   #dummy for 2 halves of sample
    z0 = add_constant(z0, prepend=True)

    z1 = add_constant(x1, prepend=True)

    noise = np.sqrt(sig2_het) * np.random.normal(size=nsample)
Example #36
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     cls.res1 = OLS(data.endog, data.exog).fit()
     R = np.identity(7)
     cls.Ttest = cls.res1.t_test(R)
Example #37
0
 def test_add_constant_1d(self):
     x = np.arange(1, 5)
     x = tools.add_constant(x, prepend=True)
     y = np.asarray([[1, 1, 1, 1], [1, 2, 3, 4.]]).T
     assert_equal(x, y)
Example #38
0
 def test_add_constant_has_constant2d(self):
     x = np.asarray([[1,1,1,1],[1,2,3,4.]])
     y = tools.add_constant(x)
     assert_equal(x,y)
Example #39
0
 def test_add_constant_1d(self):
     x = np.arange(1,5)
     x = tools.add_constant(x, prepend=True)
     y = np.asarray([[1,1,1,1],[1,2,3,4.]]).T
     assert_equal(x, y)
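
The add_constant tests above pin down the era's semantics: a 1d input is promoted to a column (with the constant first when prepend=True), and an input that already contains a constant column is returned unchanged. An illustrative numpy re-implementation of just that behavior, not the library code:

import numpy as np

def add_constant_sketch(x, prepend=False):
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    if np.any(np.ptp(x, axis=0) == 0):
        return np.squeeze(x)  # a constant column is already present
    ones = np.ones((x.shape[0], 1))
    return np.hstack((ones, x) if prepend else (x, ones))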
Example #40
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     cls.res1 = OLS(data.endog, data.exog).fit()
     R = np.identity(7)
     cls.Ttest = cls.res1.t_test(R)
Example #41
0
    def test_all(self):

        d = macrodata.load().data
        #import datasetswsm.greene as g
        #d = g.load('5-1')

        #growth rates
        gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
        gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

        #simple diff, not growthrate, I want heteroscedasticity later for testing
        endogd = np.diff(d['realinv'])
        exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]],
                            prepend=True)

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]],prepend=True)

        res_ols = OLS(endogg, exogg).fit()
        #print res_ols.params

        mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
        res_g1 = mod_g1.fit()
        #print res_g1.params

        mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)   #-0.1335859) from R
        res_g2 = mod_g2.iterative_fit(maxiter=5)
        #print res_g2.params


        rho = -0.108136

        #                 coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        partable = np.array([
                        [-9.50990,  0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # ***
                        [ 4.37040,  0.208146, 21.00,  2.93e-052,  3.95993, 4.78086], # ***
                        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]]) #    **

        #Statistics based on the rho-differenced data:

        result_gretl_g1 = dict(
        endog_mean = ("Mean dependent var",   3.113973),
        endog_std = ("S.D. dependent var",   18.67447),
        ssr = ("Sum squared resid",    22530.90),
        mse_resid_sqrt = ("S.E. of regression",   10.66735),
        rsquared = ("R-squared",            0.676973),
        rsquared_adj = ("Adjusted R-squared",   0.673710),
        fvalue = ("F(2, 198)",            221.0475),
        f_pvalue = ("P-value(F)",           3.56e-51),
        resid_acf1 = ("rho",                 -0.003481),
        dw = ("Durbin-Watson",        1.993858))


        #fstatistic, p-value, df1, df2
        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]
        #LM-statistic, p-value, df
        arch_4 = [7.30776, 0.120491, 4, "chi2"]

        #multicollinearity
        vif = [1.002, 1.002]
        cond_1norm = 6862.0664
        determinant = 1.0296049e+009
        reciprocal_condition_number = 0.013819244

        #Chi-square(2): test-statistic, pvalue, df
        normality = [20.2792, 3.94837e-005, 2]

        #tests
        res = res_g1  #with rho from Gretl

        #basic

        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 6)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
        assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        #tests
        res = res_g2 #with estimated rho

        #estimated lag coefficient
        assert_almost_equal(res.model.rho, rho, decimal=3)

        #basic
        assert_almost_equal(res.params, partable[:,0], 4)
        assert_almost_equal(res.bse, partable[:,1], 3)
        assert_almost_equal(res.tvalues, partable[:,2], 2)

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
        #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
        assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6)
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO



        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(2,4))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(2,4))

        #arch
        #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)



        '''
        Performing iterative calculation of rho...

                         ITER       RHO        ESS
                           1     -0.10734   22530.9
                           2     -0.10814   22530.9

        Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv
        rho = -0.108136

                         coefficient   std. error   t-ratio    p-value
          -------------------------------------------------------------
          const           -9.50990      0.990456    -9.602    3.65e-018 ***
          ds_l_realgdp     4.37040      0.208146    21.00     2.93e-052 ***
          realint_1       -0.579253     0.268009    -2.161    0.0319    **

        Statistics based on the rho-differenced data:

        Mean dependent var   3.113973   S.D. dependent var   18.67447
        Sum squared resid    22530.90   S.E. of regression   10.66735
        R-squared            0.676973   Adjusted R-squared   0.673710
        F(2, 198)            221.0475   P-value(F)           3.56e-51
        rho                 -0.003481   Durbin-Watson        1.993858
        '''

        '''
        RESET test for specification (squares and cubes)
        Test statistic: F = 5.219019,
        with p-value = P(F(2,197) > 5.21902) = 0.00619

        RESET test for specification (squares only)
        Test statistic: F = 7.268492,
        with p-value = P(F(1,198) > 7.26849) = 0.00762

        RESET test for specification (cubes only)
        Test statistic: F = 5.248951,
        with p-value = P(F(1,198) > 5.24895) = 0.023:
        '''

        '''
        Test for ARCH of order 4

                     coefficient   std. error   t-ratio   p-value
          --------------------------------------------------------
          alpha(0)   97.0386       20.3234       4.775    3.56e-06 ***
          alpha(1)    0.176114      0.0714698    2.464    0.0146   **
          alpha(2)   -0.0488339     0.0724981   -0.6736   0.5014
          alpha(3)   -0.0705413     0.0737058   -0.9571   0.3397
          alpha(4)    0.0384531     0.0725763    0.5298   0.5968

          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491:
        '''

        '''
        Variance Inflation Factors

        Minimum possible value = 1.0
        Values > 10.0 may indicate a collinearity problem

           ds_l_realgdp    1.002
              realint_1    1.002

        VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient
        between variable j and the other independent variables

        Properties of matrix X'X:

         1-norm = 6862.0664
         Determinant = 1.0296049e+009
         Reciprocal condition number = 0.013819244
        '''
        '''
        Test for ARCH of order 4 -
          Null hypothesis: no ARCH effect is present
          Test statistic: LM = 7.30776
          with p-value = P(Chi-square(4) > 7.30776) = 0.120491

        Test of common factor restriction -
          Null hypothesis: restriction is acceptable
          Test statistic: F(2, 195) = 0.426391
          with p-value = P(F(2, 195) > 0.426391) = 0.653468

        Test for normality of residual -
          Null hypothesis: error is normally distributed
          Test statistic: Chi-square(2) = 20.2792
          with p-value = 3.94837e-005:
        '''

        #no idea what this is
        '''
        Augmented regression for common factor test
        OLS, using observations 1959:3-2009:3 (T = 201)
        Dependent variable: ds_l_realinv

                           coefficient   std. error   t-ratio    p-value
          ---------------------------------------------------------------
          const            -10.9481      1.35807      -8.062    7.44e-014 ***
          ds_l_realgdp       4.28893     0.229459     18.69     2.40e-045 ***
          realint_1         -0.662644    0.334872     -1.979    0.0492    **
          ds_l_realinv_1    -0.108892    0.0715042    -1.523    0.1294
          ds_l_realgdp_1     0.660443    0.390372      1.692    0.0923    *
          realint_2          0.0769695   0.341527      0.2254   0.8219

          Sum of squared residuals = 22432.8

        Test of common factor restriction

          Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468
        '''


        ################ with OLS, HAC errors

        #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
        #Dependent variable: ds_l_realinv
        #HAC standard errors, bandwidth 4 (Bartlett kernel)

        #coefficient   std. error   t-ratio    p-value 95% CONFIDENCE INTERVAL
        #for confidence interval t(199, 0.025) = 1.972

        partable = np.array([
        [-9.48167,      1.17709,     -8.055,    7.17e-014, -11.8029, -7.16049], # ***
        [4.37422,      0.328787,    13.30,     2.62e-029, 3.72587, 5.02258], #***
        [-0.613997,     0.293619,    -2.091,    0.0378, -1.19300, -0.0349939]]) # **

        result_gretl_g1 = dict(
                    endog_mean = ("Mean dependent var",   3.257395),
                    endog_std = ("S.D. dependent var",   18.73915),
                    ssr = ("Sum squared resid",    22799.68),
                    mse_resid_sqrt = ("S.E. of regression",   10.70380),
                    rsquared = ("R-squared",            0.676978),
                    rsquared_adj = ("Adjusted R-squared",   0.673731),
                    fvalue = ("F(2, 199)",            90.79971),
                    f_pvalue = ("P-value(F)",           9.53e-29),
                    llf = ("Log-likelihood",      -763.9752),
                    aic = ("Akaike criterion",     1533.950),
                    bic = ("Schwarz criterion",    1543.875),
                    hqic = ("Hannan-Quinn",         1537.966),
                    resid_acf1 = ("rho",                 -0.107341),
                    dw = ("Durbin-Watson",        2.213805))

        linear_logs = [1.68351, 0.430953, 2, "chi2"]
        #for logs: dropping 70 nan or incomplete observations, T=133
        #(res_ols.model.exog <=0).any(1).sum() = 69  ?not 70
        linear_squares = [7.52477, 0.0232283, 2, "chi2"]

        #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
        lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
        lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
        acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

        #break
        cusum_Harvey_Collier  = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2
        #see cusum results in files
        break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
        break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1

        arch_4 = [3.43473, 0.487871, 4, "chi2"]

        normality = [23.962, 0.00001, 2, "chi2"]

        het_white = [33.503723, 0.000003, 5, "chi2"]
        het_breush_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
        het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]


        reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
        reset_2 = [7.268492, 0.00762, 1, 198, "f"]
        reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

        cond_1norm = 5984.0525
        determinant = 7.1087467e+008
        reciprocal_condition_number = 0.013826504
        vif = [1.001, 1.001]

        names = 'date   residual        leverage       influence        DFFITS'.split()
        lev = np.genfromtxt(r"E:\Josef\eclipsegworkspace\statsmodels-git\statsmodels-josef\scikits\statsmodels\datasets\macrodata\leverage_influence_ols_nostars.txt",
                            skip_header=3, skip_footer=1,
                            converters={0:lambda s: s})
        lev.dtype.names = names

        res = res_ols #for easier copying

        cov_hac, bse_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)

        assert_almost_equal(res.params, partable[:,0], 5)
        assert_almost_equal(bse_hac, partable[:,1], 5)
        #TODO

        assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
        #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
        assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL
        assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL
        assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5)
        #f-value is based on cov_hac I guess
        #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
        #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
        #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO


        c = oi.reset_ramsey(res, degree=2)
        compare_ftest(c, reset_2, decimal=(6,5))
        c = oi.reset_ramsey(res, degree=3)
        compare_ftest(c, reset_2_3, decimal=(6,5))

        linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
        assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
        assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

        hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
        assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
        assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

        hw = smsdia.het_white(res.resid, res.model.exog)
        assert_almost_equal(hw[:2], het_white[:2], 6)

        #arch
        #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
        sm_arch = smsdia.het_arch(res.resid, maxlag=4)
        assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
        assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

        vif2 = [oi.variance_inflation_factor(res.model.exog, k) for k in [1,2]]

        infl = oi.Influence(res_ols)
        #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
        #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
        #print np.max(np.abs(lev['influence'] - infl.influence))  #just added this based on Gretl

        #just rough test, low decimal in Gretl output,
        assert_almost_equal(lev['residual'], res.resid, decimal=3)
        assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
        assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
        assert_almost_equal(lev['influence'], infl.influence, decimal=4)
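
het_arch, used twice in this test, is Engle's ARCH LM test: regress the squared residuals on a constant and maxlag of their own lags, and refer LM = nobs * R^2 to a chi2(maxlag) distribution. A self-contained numpy/scipy sketch; arch_lm is an illustrative name.

import numpy as np
from scipy import stats

def arch_lm(resid, maxlag):
    u2 = resid ** 2
    y = u2[maxlag:]
    # lag matrix of u2: columns u2[t-1], ..., u2[t-maxlag]
    lags = [u2[maxlag - j: len(u2) - j] for j in range(1, maxlag + 1)]
    X = np.column_stack([np.ones(len(y))] + lags)
    b = np.linalg.lstsq(X, y)[0]
    e = y - X.dot(b)
    r2 = 1.0 - e.dot(e) / ((y - y.mean()) ** 2).sum()
    lm = len(y) * r2
    return lm, stats.chi2.sf(lm, maxlag)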
Example #42
0
 def test_add_constant_has_constant1d(self):
     x = np.ones(5)
     x = tools.add_constant(x)
     assert_equal(x, np.ones(5))
Example #43
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     cls.res1 = OLS(data.endog, data.exog).fit()
     cls.res2 = WLS(data.endog, data.exog).fit()
Example #44
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     cls.res1 = OLS(data.endog, data.exog).fit()
     cls.res2 = WLS(data.endog, data.exog).fit()
Example #45
0
def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    '''Four tests for Granger causality of two time series.

    All four tests give similar results. `params_ftest` and `ssr_ftest` are
    equivalent, based on the F test, which is identical to lmtest:grangertest
    in R.

    Parameters
    ----------
    x : array, 2d, (nobs, 2)
        data for testing whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    verbose : bool
        print results if True

    Returns
    -------
    results : dictionary
        all test results; the dictionary keys are the number of lags. For
        each lag the value is a tuple: the first element is a dictionary with
        the test statistics, p-values, and degrees of freedom; the second
        element is a list of the OLS estimation results for the restricted
        model, the unrestricted model, and the restriction (contrast) matrix
        for the parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the p-values are below a desired size
    of the test.

    'params_ftest', 'ssr_ftest' are based on F test

    'ssr_chi2test', 'lrtest' are based on chi-square test

    '''
    from scipy import stats  # lazy import

    resli = {}

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print '\nGranger Causality'
            print 'number of lags (no zero)', mlg
        mxlg = mlg  #+ 1 # Note number of lags starting at zero in lagmat

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:mxlg + 1])
            dtajoint = add_constant(dta[:, 1:])
        else:
            raise ValueError('Not Implemented')
            dtaown = dta[:, 1:mxlg]
            dtajoint = dta[:, 1:]

        #run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = (res2down.ssr -
                res2djoint.ssr) / res2djoint.ssr / (mxlg) * res2djoint.df_resid
        if verbose:
            print 'ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
              (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg)
        result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg,
                                                res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (ch2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print 'ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, df=%d' %  \
              (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        #likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print 'likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %  \
              (lr, stats.chi2.sf(lr, mxlg), mxlg)
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)), np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print 'parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \
              (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num)
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()], ftres.df_denom,
                                  ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
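
A numpy/scipy sketch of the single-lag ssr_ftest computed inside grangercausalitytests, on simulated data where x2 drives x1 by construction; the seed and coefficients are illustrative.

import numpy as np
from scipy import stats

rng = np.random.RandomState(7)
n = 500
x2 = rng.normal(size=n)
x1 = np.zeros(n)
for t in range(1, n):
    x1[t] = 0.4 * x1[t - 1] + 0.8 * x2[t - 1] + rng.normal()

# restricted model: x1[t] on const + x1[t-1]
# unrestricted:     x1[t] on const + x1[t-1] + x2[t-1]
y = x1[1:]
Xr = np.column_stack((np.ones(n - 1), x1[:-1]))
Xu = np.column_stack((Xr, x2[:-1]))
ssr_r = np.sum((y - Xr.dot(np.linalg.lstsq(Xr, y)[0])) ** 2)
ssr_u = np.sum((y - Xu.dot(np.linalg.lstsq(Xu, y)[0])) ** 2)
df_resid = len(y) - Xu.shape[1]
F = (ssr_r - ssr_u) / ssr_u * df_resid  # one restriction, so /1
print(F, stats.f.sf(F, 1, df_resid))    # small p: x2 Granger-causes x1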
Example #46
0
def add_trend(X, trend="c", prepend=False):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.

    Notes
    -----
    Returns columns as ["ctt","ct","c"] whenever applicable.  There is currently
    no checking for an existing constant or trend.

    See also
    --------
    scikits.statsmodels.add_constant
    """
    #TODO: could be generalized for trend of arbitrary order
    trend = trend.lower()
    if trend == "c":    # handles structured arrays
        return add_constant(X, prepend=prepend)
    elif trend == "ct" or trend == "t":
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        raise ValueError("trend %s not understood" % trend)
    X = np.asanyarray(X)
    nobs = len(X)
    trendarr = np.vander(np.arange(1,nobs+1, dtype=float), trendorder+1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:,1]
    if not X.dtype.names:
        if not prepend:
            X = np.column_stack((X, trendarr))
        else:
            X = np.column_stack((trendarr, X))
    else:
        return_rec = X.__class__ is np.recarray
        if trendorder == 1:
            if trend == "ct":
                dt = [('const',float),('trend',float)]
            else:
                dt = [('trend', float)]
        elif trendorder == 2:
            dt = [('const',float),('trend',float),('trend_squared', float)]
        trendarr = trendarr.view(dt)
        if prepend:
            X = nprf.append_fields(trendarr, X.dtype.names, [X[i] for i
                in X.dtype.names], usemask=False, asrecarray=return_rec)
        else:
            X = nprf.append_fields(X, trendarr.dtype.names, [trendarr[i] for i
                in trendarr.dtype.names], usemask=False, asrecarray=return_rec)
    return X
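
The trend columns in add_trend come from np.vander, which returns decreasing powers, followed by np.fliplr to get the documented [constant, trend, trend^2] order. A quick check of what that builds for trend="ctt" and five observations:

import numpy as np

t = np.arange(1, 6, dtype=float)
trendarr = np.fliplr(np.vander(t, 3))  # columns: 1, t, t**2
print(trendarr)
# rows: [1, 1, 1], [1, 2, 4], [1, 3, 9], [1, 4, 16], [1, 5, 25]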
Example #47
0
 def test_add_constant_has_constant2d(self):
     x = np.asarray([[1, 1, 1, 1], [1, 2, 3, 4.]])
     y = tools.add_constant(x)
     assert_equal(x, y)
Example #48
0
 def test_add_constant_has_constant1d(self):
     x = np.ones(5)
     x = tools.add_constant(x)
     assert_equal(x, np.ones(5))
Example #49
0
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     res1 = OLS(data.endog, data.exog).fit()
     R2 = [[0,1,-1,0,0,0,0],[0, 0, 0, 0, 1, -1, 0]]
     cls.Ftest1 = res1.f_test(R2)
Example #50
0
        if returns == 'text':
            return str(part1) + '\n' +  str(part2) + '\n' + str(part3L)
        elif returns == 'tables':
            return [part1, part2 ,part3L]
        elif returns == 'csv':
            return part1.as_csv() + '\n' + part2.as_csv() + '\n' + \
                   part3L.as_csv()
        elif returns == 'latex':
            print('not available yet')
        elif returns == 'html':
            print('not available yet')

if __name__ == "__main__":
    import scikits.statsmodels.api as sm
    data = sm.datasets.longley.load()
    data.exog = add_constant(data.exog)
    ols_results = OLS(data.endog, data.exog).results
    gls_results = GLS(data.endog, data.exog).results
    print(ols_results.summary())
    tables = ols_results.summary(returns='tables')
    csv = ols_results.summary(returns='csv')
"""
    Summary of Regression Results
=======================================
| Dependent Variable:            ['y']|
| Model:                           OLS|
| Method:                Least Squares|
| Date:               Tue, 29 Jun 2010|
| Time:                       22:32:21|
| # obs:                          16.0|
| Df residuals:                    9.0|
Example #51
0
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    res1 = OLS(data.endog, data.exog).fit()
    R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]]
    cls.Ftest1 = res1.f_test(R2)
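The two rows of R2 jointly impose R @ beta = 0, i.e. that two pairs of slope coefficients are equal (the columns holding the +1/-1 pairs). As a sketch of what f_test computes, the Wald/F statistic can be reproduced by hand; res1 is the fitted result from the snippet above and cov_params() is the usual results method, everything else is illustrative:

import numpy as np

R = np.asarray([[0, 1, -1, 0, 0, 0, 0],
                [0, 0, 0, 0, 1, -1, 0]], dtype=float)
q = R.shape[0]                              # number of restrictions
Rb = R.dot(res1.params)                     # discrepancy from the null
cov_Rb = R.dot(res1.cov_params()).dot(R.T)  # covariance of R @ beta_hat
F = Rb.dot(np.linalg.solve(cov_Rb, Rb)) / q
# F should agree with cls.Ftest1.fvalue up to floating-point noise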
Example #52
0


if __name__ == '__main__':
    import scikits.statsmodels.api as sm
    examples = ['ivols', 'distquant'][:]

    if 'ivols' in examples:
        exampledata = ['ols', 'iv', 'ivfake'][1]
        nobs = nsample = 500
        sige = 3
        corrfactor = 0.025


        x = np.linspace(0, 10, nobs)
        X = tools.add_constant(np.column_stack((x, x**2)))
        beta = np.array([1, 0.1, 10])

        def sample_ols(exog):
            endog = np.dot(exog, beta) + sige*np.random.normal(size=nobs)
            return endog, exog, None

        def sample_iv(exog):
            print('using iv example')
            X = exog.copy()
            e = sige * np.random.normal(size=nobs)
            endog = np.dot(X, beta) + e
            exog[:,0] = X[:,0] + corrfactor * e
            z0 = X[:,0] + np.random.normal(size=nobs)
            z1 = X.sum(1) + np.random.normal(size=nobs)
            z2 = X[:, 1]
            # the original snippet is cut off here; presumably the
            # instruments are stacked and returned, mirroring
            # sample_ols's signature (assumed completion):
            instruments = np.column_stack((z0, z1, z2))
            return endog, exog, instruments
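The excerpt breaks off before the 'ivols' example runs any estimation. For context, a minimal two-stage least squares sketch that could be applied to instruments like those constructed above; the helper name tsls and the use of plain numpy least squares are illustrative assumptions, not part of the original script:

import numpy as np

def tsls(endog, exog, instruments):
    # stage 1: project each regressor onto the instrument space
    gamma = np.linalg.lstsq(instruments, exog, rcond=None)[0]
    exog_hat = instruments.dot(gamma)
    # stage 2: regress endog on the instrumented regressors
    beta = np.linalg.lstsq(exog_hat, endog, rcond=None)[0]
    return beta

With sample_iv's data, tsls(endog, exog, np.column_stack((z0, z1, z2))) should recover beta close to [1, 0.1, 10], whereas plain OLS is biased because exog[:, 0] is correlated with the error term.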
Example #54
0
def add_trend(X, trend="c", prepend=False):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.

    Notes
    -----
    Returns columns as ["ctt","ct","c"] whenever applicable.  There is currently
    no checking for an existing constant or trend.

    See also
    --------
    scikits.statsmodels.add_constant
    """
    #TODO: could be generalized for a trend of arbitrary order
    # nprf below refers to numpy.lib.recfunctions
    trend = trend.lower()
    if trend == "c":  # handles structured arrays
        return add_constant(X, prepend=prepend)
    elif trend == "ct" or trend == "t":
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        raise ValueError("trend %s not understood" % trend)
    X = np.asanyarray(X)
    nobs = len(X)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=float), trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]
    if not X.dtype.names:
        if not prepend:
            X = np.column_stack((X, trendarr))
        else:
            X = np.column_stack((trendarr, X))
    else:
        return_rec = X.__class__ is np.recarray
        if trendorder == 1:
            if trend == "ct":
                dt = [('const', float), ('trend', float)]
            else:
                dt = [('trend', float)]
        elif trendorder == 2:
            dt = [('const', float), ('trend', float), ('trend_squared', float)]
        trendarr = trendarr.view(dt)
        if prepend:
            X = nprf.append_fields(trendarr,
                                   X.dtype.names,
                                   [X[i] for i in X.dtype.names],
                                   usemask=False,
                                   asrecarray=return_rec)
        else:
            X = nprf.append_fields(X,
                                   trendarr.dtype.names,
                                   [trendarr[i] for i in trendarr.dtype.names],
                                   usemask=False,
                                   asrecarray=return_rec)
    return X
Example #55
0
def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    '''Four tests for Granger causality of two time series.

    All four tests give similar results. `params_ftest` and `ssr_ftest` are
    equivalent and based on the F test, which is identical to
    lmtest::grangertest in R.

    Parameters
    ----------
    x : array, 2d, (nobs, 2)
        data to test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    addconst : bool
        include a constant in the model; only True is currently implemented
    verbose : bool
        print results if True

    Returns
    -------
    results : dictionary
        all test results, keyed by the number of lags. For each lag the
        value is a tuple: the first element is a dictionary with test
        statistics, p-values and degrees of freedom; the second element is a
        list with the OLS estimation results for the restricted model, the
        unrestricted model, and the restriction (contrast) matrix for the
        parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    'params_ftest', 'ssr_ftest' are based on the F test

    'ssr_chi2test', 'lrtest' are based on the chi-square test

    '''
    from scipy import stats  # lazy import
    # np, OLS, add_constant and lagmat2ds are assumed to be imported
    # at module level

    resli = {}

    for mlg in range(1, maxlag+1):
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mlg)
        mxlg = mlg  # note: lagmat counts lags starting at zero

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:mxlg + 1])  # own lags only
            dtajoint = add_constant(dta[:, 1:])  # own lags plus lags of the second variable
        else:
            # only models with a constant are implemented; the commented
            # lines sketch the no-constant variant
            raise NotImplementedError('addconst=False is not implemented')
            #dtaown = dta[:, 1:mxlg]
            #dtajoint = dta[:, 1:]

        #run ols on both models without and with lags of second variable
        res2down = OLS(dta[:,0], dtaown).fit()
        res2djoint = OLS(dta[:,0], dtajoint).fit()

        #print results
        #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger causality test using ssr (F statistic):
        # F = ((ssr_restricted - ssr_unrestricted) / mxlg) / (ssr_unrestricted / df_resid)
        fgc1 = (res2down.ssr - res2djoint.ssr) / res2djoint.ssr / mxlg * res2djoint.df_resid
        if verbose:
            print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                   res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print('ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg))
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        # likelihood ratio test
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, stats.chi2.sf(lr, mxlg), mxlg))
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of the second variable are zero;
        # rconstr selects those mxlg coefficients (the constant is the last
        # column of dtajoint)
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)), np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
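A quick usage sketch (not part of the original module): construct a pair of series in which the second column Granger-causes the first, then read off the ssr-based F test per lag. The data-generating process here is purely illustrative.

import numpy as np

np.random.seed(12345)
n = 500
x2 = np.random.standard_normal(n)  # driving series
x1 = np.zeros(n)
for t in range(2, n):
    # x1 depends on its own past and on x2 lagged twice
    x1[t] = 0.5 * x1[t - 1] + 0.8 * x2[t - 2] + np.random.standard_normal()

res = grangercausalitytests(np.column_stack((x1, x2)), maxlag=4, verbose=False)
for lag in sorted(res):
    fval, pval, df_denom, df_num = res[lag][0]['ssr_ftest']
    print('lag %d: F=%8.4f, p=%8.4f' % (lag, fval, pval))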