Exemple #1
 def setupClass(cls):
     """Attach the Longley data and an unfitted OLS model to the test class."""
     dataset = longley.load()
     # The constant column is appended after the regressors (prepend=False).
     dataset.exog = add_constant(dataset.exog, prepend=False)
     response, design = dataset.endog, dataset.exog
     cls.endog = response
     cls.exog = design
     cls.ols_model = OLS(response, design)
from __future__ import print_function
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

# Growth rates are computed as 400 * first difference of the log level.
d2 = macrodata.load().data
g_gdp = 400 * np.diff(np.log(d2['realgdp']))
g_inv = 400 * np.diff(np.log(d2['realinv']))
# Regressors: GDP growth and the real interest rate with its last observation
# dropped so that its length matches the differenced series; the constant is
# appended as the last column (prepend=False).
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)
res_olsg = OLS(g_inv, exogg).fit()

# NOTE: despite the variable name, this requests the 'HC1' covariance.
res_hc0 = res_olsg.get_robustcov_results('HC1')
# NOTE(review): the original call was cut off after 'HAC',. maxlags=4 is
# inferred from the variable name and use_correction=True is assumed; confirm
# both against the authoritative version of this script.
res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4,
                                          use_correction=True)

# t-tests of each coefficient individually (identity restriction matrix).
tt = res_hac4.t_test(np.eye(len(res_hac4.params)))

# Switch subsequent inference on the HAC results to the normal distribution.
res_hac4.use_t = False
Exemple #3
# Diagonal of C (defined outside this excerpt), rounded to two decimals and
# reported as the variance inflation factors.
VIF = np.diag(C).round(2)
print('VIF:', VIF)  #38.5 254.42 46.87 282.51

# Standardise every column of df: subtract the mean, divide by the std.
df_scaled = (df - df.mean()) / df.std()
A_scaled = np.array(df_scaled)  # ndarray, not a DataFrame
x1x2 = A_scaled[:, [1, 2]]
x3x4 = A_scaled[:, [3, 4]]
A = np.array(df)
X = A[:, 1:]
# Ratio of the largest to the smallest eigenvalue of X'X.
B = np.dot(X.T, X)
ev, evct = np.linalg.eig(B)
kk = ev.max() / ev.min()
print('lambda1/lambda2:', kk)  #423.7

lr1 = OLS(dfy, add_constant(x1x2)).fit()
lr2 = OLS(dfy, add_constant(x3x4)).fit()
print('AIC:', lr1.aic, lr2.aic)  #x1x2=62.31 x3x4=76.74 x2x4=97.51

# Search the column subsets for the fit with the smallest AIC, starting from
# the fit on all columns.
# NOTE(review): this baseline uses every column of A_scaled, including column 0
# which `X = A[:, 1:]` above excludes -- confirm that is intended.
xmin = A_scaled[:, :]
nmin = sets
lrmin = OLS(dfy, add_constant(xmin)).fit()
# Track the running minimum in a plain variable. The original assigned to
# `lrmin.aic`; statsmodels treats cached result attributes as read-only, so
# that assignment is not a reliable way to update the minimum.
aic_min = lrmin.aic
for n in subsets(sets)[1:-1]:
    lr = OLS(dfy, add_constant(A_scaled[:, n])).fit()
    if lr.aic < aic_min:
        aic_min = lr.aic
        lrmin = lr
        nmin = n
print('AICmin:', aic_min, 'Combination:', nmin)  #x1,x2,x3,x4
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    # Component: this regressor multiplied by its estimated coefficient.
    x1beta = x1 * results.params[exog_idx]
    # Component plus residual, plotted against the regressor.
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    # Draw the component line from an OLS fit of the component on the
    # regressor plus a constant; the two params are passed on positionally.
    mod = OLS(x1beta, add_constant(x1)).fit()
    fig = abline_plot(*mod.params, ax=ax)
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
Exemple #5
def grangercausalitytests_mod(x, maxlag, addconst=True, verbose=True):
    """Run four Granger non-causality tests for every lag order up to maxlag.

    For each lag order two OLS models are fitted for the first column of the
    lag matrix built from `x`: one on the "own" lag columns only and one on
    all remaining columns. The two fits are compared with an SSR-based F
    test, an SSR-based chi2 test, a likelihood-ratio test and a parameter F
    test on the joint model.

    Parameters
    ----------
    x : array_like
        Data passed to ``lagmat2ds``.
        NOTE(review): presumably two columns, the second being tested for
        causing the first -- confirm against callers.
    maxlag : int
        Largest lag order to test. If there are too few observations it is
        reduced (with a warning) instead of raising.
    addconst : bool
        Whether a constant is appended to the regressors. Only True is
        implemented.
    verbose : bool
        If True, print the test results for every lag order.

    Returns
    -------
    resli : dict
        Maps each lag order to ``(result, [res2down, res2djoint, rconstr])``
        where `result` is a dict with keys 'ssr_ftest', 'ssr_chi2test',
        'lrtest' and 'params_ftest', the list holds the two fitted OLS
        results and the restriction matrix used for the parameter F test.

    Raises
    ------
    NotImplementedError
        If `addconst` is False.
    """
    import numpy as np
    from scipy import stats
    from statsmodels.tsa.tsatools import lagmat2ds
    from statsmodels.tools.tools import add_constant
    from statsmodels.regression.linear_model import OLS
    from warnings import warn

    x = np.asarray(x)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        max_allowed = int((x.shape[0] - int(addconst)) / 3) - 1
        warn("Insufficient observations. Maximum allowable lag is {0}. "
             "The maximum lag will be set to "
             "this number".format(max_allowed))
        maxlag = max_allowed

    resli = {}

    for mxlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mxlg)

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both')
        # NOTE(review): the original comment says this removes the
        # not-lagged xs, but the code drops the LAST column of the lag
        # matrix -- verify against lagmat2ds' column order.
        dta = np.delete(dta, -1, axis=1)

        # add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')

        # Run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        df_resid = res2djoint.df_resid

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) / res2djoint.ssr / mxlg *
                df_resid)
        fgc1_pval = stats.f.sf(fgc1, mxlg, df_resid)
        if verbose:
            print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' %
                  (fgc1, fgc1_pval, df_resid, mxlg))
        result['ssr_ftest'] = (fgc1, fgc1_pval, df_resid, mxlg)

        # Granger Causality test using ssr (ch2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        fgc2_pval = stats.chi2.sf(fgc2, mxlg)
        if verbose:
            print('ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, '
                  'df=%d' % (fgc2, fgc2_pval, mxlg))
        result['ssr_chi2test'] = (fgc2, fgc2_pval, mxlg)

        # likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        lr_pval = stats.chi2.sf(lr, mxlg)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, lr_pval, mxlg))
        result['lrtest'] = (lr, lr_pval, mxlg)

        # F test that all lag coefficients of exog are zero: the restriction
        # matrix selects the middle mxlg coefficients of the joint model
        # (after the mxlg own lags, before the trailing constant).
        rconstr = np.column_stack((np.zeros(
            (mxlg, mxlg)), np.eye(mxlg, mxlg), np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' %
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])
    return resli
def dispersion_poisson(results):
    """Score/LM type tests for Poisson variance assumptions

    Null Hypothesis is

    H0: var(y) = E(y) and assuming E(y) is correctly specified
    H1: var(y) ~= E(y)

    The tests are based on the constrained model, i.e. the Poisson model.
    The tests differ in their assumed alternatives, and in their maintained
    assumptions.

    Parameters
    ----------
    results : Poisson results instance
        This can be a results instance for either a discrete Poisson or a GLM
        with family Poisson.

    Returns
    -------
    res : ndarray, shape (7, 2)
       each row contains the test statistic and p-value for one of the 7 tests
       computed here.
    description : 2-D list of strings
       Each test has two strings a descriptive name and a string for the
       alternative hypothesis.
    """
    if hasattr(results, '_results'):
        results = results._results

    endog = results.model.endog
    nobs = endog.shape[0]  #TODO: use attribute, may need to be added
    # predict() is used instead of fittedvalues because discrete models
    # store the linear prediction in fittedvalues.
    fitted = results.predict()
    # this assumes Poisson
    resid2 = results.resid_response**2
    var_resid_endog = (resid2 - endog)
    var_resid_fitted = (resid2 - fitted)
    std1 = np.sqrt(2 * (fitted**2).sum())

    var_resid_endog_sum = var_resid_endog.sum()
    dean_a = var_resid_fitted.sum() / std1
    dean_b = var_resid_endog_sum / std1
    dean_c = (var_resid_endog / fitted).sum() / np.sqrt(2 * nobs)

    pval_dean_a = stats.norm.sf(np.abs(dean_a))
    pval_dean_b = stats.norm.sf(np.abs(dean_b))
    pval_dean_c = stats.norm.sf(np.abs(dean_c))

    results_all = [[dean_a, pval_dean_a], [dean_b, pval_dean_b],
                   [dean_c, pval_dean_c]]
    description = [['Dean A', 'mu (1 + a mu)'], ['Dean B', 'mu (1 + a mu)'],
                   ['Dean C', 'mu (1 + a)']]

    # Cameron Trived auxiliary regression page 78 count book 1989
    endog_v = var_resid_endog / fitted
    res_ols_nb2 = OLS(endog_v, fitted).fit(use_t=False)
    stat_ols_nb2 = res_ols_nb2.tvalues[0]
    pval_ols_nb2 = res_ols_nb2.pvalues[0]
    results_all.append([stat_ols_nb2, pval_ols_nb2])
    description.append(['CT nb2', 'mu (1 + a mu)'])

    # NOTE(review): this regression is identical to the 'CT nb2' one above
    # (regressor `fitted`), whereas the HC1 variant of nb1 below regresses on
    # a column of ones -- confirm whether `fitted` is intended here.
    res_ols_nb1 = OLS(endog_v, fitted).fit(use_t=False)
    stat_ols_nb1 = res_ols_nb1.tvalues[0]
    pval_ols_nb1 = res_ols_nb1.pvalues[0]
    results_all.append([stat_ols_nb1, pval_ols_nb1])
    description.append(['CT nb1', 'mu (1 + a)'])

    # Same auxiliary regressions with an HC1 robust covariance.
    endog_v = var_resid_endog / fitted
    res_ols_nb2 = OLS(endog_v, fitted).fit(cov_type='HC1', use_t=False)
    stat_ols_hc1_nb2 = res_ols_nb2.tvalues[0]
    pval_ols_hc1_nb2 = res_ols_nb2.pvalues[0]
    results_all.append([stat_ols_hc1_nb2, pval_ols_hc1_nb2])
    description.append(['CT nb2 HC1', 'mu (1 + a mu)'])

    # The trailing use_t=False was cut off in the original; restored to match
    # every other fit in this function.
    res_ols_nb1 = OLS(endog_v, np.ones(len(endog_v))).fit(cov_type='HC1',
                                                          use_t=False)
    stat_ols_hc1_nb1 = res_ols_nb1.tvalues[0]
    pval_ols_hc1_nb1 = res_ols_nb1.pvalues[0]
    results_all.append([stat_ols_hc1_nb1, pval_ols_hc1_nb1])
    description.append(['CT nb1 HC1', 'mu (1 + a)'])

    return np.array(results_all), description
def _fit_arma_iter(outputs, inputs, p, q, r, l2_reg=0.0):
    """Iterative regression for estimating AR params in ARMAX(p, q, r) model.

    The iterative AR regression process provides consistent estimates for the
    AR parameters of an ARMAX(p, q, r) model after q iterative steps.

    It first fits an ARMAX(p, 0, r) model with least squares regression, then
    ARMAX(p, 1, r), and so on, ..., til ARMAX(p, q, r). At the i-th step, it
    fits an ARMAX(p, i, r) model, according to estimated error terms from the
    previous step.

    For description of the iterative regression method, see Section 2 of
    `Consistent Estimates of Autoregressive Parameters and Extended Sample
    Autocorrelation Function for Stationary and Nonstationary ARMA Models`.

    The implementation here is a generalization of the method mentioned in the
    paper. We adapt the method for multidimensional outputs, exogenous inputs,
    nan handling, and also add regularization on the MA parameters.

    Args:
      outputs: Array with the output values from the LDS, nans allowed.
      inputs: Array with exogenous inputs values, nans allowed. Could be None.
      p: AR order, i.e. max lag of the autoregressive part.
      q: MA order, i.e. max lag of the error terms.
      r: Max lag of the exogenous inputs.
      l2_reg: L2 regularization coefficient, to be applied on MA coefficients.

    Returns:
      Fitted AR coefficients.

    Raises:
      ValueError: If the regression matrix has fewer columns than the model
        needs, or the final parameter vector has an unexpected length.
    """
    if outputs.shape[1] > 1:
        # If there are multiple output dimensions, fit autoregressive params on
        # each dimension separately and average.
        params_list = [
            _fit_arma_iter(outputs[:, j:j+1], inputs, p, q, r, l2_reg=l2_reg)
            for j in range(outputs.shape[1])]
        return np.mean(np.concatenate([a.reshape(1, -1) for a in params_list]),
                       axis=0)
    # We include a constant term in regression.
    k_const = 1
    # Input dim. If inputs is None, then in_dim = 0.
    in_dim = 0
    if inputs is not None:
        in_dim = inputs.shape[1]
        # Lag the inputs to obtain [?, r], column j means series x_{t-j}.
        # Use trim to drop rows with unknown values both at beginning and end.
        lagged_in = np.concatenate([
            lagmat(inputs[:, i], maxlag=r, trim='both') for i in range(in_dim)
        ], axis=1)
        # Since we trim in beginning, the offset is r.
        lagged_in_offset = r
    # Lag the series itself to p-th order.
    lagged_out = lagmat(outputs, maxlag=p, trim='both')
    lagged_out_offset = p
    y = outputs
    y_offset = 0
    # Estimated residuals, initialized to 0.
    res = np.zeros_like(outputs)
    for i in range(q + 1):
        # Lag the residuals to i-th order in i-th iteration.
        lagged_res = lagmat(res, maxlag=i, trim='both')
        # `res` from the previous step starts at the previous y_offset.
        lagged_res_offset = y_offset + i
        # Compute offset in regression, since lagged_in, lagged_out, and
        # lagged_res have different offsets. Align them.
        if inputs is None:
            y_offset = max(lagged_out_offset, lagged_res_offset)
        else:
            y_offset = max(lagged_out_offset, lagged_res_offset,
                           lagged_in_offset)
        y = outputs[y_offset:, :]
        # Concatenate all variables in regression.
        x = np.concatenate([
            lagged_out[y_offset - lagged_out_offset:, :],
            lagged_res[y_offset - lagged_res_offset:, :]
        ], axis=1)
        if inputs is not None:
            x = np.concatenate([lagged_in[y_offset - lagged_in_offset:, :], x],
                               axis=1)
        # Add constant term as the first variable.
        x = add_constant(x, prepend=True)
        if x.shape[1] < k_const + in_dim * r + p + i:
            raise ValueError('Insufficient sequence length for model fitting.')
        # Drop rows with nans.
        arr = np.concatenate([y, x], axis=1)
        arr = arr[~np.isnan(arr).any(axis=1)]
        y_dropped_na = arr[:, 0:1]
        x_dropped_na = arr[:, 1:]
        # Only regularize the MA part.
        alpha = np.concatenate(
            [np.zeros(k_const + in_dim * r + p), l2_reg * np.ones(i)], axis=0)
        # When L1_wt = 0, it's ridge regression.
        olsfit = OLS(y_dropped_na, x_dropped_na).fit_regularized(alpha=alpha,
                                                                 L1_wt=0.0)
        # Update estimated residuals.
        res = y - np.matmul(x, olsfit.params.reshape(-1, 1))
    if len(olsfit.params) != k_const + in_dim * r + p + q:
        raise ValueError('Expected param len %d, got %d.' %
                         (k_const + in_dim * r + p + q, len(olsfit.params)))
    # The AR coefficients sit just before the q trailing MA coefficients.
    if q == 0:
        return olsfit.params[-p:]
    return olsfit.params[-(p + q):-q]
 def setupClass(cls):
     """Extend the parent fixture with a second OLS model and its fit."""
     super(TestNxNxOne, cls).setupClass()
     second_model = OLS(cls.endog_n_, cls.exog_n_one)
     # Raise the model degrees of freedom by one before fitting.
     second_model.df_model += 1
     cls.mod2 = second_model
     cls.res2 = second_model.fit()
Exemple #9
 def test_no_penalization(self):
     """With pen_weight=0 the fit must match plain OLS in params and bse."""
     model = self.res1.model
     res_ols = OLS(model.endog, model.exog).fit()
     res_theil = model.fit(pen_weight=0, cov_type='data-prior')
     for attr in ('params', 'bse'):
         assert_allclose(getattr(res_theil, attr), getattr(res_ols, attr),
                         rtol=1e-10)
 def setupClass(cls):
     """Fit OLS on the Longley data and store an F-test of 6 restrictions."""
     dataset = longley.load()
     dataset.exog = add_constant(dataset.exog)
     cls.res1 = OLS(dataset.endog, dataset.exog).fit()
     # All rows of the 7x7 identity except the last one.
     restrictions = np.identity(7)[:-1, :]
     cls.Ftest = cls.res1.f_test(restrictions)
 def setupClass(cls):
     """Fit GLS (res1) and OLS (res2) on the same Longley design."""
     dataset = longley.load()
     dataset.exog = add_constant(dataset.exog)
     response, design = dataset.endog, dataset.exog
     cls.res1 = GLS(response, design).fit()
     cls.res2 = OLS(response, design).fit()
 def _engine_factory(self, fy, X, check_integrity=True):
     """Return the unfitted regression model for ``fy`` on ``X``.

     A WLS model with ``self._get_weights()`` as weights is built when
     ``self.use_weighted_fit`` is truthy, otherwise a plain OLS model.
     ``check_integrity`` is accepted but not used by this method.
     """
     if self.use_weighted_fit:
         return WLS(fy, X, weights=self._get_weights())
     # The original had this return nested under the `if` after the WLS
     # return, making it unreachable and the unweighted case return None.
     return OLS(fy, X)
    def test_beta(self,
        # NOTE(review): the rest of the signature and the opening docstring
        # delimiter are missing from this copy. The body below reads
        # b0_vals, param_nums, maxiter and ftol, so those parameters are
        # expected; restore the signature from the authoritative source.
        Returns the profile log likelihood for regression parameters
        'param_num' at 'b0_vals.'

        b0_vals : list
            The value of parameters to be tested
        param_num : list
            Which parameters to be tested
        maxiter : int, optional
            How many iterations to use in the EM algorithm.  Default is 30
        ftol : float, optional
            The function tolerance for the EM optimization.
            Default is 10''**''-5
        print_weights : bool
            If true, returns the weights tate maximize the profile
            log likelihood. Default is False


        test_results : tuple
            The log-likelihood and p-pvalue of the test.


        The function will warn if the EM reaches the maxiter.  However, when
        optimizing over nuisance parameters, it is possible to reach a
        maximum number of inner iterations for a specific value for the
        nuisance parameters while the resultsof the function are still valid.
        This usually occurs when the optimization over the nuisance parameters
        selects parameter values that yield a log-likihood ratio close to


        >>> import statsmodels.api as sm
        >>> import numpy as np

        # Test parameter is .05 in one regressor no intercept model
        >>> data=sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, x, cens)
        >>> res=model.test_beta([0], [0])
        >>> res
        (1.4657739632606308, 0.22601365256959183)

        #Test slope is 0 in  model with intercept

        >>> data=sm.datasets.heart.load(as_pandas=False)
        >>> y = np.log10(data.endog)
        >>> x = data.exog
        >>> cens = data.censors
        >>> model = sm.emplike.emplikeAFT(y, sm.add_constant(x), cens)
        >>> res = model.test_beta([0], [1])
        >>> res
        (4.623487775078047, 0.031537049752572731)
        # NOTE(review): the closing docstring delimiter is missing here in
        # this copy; executable code starts on the next line.
        censors = self.model.censors
        endog = self.model.endog
        exog = self.model.exog
        # Boolean masks: censors == 1 marks uncensored observations,
        # censors == 0 marks censored ones.
        uncensored = (censors == 1).flatten()
        censored = (censors == 0).flatten()
        uncens_endog = endog[uncensored]
        uncens_exog = exog[uncensored, :]
        # OLS on the uncensored observations only; its el_test supplies the
        # initial weights used below.
        reg_model = OLS(uncens_endog, uncens_exog).fit()
        llr, pval, new_weights = reg_model.el_test(
            b0_vals, param_nums, return_weights=True)  # Needs to be changed
        km = self.model._make_km(endog, censors).flatten()  # when merged
        uncens_nobs = self.model.uncens_nobs
        F = np.asarray(new_weights).reshape(uncens_nobs)
        # Step 0 ^
        params = self.params()
        survidx = np.where(censors == 0)
        survidx = survidx[0] - np.arange(len(survidx[0]))
        numcensbelow = np.int_(np.cumsum(1 - censors))
        # All parameters are under test: no nuisance parameters to optimise.
        if len(param_nums) == len(params):
            # NOTE(review): this call is truncated in this copy -- its
            # remaining arguments and closing parenthesis are missing.
            llr = self._EM_test([],
            return llr, chi2.sf(llr, self.model.nvar)
            # NOTE(review): an `else:` and a `try:` line appear to be missing
            # before the following statements (see the indentation and the
            # dangling `except` below); the optimize.fmin call is also
            # missing its first arguments and its closing parenthesis.
            x0 = np.delete(params, param_nums)
                res = optimize.fmin(
                    (params, param_nums, b0_vals, F, survidx, uncens_nobs,
                     numcensbelow, km, uncensored, censored, maxiter, ftol),

                llr = res[1]
                return llr, chi2.sf(llr, len(param_nums))
            except np.linalg.linalg.LinAlgError:
                return np.inf, 0
Exemple #14
 def setupClass(cls):
     """Fit OLS (res1) and WLS without explicit weights (res2) on Longley."""
     dataset = longley.load()
     # The constant column is appended after the regressors (prepend=False).
     dataset.exog = add_constant(dataset.exog, prepend=False)
     response, design = dataset.endog, dataset.exog
     cls.res1 = OLS(response, design).fit()
     cls.res2 = WLS(response, design).fit()
def lm_test_glm(result, exog_extra, mean_deriv=None):
    '''score/lagrange multiplier test for GLM

    Wooldridge procedure for test of mean function in GLM

    Parameters
    ----------
    results : GLMResults instance
        results instance with the constrained model
    exog_extra : ndarray or None
        additional exogenous variables for variable addition test
        This can be set to None if mean_deriv is provided.
    mean_deriv : None or ndarray
        Extra moment condition that correspond to the partial derivative of
        a mean function with respect to some parameters.

    Returns
    -------
    test_results : Results instance
        The results instance has the following attributes which are score
        statistic and p-value for 3 versions of the score test.

        c1, pval1 : nonrobust score_test results
        c2, pval2 : score test results robust to over or under dispersion
        c3, pval3 : score test results fully robust to any heteroscedasticity

        The test results instance also has a simple summary method.

    Raises
    ------
    ValueError
        If both `exog_extra` and `mean_deriv` are None.

    Notes
    -----
    TODO: add `df` to results and make df detection more robust

    This implements the auxiliary regression procedure of Wooldridge,
    implemented based on the presentation in chapter 8 in Handbook of
    Applied Econometrics 2.

    References
    ----------
    Wooldridge, Jeffrey M. 1997. “Quasi-Likelihood Methods for Count Data.”
    Handbook of Applied Econometrics 2: 352–406.

    and other articles and text book by Wooldridge

    '''
    # Unwrap a results wrapper if there is one.
    if hasattr(result, '_result'):
        res = result._result
    else:
        res = result

    mod = result.model
    nobs = mod.endog.shape[0]

    dlinkinv = mod.family.link.inverse_deriv

    def dm(x, linpred):
        # derivative of mean function w.r.t. beta (linear params)
        return dlinkinv(linpred)[:, None] * x

    var_func = mod.family.variance

    x = result.model.exog
    x2 = exog_extra

    # test omitted
    lin_pred = res.predict(linear=True)
    dm_incl = dm(x, lin_pred)
    if x2 is not None:
        dm_excl = dm(x2, lin_pred)
        if mean_deriv is not None:
            # allow both and stack
            dm_excl = np.column_stack((dm_excl, mean_deriv))
    elif mean_deriv is not None:
        dm_excl = mean_deriv
    else:
        raise ValueError('either exog_extra or mean_deriv have to be provided')

    # TODO check for rank or redundant, note OLS calculates the rank
    k_constraint = dm_excl.shape[1]
    fittedvalues = res.predict()  # discrete has linpred instead of mean
    v = var_func(fittedvalues)
    std = np.sqrt(v)
    res_ols1 = OLS(res.resid_response / std,
                   np.column_stack((dm_incl, dm_excl)) / std[:, None]).fit()

    # case: nonrobust assumes variance implied by distribution is correct
    c1 = res_ols1.ess
    pval1 = stats.chi2.sf(c1, k_constraint)

    # case: robust to dispersion
    c2 = nobs * res_ols1.rsquared
    pval2 = stats.chi2.sf(c2, k_constraint)

    # case: robust to heteroscedasticity
    from statsmodels.stats.multivariate_tools import partial_project
    pp = partial_project(dm_excl / std[:, None], dm_incl / std[:, None])
    resid_p = res.resid_response / std
    res_ols3 = OLS(np.ones(nobs), pp.resid * resid_p[:, None]).fit()
    # Wooldridge uses nobs * rsquared; ess is simpler if endog is ones.
    c3b = res_ols3.ess
    pval3 = stats.chi2.sf(c3b, k_constraint)

    # NOTE(review): only `c1=c1,` survived in the original call; the other
    # keywords are restored from the attributes listed in the docstring.
    tres = TestResults(c1=c1, pval1=pval1,
                       c2=c2, pval2=pval2,
                       c3=c3b, pval3=pval3)

    return tres
Exemple #16
 def setup_class(cls):
     """Fit TheilGLS via ``fit(0)`` (res1) and plain OLS (res2) on one sample."""
     endog, exog = cls.get_sample()
     cls.res1 = TheilGLS(endog, exog, sigma_prior=[0, 0, 1., 1.]).fit(0)
     cls.res2 = OLS(endog, exog).fit()
def cm_test_robust(resid, resid_deriv, instruments, weights=1):
    '''score/lagrange multiplier of Wooldridge

    generic version of Wooldridge procedure for test of conditional moments

    Limitation: This version allows only for one unconditional moment
    restriction, i.e. resid is scalar for each observation.
    Another limitation is that it assumes independent observations, no
    correlation in residuals and weights cannot be replaced by cross-observation
    whitening.

    Parameters
    ----------
    resid : ndarray, (nobs, )
        conditional moment restriction, E(r | x, params) = 0
    resid_deriv : ndarray, (nobs, k_params)
        derivative of conditional moment restriction with respect to parameters
    instruments : ndarray, (nobs, k_instruments)
        indicator variables of Wooldridge, multiplies the conditional moment
        restrictions
    weights : ndarray
        This is a weights function as used in WLS. The moment
        restrictions are multiplied by weights. This corresponds to the
        inverse of the variance in a heteroskedastic model.

    Returns
    -------
    stat : float
        Test statistic computed from the outer-product-of-gradient covariance.
    pval : float
        p-value of `stat` from the chi2 survival function with
        ``k_constraint`` degrees of freedom.
    stat2 : float
        Explained sum of squares of the auxiliary OLS regression of ones on
        the test moments, returned for checking.

    Notes
    -----
    This implements the auxiliary regression procedure of Wooldridge,
    implemented based on procedure 2.1 in Wooldridge 1990.

    Wooldridge allows for multivariate conditional moments (`resid`)
    TODO: check dimensions for multivariate case for extension

    References
    ----------
    Wooldridge
    and more Wooldridge

    '''
    # notation: Wooldridge uses too many Greek letters
    # instruments is capital lambda
    # resid is small phi
    # resid_deriv is capital phi
    # weights is C

    nobs = resid.shape[0]

    from statsmodels.stats.multivariate_tools import partial_project

    w_sqrt = np.sqrt(weights)
    if np.size(weights) > 1:
        # Turn per-observation weights into a column for broadcasting.
        w_sqrt = w_sqrt[:, None]
    pp = partial_project(instruments * w_sqrt, resid_deriv * w_sqrt)
    mom_resid = pp.resid

    moms_test = mom_resid * resid[:, None] * w_sqrt

    # we get this here in case we extend resid to be more than 1-D
    k_constraint = moms_test.shape[1]

    # use OPG variance as in Wooldridge 1990. This might generalize
    cov = moms_test.T.dot(moms_test)
    diff = moms_test.sum(0)

    # see Wooldridge last page in appendix
    stat = diff.dot(np.linalg.solve(cov, diff))

    # for checking, this corresponds to nobs * rsquared of auxiliary regression
    stat2 = OLS(np.ones(nobs), moms_test).fit().ess
    pval = stats.chi2.sf(stat, k_constraint)

    return stat, pval, stat2
Exemple #18
    def test_ols_noncentrality(self):
        # Checks that noncentrality / effect size / power computed through the
        # generic Wald helpers agree with the OLS Wald test and the one-way
        # ANOVA helpers, to rtol=1e-13.
        # NOTE(review): several calls below are truncated in this copy (their
        # remaining arguments and closing parentheses are missing); each one
        # is marked. Restore them from the authoritative source.
        k = self.k_groups

        res_ols = OLS(self.y, self.ex).fit()
        nobs_t = res_ols.model.nobs

        # constraint
        # Each row is a contrast: +1 on the first parameter, -1 on one other.
        c_equal = -np.eye(k)[1:]
        c_equal[:, 0] = 1
        v = np.zeros(c_equal.shape[0])

        # noncentrality at estimated parameters
        wt = res_ols.wald_test(c_equal, scalar=True)
        df_num, df_denom = wt.df_num, wt.df_denom

        cov_p = res_ols.cov_params()

        # NOTE(review): truncated call.
        nc_wt = wald_test_noncent_generic(res_ols.params,
        assert_allclose(nc_wt, wt.statistic * wt.df_num, rtol=1e-13)

        # NOTE(review): truncated call.
        nc_wt2 = wald_test_noncent(res_ols.params,
        assert_allclose(nc_wt2, nc_wt, rtol=1e-13)

        es_ols = nc_wt / nobs_t
        # NOTE(review): truncated call.
        es_oneway = smo.effectsize_oneway(res_ols.params,
        assert_allclose(es_ols, es_oneway, rtol=1e-13)

        alpha = 0.05
        # NOTE(review): both power calls below are truncated.
        pow_ols = smpwr.ftest_power(np.sqrt(es_ols),
        pow_oneway = smpwr.ftest_anova_power(np.sqrt(es_oneway),
        assert_allclose(pow_ols, pow_oneway, rtol=1e-13)

        # noncentrality at other params
        params_alt = res_ols.params * 0.75
        # compute constraint value so we can get noncentrality from wald_test
        v_off = _offset_constraint(c_equal, res_ols.params, params_alt)
        wt_off = res_ols.wald_test((c_equal, v + v_off), scalar=True)
        # NOTE(review): truncated call; the following line looks like the
        # tail of an assert_allclose whose beginning is missing.
        nc_wt_off = wald_test_noncent_generic(params_alt,
                        wt_off.statistic * wt_off.df_num,

        # check vectorized version, joint=False
        # NOTE(review): truncated call.
        nc_wt_vec = wald_test_noncent_generic(params_alt,
        for i in range(c_equal.shape[0]):
            # NOTE(review): some arguments and the closing parenthesis of
            # this call are missing.
            nc_wt_i = wald_test_noncent_generic(
                c_equal[i:i + 1],  # noqa
                v[i:i + 1],
                diff=None,  # noqa
            assert_allclose(nc_wt_vec[i], nc_wt_i, rtol=1e-13)
def dispersion_poisson_generic(results,
    # NOTE(review): the rest of the signature is missing from this copy. The
    # body reads exog_new_test, exog_new_control, include_score, use_endog
    # and cov_type, so those parameters are expected; their defaults cannot
    # be determined from here.
    """A variable addition test for the variance function

    This uses an artificial regression to calculate a variant of an LM or
    generalized score test for the specification of the variance assumption
    in a Poisson model. The performed test is a Wald test on the coefficients
    of the `exog_new_test`.

    Warning: insufficiently tested, especially for options

    # NOTE(review): the closing docstring delimiter is missing here in this
    # copy; executable code starts below.
    if hasattr(results, '_results'):
        results = results._results

    endog = results.model.endog
    nobs = endog.shape[0]  #TODO: use attribute, may need to be added
    # fitted = results.fittedvalues  # generic has linpred as fittedvalues
    fitted = results.predict()
    resid2 = results.resid_response**2
    #the following assumes Poisson
    # NOTE(review): an `else:` line appears to be missing between the two
    # var_resid assignments; as written the second one always overwrites the
    # first when use_endog is true.
    if use_endog:
        var_resid = (resid2 - endog)
        var_resid = (resid2 - fitted)

    endog_v = var_resid / fitted

    k_constraints = exog_new_test.shape[1]
    ex_list = [exog_new_test]
    # NOTE(review): the statements that presumably append the score and the
    # control variables to ex_list are missing after the next assignments.
    if include_score:
        score_obs = results.model.score_obs(results.params)

    if exog_new_control is not None:

    # NOTE(review): an `else:` line appears to be missing before the
    # `ex = ex_list[0]` assignment.
    if len(ex_list) > 1:
        ex = np.column_stack(ex_list)
        use_wald = True
        ex = ex_list[0]  # no control variables in exog
        use_wald = False

    # NOTE(review): truncated call -- remaining arguments and the closing
    # parenthesis are missing.
    res_ols = OLS(endog_v, ex).fit(cov_type=cov_type,

    # NOTE(review): an `else:` line appears to be missing before the
    # "we do not have controls" branch.
    if use_wald:
        # we have controls and need to test coefficients
        k_vars = ex.shape[1]
        # Selects the first k_constraints coefficients of the regression.
        constraints = np.eye(k_constraints, k_vars)
        ht = res_ols.wald_test(constraints)
        stat_ols = ht.statistic
        pval_ols = ht.pvalue
        # we do not have controls and can use overall fit
        nobs = endog_v.shape[0]
        rsquared_noncentered = 1 - res_ols.ssr / res_ols.uncentered_tss
        stat_ols = nobs * rsquared_noncentered
        pval_ols = stats.chi2.sf(stat_ols, k_constraints)

    return stat_ols, pval_ols
Exemple #20
def kpss(x, regression='c', lags=None, store=False):
    """
    Kwiatkowski-Phillips-Schmidt-Shin test for stationarity.

    Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
    hypothesis that x is level or trend stationary.

    Parameters
    ----------
    x : array_like, 1d
        Data series
    regression : str{'c', 'ct'}
        Indicates the null hypothesis for the KPSS test
        * 'c' : The data is stationary around a constant (default)
        * 'ct' : The data is stationary around a trend
    lags : int
        Indicates the number of lags to be used. If None (default),
        lags is set to int(12 * (n / 100)**(1 / 4)), as outlined in
        Schwert (1989).
    store : bool
        If True, then a result instance is returned additionally to
        the KPSS statistic (default is False).

    Returns
    -------
    kpss_stat : float
        The KPSS test statistic
    p_value : float
        The p-value of the test. The p-value is interpolated from
        Table 1 in Kwiatkowski et al. (1992), and a boundary point
        is returned if the test statistic is outside the table of
        critical values, that is, if the p-value is outside the
        interval (0.01, 0.1).
    lags : int
        The truncation lag parameter
    crit : dict
        The critical values at 10%, 5%, 2.5% and 1%. Based on
        Kwiatkowski et al. (1992).
    resstore : (optional) instance of ResultStore
        An instance of a dummy class with results attached as attributes

    Raises
    ------
    ValueError
        If `x` is not one-dimensional in effect (``len(x) != x.size``) or
        `regression` is neither 'c' nor 'ct'.

    Notes
    -----
    To estimate sigma^2 the Newey-West estimator is used. If lags is None,
    the truncation lag parameter is set to int(12 * (n / 100) ** (1 / 4)),
    as outlined in Schwert (1989). The p-values are interpolated from
    Table 1 of Kwiatkowski et al. (1992). If the computed statistic is
    outside the table of critical values, then a warning message is
    generated.

    Missing values are not handled.

    When `store` is True the returned tuple is
    ``(kpss_stat, p_value, crit_dict, rstore)``; `lags` is then available
    as ``rstore.lags`` rather than as a tuple element.

    References
    ----------
    D. Kwiatkowski, P. C. B. Phillips, P. Schmidt, and Y. Shin (1992): Testing
    the Null Hypothesis of Stationarity against the Alternative of a Unit Root.
    `Journal of Econometrics` 54, 159-178.
    """
    from warnings import warn
    # NOTE(review): the warning category was cut off in the original warn()
    # calls; InterpolationWarning is assumed -- confirm against the
    # authoritative source.
    from statsmodels.tools.sm_exceptions import InterpolationWarning

    nobs = len(x)
    x = np.asarray(x)
    hypo = regression.lower()

    # if m is not one, n != m * n
    if nobs != x.size:
        raise ValueError("x of shape {0} not understood".format(x.shape))

    if hypo == 'ct':
        # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t,
        # where beta is the trend, r_t a random walk and e_t a stationary
        # error term.
        resids = OLS(x, add_constant(np.arange(1, nobs + 1))).fit().resid
        crit = [0.119, 0.146, 0.176, 0.216]
    elif hypo == 'c':
        # special case of the model above, where beta = 0 (so the null
        # hypothesis is that the data is stationary around r_0).
        resids = x - x.mean()
        crit = [0.347, 0.463, 0.574, 0.739]
    else:
        raise ValueError("hypothesis '{0}' not understood".format(hypo))

    if lags is None:
        # from Kwiatkowski et al. referencing Schwert (1989)
        lags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    pvals = [0.10, 0.05, 0.025, 0.01]

    eta = sum(resids.cumsum()**2) / (nobs**2)  # eq. 11, p. 165
    s_hat = _sigma_est_kpss(resids, nobs, lags)

    kpss_stat = eta / s_hat
    # np.interp clamps to the end points, so a statistic outside the table
    # yields exactly pvals[0] or pvals[-1]; those cases are warned about.
    p_value = np.interp(kpss_stat, crit, pvals)

    if p_value == pvals[-1]:
        warn("p-value is smaller than the indicated p-value",
             InterpolationWarning)
    elif p_value == pvals[0]:
        warn("p-value is greater than the indicated p-value",
             InterpolationWarning)

    crit_dict = {'10%': crit[0], '5%': crit[1], '2.5%': crit[2], '1%': crit[3]}

    if store:
        rstore = ResultsStore()
        rstore.lags = lags
        rstore.nobs = nobs

        stationary_type = "level" if hypo == 'c' else "trend"
        rstore.H0 = "The series is {0} stationary".format(stationary_type)
        rstore.HA = "The series is not {0} stationary".format(stationary_type)

        return kpss_stat, p_value, crit_dict, rstore
    else:
        return kpss_stat, p_value, lags, crit_dict
def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs=None, obs_labels=True, label_kwargs=None,
                     ax=None, ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
       endogenous or response variable. If string is given, you can use a
       arbitrary translations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use a
        arbitrary translations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in formula. You can use a arbitrary translations
        as with a formula. The effect of these variables will be removed by
        OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict, optional
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict. ``None`` (the default) means no extra arguments.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict, optional
        Keyword arguments that control annotate for the observation labels.
        The dict is copied, so the caller's object is never modified.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure being
        created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Raises
    ------
    ValueError
        If the observation labels do not have the same length as `exog_i`.

    Notes
    -----
    The slope of the fitted line is the that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # Never share (or mutate) a dict between calls: title_kwargs is only read,
    # label_kwargs is updated below and is therefore copied.
    title_kwargs = {} if title_kwargs is None else title_kwargs
    label_kwargs = {} if label_kwargs is None else dict(label_kwargs)

    # strings, use patsy to transform to data
    if isinstance(endog, string_types):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, string_types):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isemtpy = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isemtpy = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isemtpy = True
    if isinstance(exog_i, string_types):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like

    if RHS_isemtpy:
        # Nothing to partial out: the raw series are plotted directly.
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i,
                                              np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(
            endog, np.ndarray) else endog.design_info.column_names[0]
        # No auxiliary regressions exist in this branch, so the plotted
        # points themselves serve as annotation / return coordinates.
        res_xaxis = None
        x_coords = np.ravel(endog)
        y_coords = np.ravel(exog_i)
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()
        x_coords = xaxis_resid
        y_coords = yaxis_resid

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        elif res_xaxis is not None:
            obs_labels = res_xaxis.model.data.row_labels
        else:
            obs_labels = None
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array-like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(x_coords, y_coords),
                                 [(0, 5)] * len(obs_labels), "x-large", ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (x_coords, y_coords)
    else:
        return fig
Exemple #22
def adfuller(x, maxlag=None, regression="c", autolag='AIC',
             store=False, regresults=False):
    """
    Augmented Dickey-Fuller unit root test

    The Augmented Dickey-Fuller test can be used to test for a unit root in a
    univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        data series
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}
    regression : {'c','ct','ctt','nc'}
        Constant and trend order to include in regression

        * 'c' : constant only (default)
        * 'ct' : constant and trend
        * 'ctt' : constant, and linear and quadratic trend
        * 'nc' : no constant, no trend

        ``None`` and the integers 0, 1, 2 are also accepted and are mapped
        to 'nc', 'c', 'ct' and 'ctt' respectively.
    autolag : {'AIC', 'BIC', 't-stat', None}
        * if None, then maxlag lags are used
        * if 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False
    regresults : bool, optional
        If True, the full regression results are returned. Default is False

    Returns
    -------
    adf : float
        Test statistic
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010)
    usedlag : int
        Number of lags used
    nobs : int
        Number of observations used for the ADF regression and calculation of
        the critical values
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010)
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes

    Raises
    ------
    ValueError
        If `regression` is not one of the supported options.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a unit
    root, with the alternative that there is no unit root. If the pvalue is
    above a critical size, then we cannot reject that there is a unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is close
    to significant, then the critical values should be used to judge whether
    to reject the null.

    The autolag option and maxlag for it are described in Greene.

    Examples
    --------
    See example notebook

    References
    ----------
    .. [*] W. Greene.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [*] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [*] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution functions for
        unit-root and cointegration tests.  `Journal of Business and Economic
        Statistics` 12, 167-76.

    .. [*] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."  Queen's
        University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html
    """
    # Local import keeps this block self-contained; Integral covers plain
    # ints, Python 2 longs and numpy integer scalars alike.
    import numbers

    if regresults:
        store = True

    trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'}
    if regression is None or isinstance(regression, numbers.Integral):
        regression = trenddict[regression]
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        # The format operator must be applied to the message, not to the
        # exception instance (which raised TypeError instead of ValueError).
        raise ValueError("regression option %s not understood" % regression)
    x = np.asarray(x)
    nobs = x.shape[0]

    if maxlag is None:
        #from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in')
    nobs = xdall.shape[0]  # pylint: disable=E1103

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        resstore = ResultsStore()
    if autolag:
        if regression != 'nc':
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1  # 1 for level  # pylint: disable=E1103
        #search for lag length with smallest information criteria
        #Note: use the same number of observations to have comparable IC
        #aic and bic: smaller is better

        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, maxlag,
                                       autolag)
        else:
            icbest, bestlag, alres = _autolag(OLS,
                                              xdshort,
                                              fullRHS,
                                              startlag,
                                              maxlag,
                                              autolag,
                                              regresults=regresults)
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        #rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in')
        nobs = xdall.shape[0]  # pylint: disable=E1103
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != 'nc':
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    #    adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are integrated
    # for orders higher than I(1), ie., not ADF but cointegration tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2]
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = 'Augmented Dickey-Fuller Test Results'
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
 def setup_class(cls):
     """Fit OLS on the stackloss data and attach the results to the class.

     ``cls.res1`` holds the fitted OLS results (exog augmented through
     ``add_constant``); ``cls.res2`` holds a fresh ``RegressionResults()``
     instance.
     """
     dataset = stackloss.load()
     exog_with_const = add_constant(dataset.exog)
     dataset.exog = exog_with_const
     fitted = OLS(dataset.endog, dataset.exog).fit()
     cls.res1 = fitted
     cls.res2 = RegressionResults()
Exemple #24
def grangercausalitytests(x, maxlag, addconst=True, verbose=True,
                          saveto=None):
    """four tests for granger non causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on F test which is
    identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    addconst : bool
        include a constant in the regressions. Only ``True`` is implemented.
    verbose : bool
        print results if true
    saveto : str, optional
        path of a text file that receives the verbose report. The file is
        created (or truncated) even when `verbose` is False. If None, the
        report is printed to standard output.
        NOTE(review): position and default of this parameter were not
        visible in the reviewed copy - confirm against callers.

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        teststatistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted
        model and the restriction (contrast) matrix for the parameter f_test.

    Raises
    ------
    ValueError
        If there are too few observations for the requested `maxlag`.
    NotImplementedError
        If `addconst` is false.

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Grange causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    The null hypothesis for all four test is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    Greene: Econometric Analysis
    """
    from scipy import stats

    x = np.asarray(x)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        raise ValueError(
            "Insufficient observations. Maximum allowable "
            "lag is {0}".format(int((x.shape[0] - int(addconst)) / 3) - 1))

    resli = {}

    # print(..., file=None) writes to sys.stdout, so None doubles as the
    # "no report file" marker.
    savetoFile = open(saveto, 'w') if saveto is not None else None

    try:
        for mlg in range(1, maxlag + 1):
            result = {}
            if verbose:
                print('\nGranger Causality', file=savetoFile)
                print('number of lags (no zero)', mlg, file=savetoFile)
            mxlg = mlg

            # create lagmat of both time series
            dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

            #add constant
            if addconst:
                dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
                dtajoint = add_constant(dta[:, 1:], prepend=False)
            else:
                raise NotImplementedError('Not Implemented')
                #dtaown = dta[:, 1:mxlg]
                #dtajoint = dta[:, 1:]

            # Run ols on both models without and with lags of second variable
            res2down = OLS(dta[:, 0], dtaown).fit()
            res2djoint = OLS(dta[:, 0], dtajoint).fit()

            #print results
            #for ssr based tests see: (reference link missing from this copy)
            #the other tests are made-up

            # Granger Causality test using ssr (F statistic)
            fgc1 = ((res2down.ssr - res2djoint.ssr) / res2djoint.ssr / mxlg *
                    res2djoint.df_resid)
            if verbose:
                print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                      ' df_num=%d' %
                      (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                       res2djoint.df_resid, mxlg),
                      file=savetoFile)
            result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg,
                                                    res2djoint.df_resid),
                                   res2djoint.df_resid, mxlg)

            # Granger Causality test using ssr (ch2 statistic)
            fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
            if verbose:
                print('ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, '
                      'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg),
                      file=savetoFile)
            result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

            #likelihood ratio test pvalue:
            lr = -2 * (res2down.llf - res2djoint.llf)
            if verbose:
                print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                      (lr, stats.chi2.sf(lr, mxlg), mxlg),
                      file=savetoFile)
            result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

            # F test that all lag coefficients of exog are zero
            rconstr = np.column_stack((np.zeros(
                (mxlg, mxlg)), np.eye(mxlg, mxlg), np.zeros((mxlg, 1))))
            ftres = res2djoint.f_test(rconstr)
            if verbose:
                print('parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                      ' df_num=%d' %
                      (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num),
                      file=savetoFile)
            result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                      np.squeeze(ftres.pvalue)[()], ftres.df_denom,
                                      ftres.df_num)

            resli[mxlg] = (result, [res2down, res2djoint, rconstr])
    finally:
        # Always release the report file, including when a regression fails.
        if savetoFile is not None:
            savetoFile.close()

    return resli
Exemple #25
# obtain the feature matrix as a numpy array
X = boston.data

# obtain the target variable as a numpy array
y = boston.target

# create a column vector of ones for the intercept term; it is deliberately
# not called `int`, which would shadow the builtin for the rest of the module
intercept = np.ones(shape=y.shape)[..., None]

#...and add to feature matrix
X = np.concatenate((intercept, X), 1)

# calculate coefficients using closed-form solution
coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y)

# extract the feature names of the boston data set and prepend the intercept
feature_names = np.insert(boston.feature_names, 0, 'INT')

# collect results into a DataFrame for pretty printing
results = pd.DataFrame({'coeffs': coeffs}, index=feature_names)

# create a linear model and extract the parameters
coeffs_lm = OLS(y, X).fit().params

# add the coefficients to the results DataFrame
results['coeffs_lm'] = coeffs_lm


Exemple #26
def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
          return_results=None):
    """Test for no-cointegration of a univariate equation

    The null hypothesis is no cointegration. Variables in y0 and y1 are
    assumed to be integrated of order 1, I(1).

    This uses the augmented Engle-Granger two-step cointegration test.
    Constant or trend is included in 1st stage regression, i.e. in
    cointegrating equation.

    **Warning:** The autolag default has changed compared to statsmodels 0.8.
    In 0.8 autolag was always None, now the keyword is used and defaults to
    'aic'. Use `autolag=None` to avoid the lag search.

    Parameters
    ----------
    y0 : array_like, 1d
        first element in cointegrating vector
    y1 : array_like
        remaining elements in cointegrating vector
    trend : str {'c', 'ct'}
        trend term included in regression for cointegrating equation

        * 'c' : constant
        * 'ct' : constant and linear trend
        * also available quadratic trend 'ctt', and no constant 'nc'

    method : string
        currently only 'aeg' for augmented Engle-Granger test is available.
        default might change.
    maxlag : None or int
        keyword for `adfuller`, largest or given number of lags
    autolag : string
        keyword for `adfuller`, lag selection criterion.

        * if None, then maxlag lags are used without lag search
        * if 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test

    return_results : bool
        for future compatibility, currently only tuple available.
        If True, then a results instance is returned. Otherwise, a tuple
        with the test outcome is returned.
        Set `return_results=False` to avoid future changes in return.

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate, asymptotic p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels based on regression curve. This depends on the number of
        observations.

    Raises
    ------
    ValueError
        If `trend` is not one of 'c', 'nc', 'ct', 'ctt'.

    Notes
    -----
    The Null hypothesis is that there is no cointegration, the alternative
    hypothesis is that there is cointegrating relationship. If the pvalue is
    small, below a critical size, then we can reject the hypothesis that there
    is no cointegrating relationship.

    P-values and critical values are obtained through regression surface
    approximation from MacKinnon 1994 and 2010.

    If the two series are almost perfectly collinear, then computing the
    test is numerically unstable. However, the two series will be cointegrated
    under the maintained assumption that they are integrated. In this case
    the t-statistic will be set to -inf and the pvalue to zero.

    TODO: We could handle gaps in data by dropping rows with nans in the
    auxiliary regressions. Not implemented yet, currently assumes no nans
    and no gaps in time series.

    References
    ----------
    MacKinnon, J.G. 1994  "Approximate Asymptotic Distribution Functions for
        Unit-Root and Cointegration Tests." Journal of Business & Economics
        Statistics, 12.2, 167-76.
    MacKinnon, J.G. 2010.  "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics Working Papers 1227.
    """
    trend = trend.lower()
    if trend not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("trend option %s not understood" % trend)
    y0 = np.asarray(y0)
    y1 = np.asarray(y1)
    if y1.ndim < 2:
        y1 = y1[:, None]
    nobs, k_vars = y1.shape
    k_vars += 1  # add 1 for y0

    # First stage: cointegrating regression of y0 on y1 (plus trend terms).
    if trend == 'nc':
        xx = y1
    else:
        xx = add_trend(y1, trend=trend, prepend=False)

    res_co = OLS(y0, xx).fit()

    # Second stage: unit-root test on the first-stage residuals.
    if res_co.rsquared < 1 - 100 * SQRTEPS:
        res_adf = adfuller(res_co.resid,
                           maxlag=maxlag,
                           autolag=autolag,
                           regression='nc')
    else:
        import warnings
        warnings.warn("y0 and y1 are (almost) perfectly colinear. "
                      "Cointegration test is not reliable in this case.")
        # Edge case where series are too similar
        res_adf = (-np.inf, )

    # no constant or trend, see egranger in Stata and MacKinnon
    if trend == 'nc':
        crit = [np.nan] * 3  # 2010 critical values not available
    else:
        crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1)
        #  nobs - 1, the -1 is to match egranger in Stata, I don't know why.
        #  TODO: check nobs or df = nobs - k

    pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars)
    return res_adf[0], pval_asy, crit
Exemple #27
def _computeLinearModel(inputSample, outputSample, detection, noiseThres,
                        saturationThres, boxCox, censored):
    """
    Run filterCensoredData and build the linear regression model.
    It is defined as a simple function because it is also needed in a loop for
    the bootstrap based POD.

    Parameters
    ----------
    inputSample, outputSample
        The defect sizes and the corresponding signals.
    detection
        Detection threshold; Box Cox transformed when `boxCox` is true.
    noiseThres, saturationThres
        Censoring thresholds (may be None); only used when `censored` is true.
    boxCox : bool
        Enable the Box Cox transformation of the signals.
    censored : bool
        Filter the censored data and refine the OLS estimates by MLE.

    Returns
    -------
    dict
        Keys 'defects', 'signals', 'intercept', 'slope', 'stderr',
        'residuals', 'detection', 'lambdaBoxCox' and 'graphBoxCox'. The last
        two are None when `boxCox` is false.
    """

    #################### Filter censored data ##############################
    if censored:
        # Filter censored data
        defects, defectsNoise, defectsSat, signals = \
            DataHandling.filterCensoredData(inputSample, outputSample,
                          noiseThres, saturationThres)
    else:
        defects, signals = inputSample, outputSample

    defectsSize = defects.getSize()

    ###################### Box Cox transformation ##########################
    # Compute Box Cox if enabled
    if boxCox:
        # optimization required, get optimal lambda without graph
        lambdaBoxCox, graphBoxCox = computeBoxCox(defects, signals)

        # Transformation of data
        boxCoxTransform = ot.BoxCoxTransform([lambdaBoxCox])
        signals = boxCoxTransform(signals)
        if censored:
            # the thresholds must live on the same scale as the signals
            if noiseThres is not None:
                noiseThres = boxCoxTransform([noiseThres])[0]
            if saturationThres is not None:
                saturationThres = boxCoxTransform([saturationThres])[0]
        detectionBoxCox = boxCoxTransform([detection])[0]
    else:
        detectionBoxCox = detection
        lambdaBoxCox = None
        graphBoxCox = None

    ######################### Linear Regression model ######################
    # Linear regression with statsmodels module
    # Create the X matrix : [1, inputSample]
    X = ot.NumericalSample(defectsSize, [1, 0])
    X[:, 1] = defects
    algoLinear = OLS(np.array(signals), np.array(X)).fit()

    intercept = algoLinear.params[0]
    slope = algoLinear.params[1]
    # get standard error estimates (residuals standard deviation)
    stderr = np.sqrt(algoLinear.scale)
    # get residuals from algoLinear
    residuals = ot.NumericalSample(np.vstack(algoLinear.resid))

    if censored:
        # define initial starting point for MLE optimization
        initialStartMLE = [intercept, slope, stderr]
        # MLE optimization
        res = computeLinearParametersCensored(initialStartMLE, defects,
            defectsNoise, defectsSat, signals, noiseThres, saturationThres)
        intercept = res[0]
        slope = res[1]
        stderr = res[2]
        # residuals are recomputed with the MLE estimates
        residuals = signals - (intercept + slope * defects)

    return {'defects':defects, 'signals':signals, 'intercept':intercept,
            'slope':slope, 'stderr':stderr, 'residuals':residuals,
            'detection':detectionBoxCox, 'lambdaBoxCox':lambdaBoxCox,
            'graphBoxCox':graphBoxCox}
def gls(endog,
        exog=None,
        order=(0, 0, 0),
        seasonal_order=(0, 0, 0, 0),
        include_constant=None,
        n_iter=None,
        max_iter=50,
        tolerance=1e-8,
        arma_estimator='innovations_mle',
        arma_estimator_kwargs=None):
    """
    Estimate ARMAX parameters by GLS.

    Parameters
    ----------
    endog : array_like
        Input time series array.
    exog : array_like, optional
        Array of exogenous regressors. If not included, then `include_constant`
        must be True, and then `exog` will only include the constant column.
    order : tuple, optional
        The (p,d,q) order of the ARIMA model. Default is (0, 0, 0).
    seasonal_order : tuple, optional
        The (P,D,Q,s) order of the seasonal ARIMA model.
        Default is (0, 0, 0, 0).
    include_constant : bool, optional
        Whether to add a constant term in `exog` if it's not already there.
        The estimate of the constant will then appear as one of the `exog`
        parameters. If `exog` is None, then the constant will represent the
        mean of the process. Default is True if the specified model does not
        include integration and False otherwise.
    n_iter : int, optional
        Optionally iterate feasible GLS a specific number of times. Default is
        to iterate to convergence. If set, this argument overrides the
        `max_iter` and `tolerance` arguments.
    max_iter : int, optional
        Maximum number of feasible GLS iterations. Default is 50. If `n_iter`
        is set, it overrides this argument.
    tolerance : float, optional
        Tolerance for determining convergence of feasible GLS iterations. If
        `n_iter` is set, this argument has no effect.
        Default is 1e-8.
    arma_estimator : str, optional
        The estimator used for estimating the ARMA model. This option should
        not generally be used, unless the default method is failing or is
        otherwise unsuitable. Not all values will be valid, depending on the
        specified model orders (`order` and `seasonal_order`). Possible values
        are:

        * 'innovations_mle' - can be used with any specification
        * 'statespace' - can be used with any specification
        * 'hannan_rissanen' - can be used with any ARMA non-seasonal model
        * 'yule_walker' - only non-seasonal consecutive
          autoregressive (AR) models
        * 'burg' - only non-seasonal, consecutive autoregressive (AR) models
        * 'innovations' - only non-seasonal, consecutive moving
          average (MA) models.

        The default is 'innovations_mle'.
    arma_estimator_kwargs : dict, optional
        Arguments to pass to the ARMA estimator.

    Returns
    -------
    parameters : SARIMAXParams object
        Contains the parameter estimates from the final iteration.
    other_results : Bunch
        Includes eight components: `spec`, `params`, `converged`,
        `differences`, `iterations`, `arma_estimator`, 'arma_estimator_kwargs',
        and `arma_results`.

    Raises
    ------
    ValueError
        If a constant is requested for an integrated model, or if the
        estimated autoregressive parameters are non-stationary.

    Notes
    -----
    The primary reference is [1]_, section 6.6. In particular, the
    implementation follows the iterative procedure described in section 6.6.2.
    Construction of the transformed variables used to compute the GLS estimator
    described in section 6.6.1 is done via an application of the innovations
    algorithm (rather than explicit construction of the transformation matrix).

    Note that if the specified model includes integration, both the `endog` and
    `exog` series will be differenced prior to estimation and a warning will
    be issued to alert the user.

    References
    ----------
    .. [1] Brockwell, Peter J., and Richard A. Davis. 2016.
       Introduction to Time Series and Forecasting. Springer.
    """
    # Handle n_iter
    if n_iter is not None:
        max_iter = n_iter
        tolerance = np.inf

    # Default for include_constant is True if there is no integration and
    # False otherwise
    integrated = order[1] > 0 or seasonal_order[1] > 0
    if include_constant is None:
        include_constant = not integrated
    elif include_constant and integrated:
        raise ValueError('Cannot include a constant in an integrated model.')

    # Handle including the constant (need to do it now so that the constant
    # parameter can be included in the specification as part of `exog`.)
    if include_constant:
        exog = np.ones_like(endog) if exog is None else add_constant(exog)

    # Create the SARIMAX specification
    spec = SARIMAXSpecification(endog,
                                exog=exog,
                                order=order,
                                seasonal_order=seasonal_order)
    endog = spec.endog
    exog = spec.exog

    # Handle integration
    if spec.is_integrated:
        # TODO: this is the approach suggested by BD (see Remark 1 in
        # section 6.6.2 and Example 6.6.3), but maybe there are some cases
        # where we don't want to force this behavior on the user?
        warnings.warn('Provided `endog` and `exog` series have been'
                      ' differenced to eliminate integration prior to GLS'
                      ' parameter estimation.')
        endog = diff(endog,
                     k_diff=spec.diff,
                     k_seasonal_diff=spec.seasonal_diff,
                     seasonal_periods=spec.seasonal_periods)
        exog = diff(exog,
                    k_diff=spec.diff,
                    k_seasonal_diff=spec.seasonal_diff,
                    seasonal_periods=spec.seasonal_periods)
    augmented = np.c_[endog, exog]

    # Validate arma_estimator
    spec.validate_estimator(arma_estimator)
    if arma_estimator_kwargs is None:
        arma_estimator_kwargs = {}

    # Step 1: OLS
    mod_ols = OLS(endog, exog)
    res_ols = mod_ols.fit()
    exog_params = res_ols.params
    resid = res_ols.resid

    # 0th iteration parameters
    p = SARIMAXParams(spec=spec)
    p.exog_params = exog_params
    if spec.max_ar_order > 0:
        p.ar_params = np.zeros(spec.k_ar_params)
    if spec.max_seasonal_ar_order > 0:
        p.seasonal_ar_params = np.zeros(spec.k_seasonal_ar_params)
    if spec.max_ma_order > 0:
        p.ma_params = np.zeros(spec.k_ma_params)
    if spec.max_seasonal_ma_order > 0:
        p.seasonal_ma_params = np.zeros(spec.k_seasonal_ma_params)
    p.sigma2 = res_ols.scale

    ar_params = p.ar_params
    seasonal_ar_params = p.seasonal_ar_params
    ma_params = p.ma_params
    seasonal_ma_params = p.seasonal_ma_params
    sigma2 = p.sigma2

    # Step 2 - 4: iterate feasible GLS to convergence
    # Index 0 of each list belongs to the OLS step, which has no ARMA fit and
    # no previous iterate to compare against.
    arma_results = [None]
    differences = [None]
    parameters = [p]
    converged = False if n_iter is None else None
    i = 0
    for i in range(1, max_iter + 1):
        prev = exog_params

        # Step 2: ARMA
        # TODO: allow estimator-specific kwargs?
        if arma_estimator == 'yule_walker':
            p_arma, res_arma = yule_walker(resid,
                                           ar_order=spec.ar_order,
                                           demean=False,
                                           **arma_estimator_kwargs)
        elif arma_estimator == 'burg':
            p_arma, res_arma = burg(resid,
                                    ar_order=spec.ar_order,
                                    demean=False,
                                    **arma_estimator_kwargs)
        elif arma_estimator == 'innovations':
            out, res_arma = innovations(resid,
                                        ma_order=spec.ma_order,
                                        demean=False,
                                        **arma_estimator_kwargs)
            p_arma = out[-1]
        elif arma_estimator == 'hannan_rissanen':
            p_arma, res_arma = hannan_rissanen(resid,
                                               ar_order=spec.ar_order,
                                               ma_order=spec.ma_order,
                                               demean=False,
                                               **arma_estimator_kwargs)
        else:
            # For later iterations, use a "warm start" for parameter estimates
            # (speeds up estimation and convergence)
            start_params = (None if i == 1 else np.r_[ar_params, ma_params,
                                                      seasonal_ar_params,
                                                      seasonal_ma_params,
                                                      sigma2])
            # Note: in each case, we do not pass in the order of integration
            # since we have already differenced the series
            tmp_order = (spec.order[0], 0, spec.order[2])
            tmp_seasonal_order = (spec.seasonal_order[0], 0,
                                  spec.seasonal_order[2],
                                  spec.seasonal_order[3])
            if arma_estimator == 'innovations_mle':
                p_arma, res_arma = innovations_mle(
                    resid,
                    order=tmp_order,
                    seasonal_order=tmp_seasonal_order,
                    demean=False,
                    start_params=start_params,
                    **arma_estimator_kwargs)
            else:
                p_arma, res_arma = statespace(
                    resid,
                    order=tmp_order,
                    seasonal_order=tmp_seasonal_order,
                    include_constant=False,
                    start_params=start_params,
                    **arma_estimator_kwargs)

        ar_params = p_arma.ar_params
        seasonal_ar_params = p_arma.seasonal_ar_params
        ma_params = p_arma.ma_params
        seasonal_ma_params = p_arma.seasonal_ma_params
        sigma2 = p_arma.sigma2
        arma_results.append(res_arma)

        # Step 3: GLS
        # Compute transformed variables that satisfy OLS assumptions
        # Note: In section 6.1.1 of Brockwell and Davis (2016), these
        # transformations are developed as computed by left multiplcation
        # by a matrix T. However, explicitly constructing T and then
        # performing the left-multiplications does not scale well when nobs is
        # large. Instead, we can retrieve the transformed variables as the
        # residuals of the innovations algorithm (the `normalize=True`
        # argument applies a Prais-Winsten-type normalization to the first few
        # observations to ensure homoskedasticity). Brockwell and Davis
        # mention that they also take this approach in practice.

        # GH-6540: AR must be stationary

        if not p_arma.is_stationary:
            raise ValueError(
                "Roots of the autoregressive parameters indicate that data is "
                "non-stationary. GLS cannot be used with non-stationary "
                "parameters. You should consider differencing the model data "
                "or applying a nonlinear transformation (e.g., natural log).")
        tmp, _ = arma_innovations.arma_innovations(augmented,
                                                   ar_params=ar_params,
                                                   ma_params=ma_params,
                                                   normalize=True)
        u = tmp[:, 0]
        x = tmp[:, 1:]

        # OLS on transformed variables
        mod_gls = OLS(u, x)
        res_gls = mod_gls.fit()
        exog_params = res_gls.params
        resid = endog - np.dot(exog, exog_params)

        # Construct the parameter vector for the iteration
        p = SARIMAXParams(spec=spec)
        p.exog_params = exog_params
        if spec.max_ar_order > 0:
            p.ar_params = ar_params
        if spec.max_seasonal_ar_order > 0:
            p.seasonal_ar_params = seasonal_ar_params
        if spec.max_ma_order > 0:
            p.ma_params = ma_params
        if spec.max_seasonal_ma_order > 0:
            p.seasonal_ma_params = seasonal_ma_params
        p.sigma2 = sigma2
        parameters.append(p)

        # Check for convergence
        difference = np.abs(exog_params - prev)
        differences.append(difference)
        if n_iter is None and np.all(difference < tolerance):
            converged = True
            break
    else:
        # Only reached when the loop ran out of iterations without `break`.
        if n_iter is None:
            warnings.warn('Feasible GLS failed to converge in %d iterations.'
                          ' Consider increasing the maximum number of'
                          ' iterations using the `max_iter` argument or'
                          ' reducing the required tolerance using the'
                          ' `tolerance` argument.' % max_iter)

    # Construct final results
    p = parameters[-1]
    other_results = Bunch({
        'spec': spec,
        'params': parameters,
        'converged': converged,
        'differences': differences,
        'iterations': i,
        'arma_estimator': arma_estimator,
        'arma_estimator_kwargs': arma_estimator_kwargs,
        'arma_results': arma_results,
    })

    return p, other_results
Exemple #29
def test_bool_regressor(reset_randomstate):
    """OLS on boolean regressors must match OLS on the same data as doubles."""
    nobs, k_exog = 100, 2
    # draw order matters: regressors first, then the response
    bool_exog = np.random.randint(0, 2, size=(nobs, k_exog)).astype(bool)
    response = np.random.standard_normal(nobs)

    params_from_bool = OLS(response, bool_exog).fit().params
    params_from_float = OLS(response, bool_exog.astype(np.double)).fit().params

    assert_allclose(params_from_bool, params_from_float)
Exemple #30
 def test_norm_resid_zero_variance(self):
     """Regress endog on itself and check scale and residual agreement.

     Asserts that the scale is (numerically) zero and that ``wresid``
     matches ``resid_pearson``.
     """
     # warnings raised inside the block are recorded instead of shown
     with warnings.catch_warnings(record=True):
         endog = self.res1.model.endog
         self_fit = OLS(endog, endog).fit()
         scale = self_fit.scale
         assert_allclose(scale, 0, atol=1e-20)
         whitened = self_fit.wresid
         pearson = self_fit.resid_pearson
         assert_allclose(whitened, pearson, atol=5e-11)