Example #1
def test_int_like(integer):
    assert isinstance(int_like(integer, "integer"), int)
    assert isinstance(int_like(integer, "integer", optional=True), int)
    assert int_like(None, "floating", optional=True) is None
    if isinstance(integer, (int, np.integer)):
        assert isinstance(int_like(integer, "integer", strict=True), int)
        assert int_like(None, "floating", optional=True, strict=True) is None
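For context, a minimal sketch of the validator these tests exercise. The real implementation lives in statsmodels.tools.validation; this reduction (int_like_sketch and its exact behavior are illustrative assumptions, not the library code) only shows the optional/strict semantics the assertions rely on:

import numpy as np

def int_like_sketch(value, name, optional=False, strict=False):
    # Illustrative reduction of int_like; not the statsmodels implementation.
    if optional and value is None:
        return None
    if isinstance(value, bool):
        raise TypeError(f"{name} must be an integer, not bool")
    if isinstance(value, (int, np.integer)):
        return int(value)
    if (not strict and isinstance(value, (float, np.floating))
            and float(value).is_integer()):
        # Lenient mode: accept floats carrying an exact integer value.
        return int(value)
    raise TypeError(f"{name} must be integer-like")

print(int_like_sketch(3, "n"))                    # 3
print(int_like_sketch(None, "n", optional=True))  # None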
Example #2
 def __init__(self, endog, exog, window=None, weights=None, min_nobs=None,
              missing='drop'):
     # Call Model.__init__ twice to use const detection in first pass
     # But to not drop in the second pass
     missing = string_like(missing, 'missing',
                           options=('drop', 'raise', 'skip'))
     temp_msng = 'drop' if missing != 'raise' else 'raise'
     Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
     k_const = self.k_constant
     const_idx = self.data.const_idx
     Model.__init__(self, endog, exog, missing='none', hasconst=False)
     self.k_constant = k_const
     self.data.const_idx = const_idx
     self._y = array_like(endog, 'endog')
     nobs = self._y.shape[0]
      self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
     window = int_like(window, 'window', optional=True)
     weights = array_like(weights, 'weights', optional=True, shape=(nobs,))
     self._window = window if window is not None else self._y.shape[0]
     self._weighted = weights is not None
     self._weights = np.ones(nobs) if weights is None else weights
     w12 = np.sqrt(self._weights)
     self._wy = w12 * self._y
     self._wx = w12[:, None] * self._x
      self._is_nan = np.zeros_like(self._y, dtype=bool)
     self._has_nan = self._find_nans()
     self.const_idx = self.data.const_idx
     self._skip_missing = missing == 'skip'
     min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
     self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
     if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
          raise ValueError('min_nobs must be at least the number of '
                           'regressors in the model and no larger '
                           'than window')
Example #3
def test_int_like(integer):
    assert isinstance(int_like(integer, 'integer'), int)
    assert isinstance(int_like(integer, 'integer', optional=True), int)
    assert int_like(None, 'floating', optional=True) is None
    if isinstance(integer, (int, np.integer)):
        assert isinstance(int_like(integer, 'integer', strict=True), int)
        assert int_like(None, 'floating', optional=True, strict=True) is None
Example #4
    def __init__(
        self,
        endog,
        exog,
        window=None,
        *,
        weights=None,
        min_nobs=None,
        missing="drop",
        expanding=False
    ):
        # Call Model.__init__ twice to use const detection in first pass
        # But to not drop in the second pass
        missing = string_like(
            missing, "missing", options=("drop", "raise", "skip")
        )
        temp_msng = "drop" if missing != "raise" else "raise"
        Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
        k_const = self.k_constant
        const_idx = self.data.const_idx
        Model.__init__(self, endog, exog, missing="none", hasconst=False)
        self.k_constant = k_const
        self.data.const_idx = const_idx
        self._y = array_like(endog, "endog")
        nobs = self._y.shape[0]
        self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
        window = int_like(window, "window", optional=True)
        weights = array_like(weights, "weights", optional=True, shape=(nobs,))
        self._window = window if window is not None else self._y.shape[0]
        self._weighted = weights is not None
        self._weights = np.ones(nobs) if weights is None else weights
        w12 = np.sqrt(self._weights)
        self._wy = w12 * self._y
        self._wx = w12[:, None] * self._x

        min_nobs = int_like(min_nobs, "min_nobs", optional=True)
        self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
        if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
            raise ValueError(
                "min_nobs must be at least the number of "
                "regressors in the model and no larger than window"
            )

        self._expanding = expanding

        self._is_nan = np.zeros_like(self._y, dtype=bool)
        self._has_nan = self._find_nans()
        self.const_idx = self.data.const_idx
        self._skip_missing = missing == "skip"
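A short usage sketch of the class this constructor belongs to, assuming it is RollingOLS from statsmodels.regression.rolling (the import path and the shape of the result attributes are assumptions based on the statsmodels API):

import numpy as np
from statsmodels.regression.rolling import RollingOLS

rng = np.random.default_rng(0)
x = np.column_stack([np.ones(500), rng.standard_normal(500)])
y = x @ np.array([1.0, 0.5]) + rng.standard_normal(500)
# Estimate OLS over a moving 60-observation window.
res = RollingOLS(y, x, window=60).fit()
print(res.params.shape)  # (500, 2); rows before the first full window are NaN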
Example #5
def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0):
    """
    Calculate the Anderson-Darling a2 statistic.

    Parameters
    ----------
    x : array_like
        The data to test.
    dist : {'norm', callable}
        The assumed distribution under the null hypothesis of the test.
    fit : bool
        If True, then the distribution parameters are estimated.
        Currently only supported for 1d data, except when dist='norm'.
    params : tuple
        The optional distribution parameters if fit is False.
    axis : int
        If dist is 'norm' or fit is False, the data can be n-dimensional
        and axis specifies the axis along which the statistic is computed.

    Returns
    -------
    {float, ndarray}
        The Anderson-Darling statistic.
    """
    x = array_like(x, 'x', ndim=None)
    fit = bool_like(fit, 'fit')
    axis = int_like(axis, 'axis')
    y = np.sort(x, axis=axis)
    nobs = y.shape[axis]
    if fit:
        if dist == 'norm':
            xbar = np.expand_dims(np.mean(x, axis=axis), axis)
            s = np.expand_dims(np.std(x, ddof=1, axis=axis), axis)
            w = (y - xbar) / s
            z = stats.norm.cdf(w)
        elif callable(dist):
            params = dist.fit(x)
            z = dist.cdf(y, *params)
        else:
            raise ValueError("dist must be 'norm' or a Callable")
    else:
        if callable(dist):
            z = dist.cdf(y, *params)
        else:
            raise ValueError('if fit is false, then dist must be callable')

    i = np.arange(1, nobs + 1)
    sl1 = [None] * x.ndim
    sl1[axis] = slice(None)
    sl1 = tuple(sl1)
    sl2 = [slice(None)] * x.ndim
    sl2[axis] = slice(None, None, -1)
    sl2 = tuple(sl2)
    s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])),
               axis=axis)
    a2 = -nobs - s
    return a2
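A hedged usage sketch (the statsmodels.stats.diagnostic import path is an assumption and may vary by version):

import numpy as np
from statsmodels.stats.diagnostic import anderson_statistic

rng = np.random.default_rng(0)
x = rng.standard_normal(500)
# Estimate the normal parameters from the data and compute A^2.
a2 = anderson_statistic(x, dist='norm', fit=True)
print(a2)  # small values are consistent with the normality null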
Example #6
    def forecast_components(self, steps: int = 1) -> pd.DataFrame:
        r"""
        Compute the three components of the Theta model forecast

        Parameters
        ----------
        steps : int
            The number of steps ahead to compute the forecast components.

        Returns
        -------
        DataFrame
            A DataFrame with three columns: trend, ses and seasonal containing
            the forecast values of each of the three components.

        Notes
        -----
        For a given value of :math:`\theta`, the deseasonalized forecast is
        `fcast = w * trend + ses` where :math:`w = \frac{\theta - 1}{\theta}`.
        The reseasonalized forecasts are then `seasonal * fcast` if the
        seasonality is multiplicative or `seasonal + fcast` if the seasonality
        is additive.
        """
        steps = int_like(steps, "steps")
        if steps < 1:
            raise ValueError("steps must be a positive integer")
        alpha = self._alpha
        b0 = self._b0
        nobs = self._nobs
        h = np.arange(1, steps + 1, dtype=np.float64) - 1
        if alpha > 0:
            h += 1 / alpha - ((1 - alpha)**nobs / alpha)
        trend = b0 * h
        ses = self._one_step * np.ones(steps)
        if self.model.method.startswith("add"):
            season = np.zeros(steps)
        else:
            season = np.ones(steps)
        # Re-seasonalize
        if self.model.deseasonalize:
            seasonal = self._seasonal
            period = self.model.period
            oos_idx = nobs + np.arange(steps)
            seasonal_locs = oos_idx % period
            if seasonal.shape[0]:
                season[:] = seasonal[seasonal_locs]
        index = getattr(self.model.endog_orig, "index", None)
        if index is None:
            index = pd.RangeIndex(0, self.model.endog_orig.shape[0])
        index = extend_index(steps, index)

        df = pd.DataFrame(
            {"trend": trend, "ses": ses, "seasonal": season}, index=index
        )
        return df
Example #7
def detrend(x, order=1, axis=0):
    """
    Detrend an array with a trend of given order along axis 0 or 1.

    Parameters
    ----------
    x : array_like, 1d or 2d
        The data; if 2d, each row or column is detrended independently
        using the same trend order but separate trend estimates.
    order : int
        The polynomial order of the trend, zero is constant, one is
        linear trend, two is quadratic trend.
    axis : int
        Axis can be either 0, observations by rows, or 1, observations by
        columns.

    Returns
    -------
    ndarray
        The detrended series is the residual of the linear regression of the
        data on the trend of given order.
    """
    order = int_like(order, "order")
    axis = int_like(axis, "axis")

    if x.ndim == 2 and int(axis) == 1:
        x = x.T
    elif x.ndim > 2:
        raise NotImplementedError(
            "x.ndim > 2 is not implemented until it is needed")

    nobs = x.shape[0]
    if order == 0:
        # Special case demean
        resid = x - x.mean(axis=0)
    else:
        trends = np.vander(np.arange(float(nobs)), N=order + 1)
        beta = np.linalg.pinv(trends).dot(x)
        resid = x - np.dot(trends, beta)

    if x.ndim == 2 and int(axis) == 1:
        resid = resid.T

    return resid
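A small usage sketch of detrend (assuming the statsmodels.tsa.tsatools import path):

import numpy as np
from statsmodels.tsa.tsatools import detrend

t = np.arange(100.0)
y = 3.0 + 0.5 * t + np.random.default_rng(1).standard_normal(100)
resid0 = detrend(y, order=0)  # demeaning only; the linear trend remains
resid1 = detrend(y, order=1)  # residual after removing the fitted line
print(round(resid1.mean(), 10))  # approximately zero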
Example #8
def commutation_matrix(p, q):
    """
    Create the commutation matrix K_{p,q} satisfying vec(A') = K_{p,q} vec(A)

    Parameters
    ----------
    p : int
        The number of rows of A.
    q : int
        The number of columns of A.

    Returns
    -------
    K : ndarray
        The commutation matrix with shape (pq, pq).
    """
    p = int_like(p, "p")
    q = int_like(q, "q")

    K = np.eye(p * q)
    indices = np.arange(p * q).reshape((p, q), order="F")
    return K.take(indices.ravel(), axis=0)
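The defining identity can be checked directly against the implementation above. Here vec is taken as column-stacking (Fortran-order ravel), an assumption consistent with the docstring's vec(A') = K_{p,q} vec(A):

import numpy as np

p, q = 3, 2
A = np.arange(p * q, dtype=float).reshape(p, q)
K = commutation_matrix(p, q)
vec = lambda M: M.ravel(order="F")  # stack columns
print(np.allclose(K @ vec(A), vec(A.T)))  # True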
Example #9
def duplication_matrix(n):
    """
    Create duplication matrix D_n which satisfies vec(S) = D_n vech(S) for
    symmetric matrix S

    Parameters
    ----------
    n : int
        The dimension of the symmetric matrix S.

    Returns
    -------
    D_n : ndarray
        The duplication matrix with shape (n**2, n(n+1)/2).
    """
    n = int_like(n, "n")
    tmp = np.eye(n * (n + 1) // 2)
    return np.array([unvech(x).ravel() for x in tmp]).T
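A quick check of the vec/vech identity against the code above. The vech helper here assumes the upper-triangle, row-major ordering that unvech inverts; for a symmetric matrix this coincides with the usual lower-triangle, column-major convention:

import numpy as np

n = 3
S = np.arange(1.0, n * n + 1).reshape(n, n)
S = S + S.T  # symmetrize
vec = lambda M: M.ravel(order="F")
vech = lambda M: M[np.triu_indices(len(M))]  # assumed ordering, see above
D = duplication_matrix(n)
print(np.allclose(D @ vech(S), vec(S)))  # True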
Example #10
 def __init__(self,
              endog,
              trend=None,
              damped=False,
              seasonal=None,
              seasonal_periods=None,
              dates=None,
              freq=None,
              missing='none'):
     super(ExponentialSmoothing, self).__init__(endog,
                                                None,
                                                dates,
                                                freq,
                                                missing=missing)
     self._y = self._data = array_like(endog,
                                       'endog',
                                       contiguous=True,
                                       order='C')
     options = ("add", "mul", "additive", "multiplicative")
     trend = string_like(trend, 'trend', options=options, optional=True)
     if trend in ['additive', 'multiplicative']:
         trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
     self.trend = trend
     self.damped = bool_like(damped, 'damped')
     seasonal = string_like(seasonal,
                            'seasonal',
                            options=options,
                            optional=True)
     if seasonal in ['additive', 'multiplicative']:
         seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
     self.seasonal = seasonal
     self.trending = trend in ['mul', 'add']
     self.seasoning = seasonal in ['mul', 'add']
     if (self.trend == 'mul' or self.seasonal == 'mul') and \
             not np.all(self._data > 0.0):
          raise ValueError('endog must be strictly positive when using '
                           'multiplicative trend or seasonal components.')
     if self.damped and not self.trending:
         raise ValueError('Can only dampen the trend component')
     if self.seasoning:
         self.seasonal_periods = int_like(seasonal_periods,
                                          'seasonal_periods',
                                          optional=True)
         if seasonal_periods is None:
             self.seasonal_periods = freq_to_period(self._index_freq)
         if self.seasonal_periods <= 1:
             raise ValueError('seasonal_periods must be larger than 1.')
     else:
         self.seasonal_periods = 0
     self.nobs = len(self.endog)
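A brief usage sketch of this Holt-Winters class, assuming it is statsmodels.tsa.holtwinters.ExponentialSmoothing (the synthetic series is illustrative):

import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Linear trend plus a monthly seasonal pattern.
y = 10 + 0.05 * np.arange(48) + np.sin(np.arange(48) * 2 * np.pi / 12)
res = ExponentialSmoothing(y, trend='add', seasonal='add',
                           seasonal_periods=12).fit()
print(res.forecast(12))  # 12 out-of-sample forecasts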
Example #11
def elimination_matrix(n):
    """
    Create the elimination matrix L_n which satisfies vech(M) = L_n vec(M) for
    any matrix M

    Parameters
    ----------
    n : int
        The dimension of the square matrix M.

    Returns
    -------
    ndarray
        The elimination matrix with shape (n(n+1)/2, n**2).
    """
    n = int_like(n, "n")
    vech_indices = vec(np.tril(np.ones((n, n))))
    return np.eye(n * n)[vech_indices != 0]
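Checking the defining property against the code above; vec as Fortran-order ravel is an assumption matching the vec helper the function itself uses:

import numpy as np

n = 3
M = np.arange(1.0, n * n + 1).reshape(n, n)
vec = lambda A: A.ravel(order="F")
L = elimination_matrix(n)
# L_n selects the lower-triangular elements of M, column by column.
print(L @ vec(M))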
Example #12
    def __init__(
        self,
        endog,
        *,
        period: Optional[int] = None,
        deseasonalize: bool = True,
        use_test: bool = True,
        method: str = "auto",
        difference: bool = False
    ) -> None:
        self._y = array_like(endog, "endog", ndim=1)
        if isinstance(endog, pd.DataFrame):
            self.endog_orig = endog.iloc[:, 0]
        else:
            self.endog_orig = endog
        self._period = int_like(period, "period", optional=True)
        self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
        self._use_test = (
            bool_like(use_test, "use_test") and self._deseasonalize
        )
        self._diff = bool_like(difference, "difference")
        self._method = string_like(
            method,
            "model",
            options=("auto", "additive", "multiplicative", "mul", "add"),
        )
        if self._period is None and self._deseasonalize:
            idx = getattr(endog, "index", None)
            pfreq = None
            if idx is not None:
                pfreq = getattr(idx, "freq", None)
                if pfreq is None:
                    pfreq = getattr(idx, "inferred_freq", None)
            if pfreq is not None:
                self._period = freq_to_period(pfreq)
            else:
                raise ValueError(
                    "You must specify a period or endog must be a "
                    "pandas object with a DatetimeIndex with "
                    "a freq not set to None"
                )

        self._has_seasonality = self._deseasonalize
Example #13
def unintegrate_levels(x, d):
    """
    Returns the successive differences needed to unintegrate the series.

    Parameters
    ----------
    x : array_like
        The original series
    d : int
        The number of differences of the differenced series.

    Returns
    -------
    y : ndarray
        The differences of orders 0 through d-1 computed from the first
        d elements of x.

    See Also
    --------
    unintegrate
    """
    d = int_like(d, "d")
    x = x[:d]
    return np.asarray([np.diff(x, d - i)[0] for i in range(d, 0, -1)])
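A round-trip sketch with its companion function (unintegrate and the statsmodels.tsa.tsatools import path are assumptions based on the statsmodels API):

import numpy as np
from statsmodels.tsa.tsatools import unintegrate, unintegrate_levels

x = np.array([1.0, 3.0, 6.0, 10.0, 15.0])
d = 2
levels = unintegrate_levels(x, d)  # initial values needed to undo differencing
restored = unintegrate(np.diff(x, d), levels)
print(np.allclose(restored, x))    # True: the round trip recovers x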
Example #14
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array,
        with observations in rows and variables in columns.
    col : str, int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col` can
        be an int of the zero-based column index. If it's a 1d array `col`
        can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to drop the contemporaneous variable from the result.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If an int, inserts the lags at the given position.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load(as_pandas=False)
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(x) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags.
    """
    lags = int_like(lags, 'lags')
    drop = bool_like(drop, 'drop')

    if x.dtype.names:
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, int):
            col = x.dtype.names[col]

        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_' + 'L(%i)' % i for i in range(1, lags + 1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else:  # insert is an int
            if insert > len(names):
                import warnings
                warnings.warn(
                    "insert > number of variables, inserting at the"
                    " last position", ValueWarning)
            ins_idx = insert

        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names:  # only do this if x is not "empty"
            # Workaround to avoid NumPy FutureWarning
            _x = recarray_select(x, first_names)
            first_arr = nprf.append_fields(_x[lags:],
                                           tmp_names,
                                           ndlags.T,
                                           usemask=False)

        else:
            first_arr = np.zeros(len(x) - lags,
                                 dtype=lzip(tmp_names,
                                            (x[col].dtype, ) * lags))
            for i, name in enumerate(tmp_names):
                first_arr[name] = ndlags[:, i]
        if last_names:
            return nprf.append_fields(first_arr,
                                      last_names,
                                      [x[name][lags:] for name in last_names],
                                      usemask=False)
        else:  # lags for last variable
            return first_arr

    else:  # we have an ndarray

        if x.ndim == 1:  # make 2d if 1d
            x = x[:, None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:, col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0:  # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                insert = x.shape[1]
                import warnings
                warnings.warn(
                    "insert > number of variables, inserting at the"
                    " last position", ValueWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx, x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack(
            (x[lags:, first_cols], ndlags, x[lags:, last_cols]))
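A minimal ndarray usage sketch for this version of add_lag (import path assumed):

import numpy as np
from statsmodels.tsa.tsatools import add_lag

x = np.arange(10.0).reshape(5, 2)   # two variables, observations in rows
out = add_lag(x, col=0, lags=2)
# Two lags of column 0 are inserted after it; 2 leading rows are trimmed.
print(out.shape)  # (3, 4)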
Example #15
def adfuller(
    x,
    maxlag=None,
    regression="c",
    autolag="AIC",
    store=False,
    regresults=False,
):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root in a
    univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}.
    regression : {"c","ct","ctt","nc"}
        Constant and trend order to include in regression.

        * "c" : constant only (default).
        * "ct" : constant and trend.
        * "ctt" : constant, and linear and quadratic trend.
        * "nc" : no constant, no trend.

    autolag : {"AIC", "BIC", "t-stat", None}
        Method to use when automatically determining the lag length among the
        values 0, 1, ..., maxlag.

        * If "AIC" (default) or "BIC", then the number of lags is chosen
          to minimize the corresponding information criterion.
        * "t-stat" based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
        * If None, then the number of included lags is set to maxlag.
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False.
    regresults : bool, optional
        If True, the full regression results are returned. Default is False.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used.
    nobs : int
        The number of observations used for the ADF regression and calculation
        of the critical values.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a unit
    root, with the alternative that there is no unit root. If the pvalue is
    above a critical size, then we cannot reject that there is a unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is close
    to significant, then the critical values should be used to judge whether
    to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Green.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [2] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [3] MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions
        for unit-root and cointegration tests." `Journal of Business and
        Economic Statistics` 12, 167-76.

    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's
        University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
    """
    x = array_like(x, "x")
    maxlag = int_like(maxlag, "maxlag", optional=True)
    regression = string_like(regression,
                             "regression",
                             options=("c", "ct", "ctt", "nc"))
    autolag = string_like(autolag,
                          "autolag",
                          optional=True,
                          options=("aic", "bic", "t-stat"))
    store = bool_like(store, "store")
    regresults = bool_like(regresults, "regresults")

    if regresults:
        store = True

    trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    nobs = x.shape[0]

    ntrend = len(regression) if regression != "nc" else 0
    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
        # -1 for the diff
        maxlag = min(nobs // 2 - ntrend - 1, maxlag)
        if maxlag < 0:
            raise ValueError("sample size is too short to use selected "
                             "regression component")
    elif maxlag > nobs // 2 - ntrend - 1:
        raise ValueError("maxlag must be less than (nobs/2 - 1 - ntrend) "
                         "where n trend is the number of included "
                         "deterministic regressors")
    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in")
    nobs = xdall.shape[0]

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        from statsmodels.stats.diagnostic import ResultsStore

        resstore = ResultsStore()
    if autolag:
        if regression != "nc":
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1
        # 1 for level
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better

        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, maxlag,
                                       autolag)
        else:
            icbest, bestlag, alres = _autolag(
                OLS,
                xdshort,
                fullRHS,
                startlag,
                maxlag,
                autolag,
                regresults=regresults,
            )
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in")
        nobs = xdall.shape[0]
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != "nc":
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    #    adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are integrated
    # for orders higher than I(1), ie., not ADF but cointegration tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2],
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = "Augmented Dickey-Fuller Test Results"
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
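In place of the referenced notebook, a short usage sketch; the six-element return unpacking assumes the default autolag="AIC" and store=False:

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(12345)
y = np.cumsum(rng.standard_normal(250))  # a random walk has a unit root
stat, pvalue, usedlag, nobs, crit, icbest = adfuller(y)
print(f"ADF statistic {stat:.3f}, p-value {pvalue:.3f}")
# Expect a large p-value: the unit-root null cannot be rejected.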
Example #16
def varsim(coefs,
           intercept,
           sig_u,
           steps=100,
           initial_values=None,
           seed=None,
           nsimulations=None):
    """
    Simulate VAR(p) process, given coefficients and assuming Gaussian noise

    Parameters
    ----------
    coefs : ndarray
        Coefficients for the VAR lags of endog.
    intercept : None or ndarray 1-D (neqs,) or (steps, neqs)
        This can be either the intercept for each equation or an offset.
        If None, then the VAR process has a zero intercept.
        If intercept is 1-D, then the same (endog specific) intercept is added
        to all observations.
        If intercept is 2-D, then it is treated as an offset and is added as
        an observation specific intercept to the autoregression. In this case,
        the intercept/offset should have same number of rows as steps, and the
        same number of columns as endogenous variables (neqs).
    sig_u : ndarray
        Covariance matrix of the residuals or innovations.
        If sig_u is None, then an identity matrix is used.
    steps : {None, int}
        The number of observations to simulate, including the initial
        observations needed to start the autoregressive process.
    initial_values : array_like, optional
        Initial values for use in the simulation. Shape should be
        (nlags, neqs) or (neqs,). Values should be ordered from earliest to
        most recent. Note that these values will be returned by the
        simulation as the first values of `endog_simulated` and they
        will count toward the total number of steps.
    seed : {None, int}
        If seed is not None, it is used to seed the random variables
        generated by numpy.random.
    nsimulations : {None, int}
        Number of simulations to perform. If `nsimulations` is None it will
        perform one simulation and return value will have shape (steps, neqs).

    Returns
    -------
    endog_simulated : nd_array
        Endog of the simulated VAR process. Shape will be (nsimulations, steps, neqs)
        or (steps, neqs) if `nsimulations` is None.
    """
    rs = np.random.RandomState(seed=seed)
    rmvnorm = rs.multivariate_normal
    p, k, k = coefs.shape
    nsimulations = int_like(nsimulations, "nsimulations", optional=True)
    if isinstance(nsimulations, int) and nsimulations <= 0:
        raise ValueError("nsimulations must be a positive integer if provided")
    if nsimulations is None:
        result_shape = (steps, k)
        nsimulations = 1
    else:
        result_shape = (nsimulations, steps, k)
    if sig_u is None:
        sig_u = np.eye(k)
    ugen = rmvnorm(np.zeros(len(sig_u)), sig_u,
                   steps * nsimulations).reshape(nsimulations, steps, k)
    result = np.zeros((nsimulations, steps, k))
    if intercept is not None:
        # intercept can be 2-D like an offset variable
        if np.ndim(intercept) > 1:
            if not len(intercept) == ugen.shape[1]:
                raise ValueError('2-D intercept needs to have length `steps`')
        # add intercept/offset also to initial values
        result += intercept
        result[:, p:] += ugen[:, p:]
    else:
        result[:, p:] = ugen[:, p:]

    initial_values = array_like(initial_values,
                                "initial_values",
                                optional=True,
                                maxdim=2)
    if initial_values is not None:
        if not (initial_values.shape == (p, k)
                or initial_values.shape == (k, )):
            raise ValueError(
                "initial_values should have shape (p, k) or (k,) where p is the number of lags and k is the number of equations."
            )
        result[:, :p] = initial_values

    # add in AR terms
    for t in range(p, steps):
        ygen = result[:, t]
        for j in range(p):
            ygen += np.dot(coefs[j], result[:, t - j - 1].T).T

    return result.reshape(result_shape)
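A small simulation sketch (the statsmodels.tsa.vector_ar.util import path is an assumption):

import numpy as np
from statsmodels.tsa.vector_ar.util import varsim

# A stable bivariate VAR(1); coefs has shape (p, k, k) = (1, 2, 2).
coefs = np.array([[[0.5, 0.1],
                   [0.0, 0.4]]])
intercept = np.array([1.0, -1.0])
sig_u = 0.1 * np.eye(2)
sim = varsim(coefs, intercept, sig_u, steps=200, seed=0)
print(sim.shape)  # (200, 2) since nsimulations is None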
Example #17
def lagmat2ds(x,
              maxlag0,
              maxlagex=None,
              dropex=0,
              trim="forward",
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables.

    Parameters
    ----------
    x : array_like
        Data, 2d. Observations in rows and variables in columns.
    maxlag0 : int
        The maximum lag for the first variable; all lags from zero to
        maxlag0 are included.
    maxlagex : {None, int}
        The maximum lag for all other variables; all lags from zero to
        maxlagex are included. If None, maxlag0 is used.
    dropex : int
        Exclude the first dropex lags from the other variables. For all
        variables except the first, lags from dropex to maxlagex are
        included.
    trim : str
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none' : no trimming of observations.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    ndarray
        The array with lagged observations, columns ordered by variable.

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience.
    """
    maxlag0 = int_like(maxlag0, "maxlag0")
    maxlagex = int_like(maxlagex, "maxlagex", optional=True)
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError("Only supports 1 and 2-dimensional data.")

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0],
                      maxlag,
                      trim=trim,
                      original="in",
                      use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k],
                          maxlag,
                          trim=trim,
                          original="in",
                          use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original="in")[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
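A shape-oriented usage sketch (import path assumed):

import numpy as np
from statsmodels.tsa.tsatools import lagmat2ds

x = np.arange(20.0).reshape(10, 2)
# Lags 0..2 of the first variable and, with dropex=0, lags 0..2 of the second.
out = lagmat2ds(x, maxlag0=2)
print(out.shape)  # (10, 6) with the default trim='forward'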
Example #18
    def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
                 initialization_method='estimated', initial_level=None,
                 initial_trend=None, initial_seasonal=None, bounds=None,
                 concentrate_scale=True, dates=None, freq=None,
                 missing='none'):
        # Model definition
        self.trend = bool_like(trend, 'trend')
        self.damped_trend = bool_like(damped_trend, 'damped_trend')
        self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
        self.seasonal = self.seasonal_periods is not None
        self.initialization_method = string_like(
            initialization_method, 'initialization_method').lower()
        self.concentrate_scale = bool_like(concentrate_scale,
                                           'concentrate_scale')

        # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
        # TODO: add `bounds_method` argument to choose between "usual" and
        # "admissible" as in Hyndman et al. (2008)
        self.bounds = bounds
        if self.bounds is None:
            self.bounds = [(1e-4, 1-1e-4)] * 3 + [(0.8, 0.98)]

        # Validation
        if self.seasonal_periods == 1:
            raise ValueError('Cannot have a seasonal period of 1.')

        if self.seasonal and self.seasonal_periods is None:
            raise NotImplementedError('Unable to detect season automatically;'
                                      ' please specify `seasonal_periods`.')

        if self.initialization_method not in ['concentrated', 'estimated',
                                              'simple', 'heuristic', 'known']:
            raise ValueError('Invalid initialization method "%s".'
                             % initialization_method)

        if self.initialization_method == 'known':
            if initial_level is None:
                raise ValueError('`initial_level` argument must be provided'
                                 ' when initialization method is set to'
                                 ' "known".')
            if initial_trend is None and self.trend:
                raise ValueError('`initial_trend` argument must be provided'
                                 ' for models with a trend component when'
                                 ' initialization method is set to "known".')
            if initial_seasonal is None and self.seasonal:
                raise ValueError('`initial_seasonal` argument must be provided'
                                 ' for models with a seasonal component when'
                                 ' initialization method is set to "known".')

        # Initialize the state space model
        if not self.seasonal or self.seasonal_periods is None:
            self._seasonal_periods = 0
        else:
            self._seasonal_periods = self.seasonal_periods

        k_states = 2 + int(self.trend) + self._seasonal_periods
        k_posdef = 1

        init = ss_init.Initialization(k_states, 'known',
                                      constant=[0] * k_states)
        super(ExponentialSmoothing, self).__init__(
            endog, k_states=k_states, k_posdef=k_posdef,
            initialization=init, dates=dates, freq=freq, missing=missing)

        # Concentrate the scale out of the likelihood function
        if self.concentrate_scale:
            self.ssm.filter_concentrated = True

        # Setup fixed elements of the system matrices
        # Observation error
        self.ssm['design', 0, 0] = 1.
        self.ssm['selection', 0, 0] = 1.
        self.ssm['state_cov', 0, 0] = 1.

        # Level
        self.ssm['design', 0, 1] = 1.
        self.ssm['transition', 1, 1] = 1.

        # Trend
        if self.trend:
            self.ssm['transition', 1:3, 2] = 1.

        # Seasonal
        if self.seasonal:
            k = 2 + int(self.trend)
            self.ssm['design', 0, k] = 1.
            self.ssm['transition', k, -1] = 1.
            self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
                np.eye(self.seasonal_periods - 1))

        # Initialization of the states
        if self.initialization_method != 'known':
            msg = ('Cannot give `%%s` argument when initialization is "%s"'
                   % initialization_method)
            if initial_level is not None:
                raise ValueError(msg % 'initial_level')
            if initial_trend is not None:
                raise ValueError(msg % 'initial_trend')
            if initial_seasonal is not None:
                raise ValueError(msg % 'initial_seasonal')

        if self.initialization_method == 'simple':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_simple(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'heuristic':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_heuristic(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'known':
            initial_level = float_like(initial_level, 'initial_level')
            if self.trend:
                initial_trend = float_like(initial_trend, 'initial_trend')
            if self.seasonal:
                initial_seasonal = array_like(initial_seasonal,
                                              'initial_seasonal')

                if len(initial_seasonal) == self.seasonal_periods - 1:
                    initial_seasonal = np.r_[initial_seasonal,
                                             0 - np.sum(initial_seasonal)]

                if len(initial_seasonal) != self.seasonal_periods:
                    raise ValueError(
                        'Invalid length of initial seasonal values. Must be'
                        ' one of s or s-1, where s is the number of seasonal'
                        ' periods.')

        # Note that the simple and heuristic methods of computing initial
        # seasonal factors return estimated seasonal factors associated with
        # the first t = 1, 2, ..., `n_seasons` observations. To use these as
        # the initial state, we lag them by `n_seasons`. This yields, for
        # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
        # As described above, the state vector in this model should have
        # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
        # reverse the order of the computed initial seasonal factors from
        # these methods.
        methods = ['simple', 'heuristic']
        if (self.initialization_method in methods
                and initial_seasonal is not None):
            initial_seasonal = initial_seasonal[::-1]

        self._initial_level = initial_level
        self._initial_trend = initial_trend
        self._initial_seasonal = initial_seasonal
        self._initial_state = None

        # Initialize now if possible (if we have a damped trend, then
        # initialization will depend on the phi parameter, and so has to be
        # done at each `update`)
        methods = ['simple', 'heuristic', 'known']
        if not self.damped_trend and self.initialization_method in methods:
            self._initialize_constant_statespace(initial_level, initial_trend,
                                                 initial_seasonal)

        # Save keys for kwarg initialization
        self._init_keys += ['trend', 'damped_trend', 'seasonal',
                            'initialization_method', 'initial_level',
                            'initial_trend', 'initial_seasonal', 'bounds',
                            'concentrate_scale', 'dates', 'freq', 'missing']
Example #19
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame,
                                                           DataFrame]:
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
Example #20
    def forecast(self, steps: int = 1, theta: float = 2) -> pd.Series:
        r"""
        Forecast the model for a given theta

        Parameters
        ----------
        steps : int
            The number of steps ahead to compute the forecast components.
        theta : float
            The theta value to use when computing the weight to combine
            the trend and the SES forecasts.

        Returns
        -------
        Series
            A Series containing the forecasts

        Notes
        -----
        The forecast is computed as

        .. math::

           \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0
                             \left[h - 1 + \frac{1}{\alpha}
                             - \frac{(1-\alpha)^T}{\alpha} \right]
                             + \tilde{X}_{T+h|T}

        where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous
        variable using the parameter :math:`\alpha`. :math:`b_0` is the
        slope of a time trend line fitted to X using the terms 0, 1, ..., T-1.

        This expression follows from [1]_ and [2]_ when the combination
        weights are restricted to be (theta-1)/theta and 1/theta. This nests
        the original implementation when theta=2 and the two weights are both
        1/2.

        References
        ----------
        .. [1] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method.
           International Journal of Forecasting, 19(2), 287-290.
        .. [2] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos,
           F. (2015). The optimized theta method. arXiv preprint
           arXiv:1503.03529.
        """

        steps = int_like(steps, "steps")
        if steps < 1:
            raise ValueError("steps must be a positive integer")
        theta = float_like(theta, "theta")
        if theta < 1:
            raise ValueError("theta must be a float >= 1")
        thresh = 4.0 / np.finfo(np.double).eps
        trend_weight = (theta - 1) / theta if theta < thresh else 1.0
        comp = self.forecast_components(steps=steps)
        fcast = trend_weight * comp.trend + np.asarray(comp.ses)
        # Re-seasonalize if needed
        if self.model.deseasonalize:
            seasonal = np.asarray(comp.seasonal)
            if self.model.method.startswith("mul"):
                fcast *= seasonal
            else:
                fcast += seasonal
        fcast.name = "forecast"

        return fcast
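A usage sketch for the surrounding results class, assuming it belongs to the Theta model in statsmodels.tsa.forecasting.theta (the import path and the monthly example are assumptions):

import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel

rng = np.random.default_rng(0)
idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(10 + 0.1 * np.arange(120) + rng.standard_normal(120), index=idx)
res = ThetaModel(y).fit()
print(res.forecast(steps=12, theta=2))  # 12 monthly forecasts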
Example #21
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Parameters
    ----------
    x : array_like
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array,
        with observations in rows and variables in columns.
    col : int or None
        `col` can be an int of the zero-based column index. If it's a
        1d array `col` can be None.
    lags : int
        The number of lags desired.
    drop : bool
        Whether to drop the contemporaneous variable from the result.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If an int, inserts the lags at the given position.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load_pandas().data
    >>> data = data[['year', 'quarter', 'realgdp', 'cpi']]
    >>> lagged = sm.tsa.add_lag(data, col=2, lags=2)

    Notes
    -----
    Trims the array both forward and backward, so that the length of the
    returned array is len(x) - lags. The lags are returned in increasing
    order, i.e., t-1, t-2, ..., t-lags.
    """
    lags = int_like(lags, "lags")
    drop = bool_like(drop, "drop")
    x = array_like(x, "x", ndim=2)
    if col is None:
        col = 0

    # handle negative index
    if col < 0:
        col = x.shape[1] + col
    if x.ndim == 1:
        x = x[:, None]
    contemp = x[:, col]

    if insert is True:
        ins_idx = col + 1
    elif insert is False:
        ins_idx = x.shape[1]
    else:
        if insert < 0:  # handle negative index
            insert = x.shape[1] + insert + 1
        if insert > x.shape[1]:
            insert = x.shape[1]

            warnings.warn(
                "insert > number of variables, inserting at the"
                " last position",
                ValueWarning,
            )
        ins_idx = insert

    ndlags = lagmat(contemp, lags, trim="Both")
    first_cols = lrange(ins_idx)
    last_cols = lrange(ins_idx, x.shape[1])
    if drop:
        if col in first_cols:
            first_cols.pop(first_cols.index(col))
        else:
            last_cols.pop(last_cols.index(col))
    return np.column_stack((x[lags:, first_cols], ndlags, x[lags:, last_cols]))
Example #22
    def __init__(self, data, ncomp=None, standardize=True, demean=True,
                 normalize=True, gls=False, weights=None, method='svd',
                 missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
                 max_em_iter=100):
        self._index = None
        self._columns = []
        if isinstance(data, pd.DataFrame):
            self._index = data.index
            self._columns = data.columns

        self.data = array_like(data, "data", ndim=2)
        # Store inputs
        self._gls = bool_like(gls, "gls")
        self._normalize = bool_like(normalize, "normalize")
        self._tol = float_like(tol, "tol")
        if not 0 < self._tol < 1:
            raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "int_like")
        self._max_em_iter = int_like(max_em_iter, "max_em_iter")
        self._tol_em = float_like(tol_em, "tol_em")

        # Prepare data
        self._standardize = bool_like(standardize, "standardize")
        self._demean = bool_like(demean, "demean")

        self._nobs, self._nvar = self.data.shape
        weights = array_like(weights, "weights", maxdim=1, optional=True)
        if weights is None:
            weights = np.ones(self._nvar)
        else:
            weights = np.array(weights).flatten()
            if weights.shape[0] != self._nvar:
                raise ValueError('weights should have nvar elements')
            weights = weights / np.sqrt((weights ** 2.0).mean())
        self.weights = weights

        # Check ncomp against maximum
        min_dim = min(self._nobs, self._nvar)
        self._ncomp = min_dim if ncomp is None else ncomp
        if self._ncomp > min_dim:
            import warnings

            warn = 'The requested number of components is more than can be ' \
                   'computed from data. The maximum number of components is ' \
                   'the minimum of the number of observations or variables'
            warnings.warn(warn, ValueWarning)
            self._ncomp = min_dim

        self._method = method
        # Workaround to avoid instance methods in __dict__
        if self._method not in ('eig', 'svd', 'nipals'):
            raise ValueError('method {0} is not known.'.format(method))

        self.rows = np.arange(self._nobs)
        self.cols = np.arange(self._nvar)
        # Handle missing
        self._missing = string_like(missing, "missing", optional=True)
        self._adjusted_data = self.data
        self._adjust_missing()

        # Update size
        self._nobs, self._nvar = self._adjusted_data.shape
        if self._ncomp == np.min(self.data.shape):
            self._ncomp = np.min(self._adjusted_data.shape)
        elif self._ncomp > np.min(self._adjusted_data.shape):
            raise ValueError('When adjusting for missing values, user '
                             'provided ncomp must be no larger than the '
                             'smallest dimension of the '
                             'missing-value-adjusted data size.')

        # Attributes and internal values
        self._tss = 0.0
        self._ess = None
        self.transformed_data = None
        self._mu = None
        self._sigma = None
        self._ess_indiv = None
        self._tss_indiv = None
        self.scores = self.factors = None
        self.loadings = None
        self.coeff = None
        self.eigenvals = None
        self.eigenvecs = None
        self.projection = None
        self.rsquare = None
        self.ic = None

        # Prepare data
        self.transformed_data = self._prepare_data()
        # Perform the PCA
        self._pca()
        if gls:
            self._compute_gls_weights()
            self.transformed_data = self._prepare_data()
            self._pca()

        # Final calculations
        self._compute_rsquare_and_ic()
        if self._index is not None:
            self._to_pandas()
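A minimal usage sketch for the constructor above, assuming it is statsmodels' PCA (the class name and import path are inferred, not shown in the snippet):

import numpy as np
from statsmodels.multivariate.pca import PCA  # assumed import path

rng = np.random.default_rng(0)
x = rng.standard_normal((100, 5))        # 100 observations, 5 variables
pc = PCA(x, ncomp=2, standardize=True)   # inputs validated as in __init__ above
print(pc.factors.shape)                  # (100, 2): component scores
print(pc.loadings.shape)                 # (5, 2): variable loadings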
Ejemplo n.º 23
    def fit(
        self,
        method="inv",
        cov_type="nonrobust",
        cov_kwds=None,
        reset=None,
        use_t=False,
        params_only=False,
    ):
        """
        Estimate model parameters.

        Parameters
        ----------
        method : {'inv', 'lstsq', 'pinv'}
            Method to use when computing the model parameters.

            * 'inv' - use moving window inner-products and matrix inversion.
              This method is the fastest, but may be less accurate than the
              other methods.
            * 'lstsq' - Use numpy.linalg.lstsq
            * 'pinv' - Use numpy.linalg.pinv. This method matches the default
              estimator in non-moving regression estimators.
        cov_type : {'nonrobust', 'HCCM', 'HC0'}
            Covariance estimator:

            * nonrobust - The classic OLS covariance estimator
            * HCCM, HC0 - White heteroskedasticity robust covariance
        cov_kwds : dict
            Unused
        reset : int, optional
            Interval to recompute the moving window inner products used to
            estimate the model parameters. Smaller values improve accuracy,
            although in practice this setting rarely needs to be changed.
        use_t : bool, optional
            Flag indicating to use the Student's t distribution when computing
            p-values.
        params_only : bool, optional
            Flag indicating that only parameters should be computed. Avoids
            calculating all other statistics or performing inference.

        Returns
        -------
        RollingRegressionResults
            Estimation results where all pre-sample values are nan-filled.
        """
        method = string_like(method,
                             "method",
                             options=("inv", "lstsq", "pinv"))
        reset = int_like(reset, "reset", optional=True)
        reset = self._y.shape[0] if reset is None else reset
        if reset < 1:
            raise ValueError("reset must be a positive integer")

        nobs, k = self._x.shape
        store = RollingStore(
            params=np.full((nobs, k), np.nan),
            ssr=np.full(nobs, np.nan),
            llf=np.full(nobs, np.nan),
            nobs=np.zeros(nobs, dtype=int),
            s2=np.full(nobs, np.nan),
            xpxi=np.full((nobs, k, k), np.nan),
            xeex=np.full((nobs, k, k), np.nan),
            centered_tss=np.full(nobs, np.nan),
            uncentered_tss=np.full(nobs, np.nan),
        )
        w = self._window
        first = self._min_nobs if self._expanding else w
        xpx, xpy, nobs = self._reset(first)
        if not (self._has_nan[first - 1] and self._skip_missing):
            self._fit_single(first, xpx, xpy, nobs, store, params_only, method)
        wx, wy = self._wx, self._wy
        for i in range(first + 1, self._x.shape[0] + 1):
            if self._has_nan[i - 1] and self._skip_missing:
                continue
            if i % reset == 0:
                xpx, xpy, nobs = self._reset(i)
            else:
                if not self._is_nan[i - w - 1] and i > w:
                    remove_x = wx[i - w - 1:i - w]
                    xpx -= remove_x.T @ remove_x
                    xpy -= remove_x.T @ wy[i - w - 1:i - w]
                    nobs -= 1
                if not self._is_nan[i - 1]:
                    add_x = wx[i - 1:i]
                    xpx += add_x.T @ add_x
                    xpy += add_x.T @ wy[i - 1:i]
                    nobs += 1

            self._fit_single(i, xpx, xpy, nobs, store, params_only, method)

        return RollingRegressionResults(self, store, self.k_constant, use_t,
                                        cov_type)
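A short usage sketch for fit, assuming the surrounding class is statsmodels.regression.rolling.RollingOLS (inferred from the rolling-window machinery; not shown in the snippet):

import numpy as np
from statsmodels.regression.rolling import RollingOLS  # assumed import path

rng = np.random.default_rng(0)
x = np.column_stack([np.ones(250), rng.standard_normal(250)])
y = x @ np.array([1.0, 0.5]) + rng.standard_normal(250)
res = RollingOLS(y, x, window=60).fit(method="inv", params_only=True)
print(res.params.shape)  # (250, 2); the first 59 rows are NaN-filled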
Ejemplo n.º 24
def test_not_int_like(not_integer):
    with pytest.raises(TypeError):
        int_like(not_integer, "integer")
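To make the failure mode concrete, here is one value the parametrized not_integer fixture would cover (int_like lives in statsmodels.tools.validation):

from statsmodels.tools.validation import int_like

try:
    int_like(3.5, "integer")  # non-integer values raise TypeError
except TypeError as exc:
    print(exc)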
Ejemplo n.º 25
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observations are in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim the trailing padded rows so the output aligns
          with the input; initial observations keep zero-padded lags
        * 'backward' : trim the zero-padded initial observations, keeping
          the trailing extension
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.
    """
    maxlag = int_like(maxlag, 'maxlag')
    use_pandas = bool_like(use_pandas, 'use_pandas')
    trim = string_like(trim,
                       'trim',
                       optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    original = string_like(original, 'original', options=('ex', 'sep', 'in'))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, 'x', ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
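A quick sketch of the pandas path described in the Notes, using the lagmat defined above (column names follow the '.L.' convention in the code):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0], name="y")
lags = lagmat(s, maxlag=2, trim="both", original="ex", use_pandas=True)
print(list(lags.columns))  # ['y.L.1', 'y.L.2']
print(lags.shape)          # (2, 2): the first maxlag rows are trimmed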
Ejemplo n.º 26
    def __init__(
        self,
        data: Union[np.ndarray, pd.Series, pd.DataFrame],
        stats: Union[Sequence[str], None] = None,
        *,
        numeric: bool = True,
        categorical: bool = True,
        alpha: float = 0.05,
        use_t: bool = False,
        percentiles: Sequence[Union[int, float]] = PERCENTILES,
        ntop: int = 5,
    ):
        data_arr = data
        if not isinstance(data, (pd.Series, pd.DataFrame)):
            data_arr = array_like(data, "data", maxdim=2)
        if data_arr.ndim == 1:
            data = pd.Series(data)
        numeric = bool_like(numeric, "numeric")
        categorical = bool_like(categorical, "categorical")
        include = []
        col_types = ""
        if numeric:
            include.append(np.number)
            col_types = "numeric"
        if categorical:
            include.append("category")
            col_types += "and " if col_types != "" else ""
            col_types += "categorical"
        if not numeric and not categorical:
            raise ValueError(
                "At least one of numeric and categorical must be True"
            )
        self._data = pd.DataFrame(data).select_dtypes(include)
        if self._data.shape[1] == 0:
            raise ValueError(
                f"Selecting {col_types} results in an empty DataFrame"
            )
        self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
        self._is_cat_like = [
            is_categorical_dtype(dt) for dt in self._data.dtypes
        ]

        if stats is not None:
            undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
            if undef:
                raise ValueError(
                    f"{', '.join(undef)} are not known statistics"
                )
        self._stats = (
            list(DEFAULT_STATISTICS) if stats is None else list(stats)
        )
        self._ntop = int_like(ntop, "ntop")
        self._compute_top = "top" in self._stats
        self._compute_freq = "freq" in self._stats
        if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
            raise ValueError("top must be a non-negative integer")

        self._compute_perc = "percentiles" in self._stats
        self._percentiles = array_like(
            percentiles, "percentiles", maxdim=1, dtype="d"
        )
        self._percentiles = np.sort(self._percentiles)
        if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
            raise ValueError("percentiles must be distinct")
        if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
            raise ValueError("percentiles must be strictly between 0 and 100")

        # Expand special stats
        replacements = {
            "mode": ["mode", "mode_freq"],
            "ci": ["upper_ci", "lower_ci"],
            "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
            "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
            "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
            "percentiles": [f"{i}%" for i in percentiles],
        }

        for key in replacements:
            if key in self._stats:
                idx = self._stats.index(key)
                self._stats = (
                    self._stats[:idx]
                    + replacements[key]
                    + self._stats[idx + 1 :]
                )

        self._alpha = float_like(alpha, "alpha")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        self._use_t = bool_like(use_t, "use_t")
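A minimal usage sketch for the constructor above, assuming it is statsmodels.stats.descriptivestats.Description (the import path and the .frame summary attribute are inferred, not shown in the snippet):

import numpy as np
import pandas as pd
from statsmodels.stats.descriptivestats import Description  # assumed

df = pd.DataFrame({
    "x": np.arange(10.0),
    "c": pd.Categorical(list("ababababab")),
})
desc = Description(df, numeric=True, categorical=True, alpha=0.05)
print(desc.frame)  # one column of statistics per variable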
Ejemplo n.º 27
def lagmat2ds(x,
              maxlag0,
              maxlagex=None,
              dropex=0,
              trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for the first variable, all lags from zero to maxlag0 are included
    maxlagex : None or int
        maximum lag for all other variables; all lags from zero to maxlagex
        are included (defaults to maxlag0)
    dropex : int (default is 0)
        exclude the first dropex lags from all variables except the first,
        so that lags from dropex to maxlagex are included for them
    trim : str
        * 'forward' : trim the trailing padded rows so the output aligns
          with the input; initial observations keep zero-padded lags
        * 'backward' : trim the zero-padded initial observations, keeping
          the trailing extension
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience
    """
    maxlag0 = int_like(maxlag0, 'maxlag0')
    maxlagex = int_like(maxlagex, 'maxlagex', optional=True)
    trim = string_like(trim,
                       'trim',
                       optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0],
                      maxlag,
                      trim=trim,
                      original='in',
                      use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k],
                          maxlag,
                          trim=trim,
                          original='in',
                          use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
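A brief sketch exercising lagmat2ds with unequal lags, using the lagmat from the previous example:

import numpy as np

x = np.arange(1.0, 9.0).reshape(4, 2)
out = lagmat2ds(x, maxlag0=2, maxlagex=1, trim="both")
print(out.shape)  # (2, 5): lags 0-2 of variable 0, then lags 0-1 of variable 1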