Example #1
 def __init__(self,
              endog,
              trend=None,
              damped=False,
              seasonal=None,
              seasonal_periods=None,
              dates=None,
              freq=None,
              missing='none'):
     super(ExponentialSmoothing, self).__init__(endog,
                                                None,
                                                dates,
                                                freq,
                                                missing=missing)
     self.endog = self.endog
     self._y = self._data = array_like(endog,
                                       'endog',
                                       contiguous=True,
                                       order='C')
     options = ("add", "mul", "additive", "multiplicative")
     trend = string_like(trend, 'trend', options=options, optional=True)
     if trend in ['additive', 'multiplicative']:
         trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
     self.trend = trend
     self.damped = bool_like(damped, 'damped')
     seasonal = string_like(seasonal,
                            'seasonal',
                            options=options,
                            optional=True)
     if seasonal in ['additive', 'multiplicative']:
         seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
     self.seasonal = seasonal
     self.trending = trend in ['mul', 'add']
     self.seasoning = seasonal in ['mul', 'add']
     if (self.trend == 'mul' or self.seasonal == 'mul') and \
             not np.all(self._data > 0.0):
          raise ValueError('endog must be strictly positive when using '
                           'multiplicative trend or seasonal components.')
     if self.damped and not self.trending:
         raise ValueError('Can only dampen the trend component')
     if self.seasoning:
         self.seasonal_periods = int_like(seasonal_periods,
                                          'seasonal_periods',
                                          optional=True)
         if seasonal_periods is None:
             self.seasonal_periods = freq_to_period(self._index_freq)
         if self.seasonal_periods <= 1:
             raise ValueError('seasonal_periods must be larger than 1.')
     else:
         self.seasonal_periods = 0
     self.nobs = len(self.endog)
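The validator calls above can be exercised directly. A minimal sketch, assuming the statsmodels.tools.validation API used in this example (string_like appears to lowercase its input by default before checking options, given that Example #3 below passes lower=False explicitly):

from statsmodels.tools.validation import string_like

# Mixed case is lowercased first (lower=True is the default) and then
# checked against options, so "MUL" validates and is returned as "mul".
trend = string_like("MUL", "trend",
                    options=("add", "mul", "additive", "multiplicative"))
assert trend == "mul"

# None passes through only when optional=True.
assert string_like(None, "trend", optional=True) is None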
Example #2
 def __init__(self, endog, exog, window=None, weights=None, min_nobs=None,
              missing='drop'):
      # Call Model.__init__ twice to use const detection in the first pass,
      # but avoid dropping observations in the second pass
     missing = string_like(missing, 'missing',
                           options=('drop', 'raise', 'skip'))
     temp_msng = 'drop' if missing != 'raise' else 'raise'
     Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
     k_const = self.k_constant
     const_idx = self.data.const_idx
     Model.__init__(self, endog, exog, missing='none', hasconst=False)
     self.k_constant = k_const
     self.data.const_idx = const_idx
     self._y = array_like(endog, 'endog')
     nobs = self._y.shape[0]
      self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
     window = int_like(window, 'window', optional=True)
     weights = array_like(weights, 'weights', optional=True, shape=(nobs,))
     self._window = window if window is not None else self._y.shape[0]
     self._weighted = weights is not None
     self._weights = np.ones(nobs) if weights is None else weights
     w12 = np.sqrt(self._weights)
     self._wy = w12 * self._y
     self._wx = w12[:, None] * self._x
      self._is_nan = np.zeros_like(self._y, dtype=bool)
     self._has_nan = self._find_nans()
     self.const_idx = self.data.const_idx
     self._skip_missing = missing == 'skip'
     min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
     self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
     if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
          raise ValueError('min_nobs must be at least the number of '
                           'regressors in the model and no larger than '
                           'window')
Example #3
    def __init__(self, freq: str, period: str) -> None:
        freq_options: Set[str] = set()
        freq_options.update(
            *[list(val.keys()) for val in self._supported.values()])
        period_options = list(self._supported.keys())

        freq = string_like(freq,
                           "freq",
                           options=tuple(freq_options),
                           lower=False)
        period = string_like(period,
                             "period",
                             options=period_options,
                             lower=False)
        if freq not in self._supported[period]:
            raise ValueError(f"The combination of freq={freq} and "
                             f"period={period} is not supported.")
        super().__init__(freq)
        self._period = period
        self._freq_str = self._freq.freqstr.split("-")[0]
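A short sketch of why lower=False matters above: pandas frequency strings are case-sensitive, so the validator must compare the input as-is (the options tuple below is illustrative, not the class's real table):

from statsmodels.tools.validation import string_like

# With lower=False the input keeps its case and must match an option exactly.
freq = string_like("M", "freq", options=("M", "Q", "A"), lower=False)
assert freq == "M"

try:
    string_like("m", "freq", options=("M", "Q", "A"), lower=False)
except ValueError as exc:
    print(exc)  # freq must be one of: 'M', 'Q', 'A'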
Example #4
def test_string():
    out = string_like('apple', 'value')
    assert out == 'apple'

    out = string_like('apple', 'value', options=('apple', 'banana', 'cherry'))
    assert out == 'apple'

    with pytest.raises(TypeError, match='value must be a string'):
        string_like(1, 'value')
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(b'4', 'value')
    with pytest.raises(ValueError,
                       match='value must be one of: \'apple\','
                       ' \'banana\', \'cherry\''):
        string_like('date', 'value', options=('apple', 'banana', 'cherry'))
Example #5
    def __init__(
        self,
        endog,
        exog,
        window=None,
        *,
        weights=None,
        min_nobs=None,
        missing="drop",
        expanding=False
    ):
        # Call Model.__init__ twice to use const detection in the first pass,
        # but avoid dropping observations in the second pass
        missing = string_like(
            missing, "missing", options=("drop", "raise", "skip")
        )
        temp_msng = "drop" if missing != "raise" else "raise"
        Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
        k_const = self.k_constant
        const_idx = self.data.const_idx
        Model.__init__(self, endog, exog, missing="none", hasconst=False)
        self.k_constant = k_const
        self.data.const_idx = const_idx
        self._y = array_like(endog, "endog")
        nobs = self._y.shape[0]
        self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
        window = int_like(window, "window", optional=True)
        weights = array_like(weights, "weights", optional=True, shape=(nobs,))
        self._window = window if window is not None else self._y.shape[0]
        self._weighted = weights is not None
        self._weights = np.ones(nobs) if weights is None else weights
        w12 = np.sqrt(self._weights)
        self._wy = w12 * self._y
        self._wx = w12[:, None] * self._x

        min_nobs = int_like(min_nobs, "min_nobs", optional=True)
        self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
        if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
            raise ValueError(
                "min_nobs must be at least the number of "
                "regressors in the model and no larger than window"
            )

        self._expanding = expanding

        self._is_nan = np.zeros_like(self._y, dtype=bool)
        self._has_nan = self._find_nans()
        self.const_idx = self.data.const_idx
        self._skip_missing = missing == "skip"
Example #6
def test_string():
    out = string_like("apple", "value")
    assert out == "apple"

    out = string_like("apple", "value", options=("apple", "banana", "cherry"))
    assert out == "apple"

    with pytest.raises(TypeError, match="value must be a string"):
        string_like(1, "value")
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(b"4", "value")
    with pytest.raises(
            ValueError,
            match="value must be one of: 'apple',"
            " 'banana', 'cherry'",
    ):
        string_like("date", "value", options=("apple", "banana", "cherry"))
Example #7
def test_optional_string():
    out = string_like("apple", "value")
    assert out == "apple"

    out = string_like("apple", "value", options=("apple", "banana", "cherry"))
    assert out == "apple"

    out = string_like(None, "value", optional=True)
    assert out is None

    out = string_like(None,
                      "value",
                      optional=True,
                      options=("apple", "banana", "cherry"))
    assert out is None

    with pytest.raises(TypeError, match="value must be a string"):
        string_like(1, "value", optional=True)
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(b"4", "value", optional=True)
Example #8
def test_optional_string():
    out = string_like('apple', 'value')
    assert out == 'apple'

    out = string_like('apple', 'value', options=('apple', 'banana', 'cherry'))
    assert out == 'apple'

    out = string_like(None, 'value', optional=True)
    assert out is None

    out = string_like(None,
                      'value',
                      optional=True,
                      options=('apple', 'banana', 'cherry'))
    assert out is None

    with pytest.raises(TypeError, match='value must be a string'):
        string_like(1, 'value', optional=True)
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(b'4', 'value', optional=True)
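Since pytest.raises interprets match as a regular expression (it uses re.search), a hedged sketch for messages containing regex metacharacters is to escape the expected text first:

import re

import pytest
from statsmodels.tools.validation import string_like

with pytest.raises(ValueError,
                   match=re.escape("value must be one of: 'apple'")):
    string_like("date", "value", options=("apple",))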
Example #9
    def __init__(
        self,
        endog,
        *,
        period: Optional[int] = None,
        deseasonalize: bool = True,
        use_test: bool = True,
        method: str = "auto",
        difference: bool = False
    ) -> None:
        self._y = array_like(endog, "endog", ndim=1)
        if isinstance(endog, pd.DataFrame):
            self.endog_orig = endog.iloc[:, 0]
        else:
            self.endog_orig = endog
        self._period = int_like(period, "period", optional=True)
        self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
        self._use_test = (
            bool_like(use_test, "use_test") and self._deseasonalize
        )
        self._diff = bool_like(difference, "difference")
        self._method = string_like(
            method,
            "method",
            options=("auto", "additive", "multiplicative", "mul", "add"),
        )
        if self._period is None and self._deseasonalize:
            idx = getattr(endog, "index", None)
            pfreq = None
            if idx is not None:
                pfreq = getattr(idx, "freq", None)
                if pfreq is None:
                    pfreq = getattr(idx, "inferred_freq", None)
            if pfreq is not None:
                self._period = freq_to_period(pfreq)
            else:
                raise ValueError(
                    "You must specify a period or endog must be a "
                    "pandas object with a DatetimeIndex with "
                    "a freq not set to None"
                )

        self._has_seasonality = self._deseasonalize
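A sketch of the period inference used above, assuming monthly data maps to a period of 12 via freq_to_period:

import pandas as pd
from statsmodels.tsa.tsatools import freq_to_period

# A DatetimeIndex created with an explicit freq carries idx.freq, so a
# model like the one above can infer the period when period=None.
idx = pd.date_range("2000-01-31", periods=36, freq="M")
print(freq_to_period(idx.freq))  # 12 for monthly data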
Example #10
def lagmat2ds(x,
              maxlag0,
              maxlagex=None,
              dropex=0,
              trim="forward",
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables.

    Parameters
    ----------
    x : array_like
        Data, 2d. Observations in rows and variables in columns.
    maxlag0 : int
        For the first variable, all lags from zero to maxlag0 are included.
    maxlagex : {None, int}
        The max lag for all other variables; all lags from zero to maxlagex
        are included. If None, maxlag0 is used.
    dropex : int
        Exclude the first dropex lags from the other variables. For all
        variables except the first, lags from dropex to maxlagex are included.
    trim : str
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none' : no trimming of observations.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    ndarray
        The array with lagged observations, columns ordered by variable.

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience.
    """
    maxlag0 = int_like(maxlag0, "maxlag0")
    maxlagex = int_like(maxlagex, "maxlagex", optional=True)
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError("Only supports 1 and 2-dimensional data.")

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0],
                      maxlag,
                      trim=trim,
                      original="in",
                      use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k],
                          maxlag,
                          trim=trim,
                          original="in",
                          use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original="in")[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
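A minimal usage sketch for lagmat2ds (shape only; values depend on trim):

import numpy as np
from statsmodels.tsa.tsatools import lagmat2ds

x = np.arange(12.0).reshape(6, 2)
out = lagmat2ds(x, maxlag0=2)
# Lags 0..2 of the first variable followed by lags 0..2 of the second,
# giving 6 columns with the default trim="forward".
print(out.shape)  # (6, 6)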
Example #11
def add_trend(x, trend="c", prepend=False, has_constant="skip"):
    """
    Add a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        The trend to add.

        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant column already
        exists in x. 'raise' will raise an error. 'add' will add a column of
        1s. 'skip' will return the data without change. 'skip' is the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        pandas Series or DataFrame, then the trend column names are 'const',
        'trend' and 'trend_squared'.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is currently
    no checking for an existing trend.
    """
    prepend = bool_like(prepend, "prepend")
    trend = string_like(trend, "trend", options=("n", "c", "t", "ct", "ctt"))
    has_constant = string_like(has_constant,
                               "has_constant",
                               options=("raise", "add", "skip"))

    # TODO: could be generalized for trend of arbitrary order
    columns = ["const", "trend", "trend_squared"]
    if trend == "n":
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    if _is_recarray(x):
        from statsmodels.tools.sm_exceptions import recarray_exception

        raise NotImplementedError(recarray_exception)

    is_pandas = _is_using_pandas(x, None)
    if is_pandas:
        if isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    return False

            col_const = x.apply(safe_is_const, 0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == "raise":
                if x.ndim == 1:
                    base_err = "x is constant."
                else:
                    columns = np.arange(x.shape[1])[col_const]
                    if isinstance(x, pd.DataFrame):
                        columns = x.columns
                    const_cols = ", ".join([str(c) for c in columns])
                    base_err = (
                        "x contains one or more constant columns. Column(s) "
                        f"{const_cols} are constant.")
                msg = f"{base_err} Adding a constant with trend='{trend}' is not allowed."
                raise ValueError(msg)
            elif has_constant == "skip":
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    return x
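A short usage sketch of add_trend, including the has_constant guard:

import numpy as np
from statsmodels.tsa.tsatools import add_trend

x = np.random.standard_normal((5, 2))
xt = add_trend(x, trend="ct")
print(xt.shape)  # (5, 4): the two original columns plus const and trend

# When a constant column is already present, has_constant decides the outcome.
xc = np.column_stack([np.ones(5), np.arange(5.0)])
add_trend(xc, trend="c", has_constant="skip")  # returned unchanged
try:
    add_trend(xc, trend="c", has_constant="raise")
except ValueError as exc:
    print(exc)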
Example #12
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame,
                                                           DataFrame]:
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1), ] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
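A pandas sketch of the use_pandas path above; the column names follow the '.L.' convention built in the code:

import numpy as np
import pandas as pd
from statsmodels.tsa.tsatools import lagmat

s = pd.Series(np.arange(5.0), name="y")
lags = lagmat(s, maxlag=2, trim="both", use_pandas=True)
print(list(lags.columns))  # ['y.L.1', 'y.L.2']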
Example #13
def lagmat2ds(x,
              maxlag0,
              maxlagex=None,
              dropex=0,
              trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for the first variable, all lags from zero to maxlag0 are included
    maxlagex : None or int
        max lag for all other variables; all lags from zero to maxlagex are
        included (if None, maxlag0 is used)
    dropex : int (default is 0)
        exclude the first dropex lags from the other variables; for all
        variables except the first, lags from dropex to maxlagex are included
    trim : str
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for convenience
    """
    maxlag0 = int_like(maxlag0, 'maxlag0')
    maxlagex = int_like(maxlagex, 'maxlagex', optional=True)
    trim = string_like(trim,
                       'trim',
                       optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0],
                      maxlag,
                      trim=trim,
                      original='in',
                      use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k],
                          maxlag,
                          trim=trim,
                          original='in',
                          use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
Example #14
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (original array, lagged values). The original
                  array is truncated to have the same number of rows as
                  the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.
    """
    maxlag = int_like(maxlag, 'maxlag')
    use_pandas = bool_like(use_pandas, 'use_pandas')
    trim = string_like(trim,
                       'trim',
                       optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    original = string_like(original, 'original', options=('ex', 'sep', 'in'))

    # TODO:  allow list of lags additional to maxlag
    orig = x
    x = array_like(x, 'x', ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
Example #15
    def engle_granger_coef(self,
                           y0,
                           y1,
                           trend='c',
                           method='aeg',
                           maxlag=None,
                           autolag='aic',
                           normalize=True,
                           debug=True):
        """
        Engle-Granger Cointegration Coefficient Calculations.

        This equation takes a linear combination of two I(1) time series to
        create an I(0), or stationary, time series.

        This is useful if the two series have a similar stochastic long-term
        trend, as it eliminates the common trend and allows you to work with
        a stationary series.

        Parameters
        ----------
        y0 : array_like
            The first element in cointegrated system. Must be 1-d.
        y1 : array_like
            The remaining elements in cointegrated system.
        trend : str {'c', 'ct', 'ctt', 'nc'}
            The trend term included in regression for cointegrating equation.

            * 'c' : constant.
            * 'ct' : constant and linear trend.
            * 'ctt' : constant and linear and quadratic trend.
            * 'nc' : no constant, no trend.

        method : {'aeg'}
            Only 'aeg' (augmented Engle-Granger) is available.
        maxlag : None or int
            Argument for `adfuller`, largest or given number of lags.
        autolag : str
            Argument for `adfuller`, lag selection criterion.

            * If None, then maxlag lags are used without lag search.
            * If 'AIC' (default) or 'BIC', then the number of lags is chosen
              to minimize the corresponding information criterion.
            * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
              lag until the t-statistic on the last lag length is significant
              using a 5%-sized test.
        normalize : bool, optional
            As there are infinitely many scalar multiples that will produce
            the factor, this normalizes the first entry to be 1.
        debug : bool, optional
            Checks if the series has a possible cointegration factor using the
            Engle-Granger cointegration test.

        Returns
        -------
        coefs : array
            A vector that will create an I(0) time series if a combination
            exists.

        Notes
        -----
        The series should be checked independently for their integration
        order. The series must be I(1) to get consistent results. You can
        check this by using the int_order function.

        References
        ----------
        .. [1] MacKinnon, J.G. 1994. "Approximate Asymptotic Distribution
           Functions for Unit-Root and Cointegration Tests." Journal of
           Business & Economics Statistics, 12.2, 167-76.
        .. [2] MacKinnon, J.G. 2010. "Critical Values for Cointegration
           Tests." Queen's University, Dept of Economics Working Papers 1227.
           http://ideas.repec.org/p/qed/wpaper/1227.html
        .. [3] Hamilton, J. D. (1994). Time Series Analysis
           (Vol. 2, pp. 690-696). Princeton, NJ: Princeton University Press.
        """
        if debug:
            coint_t, pvalue, crit_value = coint(y0, y1, trend, method, maxlag,
                                                autolag)
            if pvalue >= .10:
                print('The null hypothesis cannot be rejected')

        trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))
        nobs, k_vars = y1.shape

        y1 = add_trend(y1, trend=trend, prepend=False)

        eg_model = OLS(y0, y1).fit()
        coefs = eg_model.params[0:k_vars]

        if normalize:
            coefs = coefs / coefs[0]

        return coefs
Example #16
def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality; if `dist='exp'`, 'table' is
        always used. 'approx' uses the approximation formula of Dalal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of 'table' is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[0] and c[1] are
    coefficients shared across all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see  lilliefors_critical_value_simulation.py in
    the test directory.
    """
    pvalmethod = string_like(pvalmethod,
                             "pvalmethod",
                             options=("approx", "table"))
    x = np.asarray(x)
    nobs = len(x)

    if dist == 'norm':
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {0} requires at least {1} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # check pval is in desired range
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval
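A usage sketch of the function above on simulated data (in statsmodels this routine is exposed publicly as lilliefors; calling kstest_fit directly here relies only on the definition shown above):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(100)
stat, pval = kstest_fit(x, dist="norm", pvalmethod="table")
# A large p-value is consistent with the normality null.
print(round(stat, 3), pval > 0.05)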
Example #17
    def __init__(self,
                 endog,
                 k_regimes,
                 trend='c',
                 exog=None,
                 order=0,
                 exog_tvtp=None,
                 switching_trend=True,
                 switching_exog=True,
                 switching_variance=False,
                 dates=None,
                 freq=None,
                 missing='none'):

        # Properties
        from statsmodels.tools.validation import string_like
        self.trend = string_like(trend, "trend", options=("n", "c", "ct", "t"))
        self.switching_trend = switching_trend
        self.switching_exog = switching_exog
        self.switching_variance = switching_variance

        # Exogenous data
        self.k_exog, exog = markov_switching.prepare_exog(exog)

        # Trend
        nobs = len(endog)
        self.k_trend = 0
        self._k_exog = self.k_exog
        trend_exog = None
        if trend == 'c':
            trend_exog = np.ones((nobs, 1))
            self.k_trend = 1
        elif trend == 't':
            trend_exog = (np.arange(nobs) + 1)[:, np.newaxis]
            self.k_trend = 1
        elif trend == 'ct':
            trend_exog = np.c_[np.ones((nobs, 1)),
                               (np.arange(nobs) + 1)[:, np.newaxis]]
            self.k_trend = 2
        if trend_exog is not None:
            exog = trend_exog if exog is None else np.c_[trend_exog, exog]
            self._k_exog += self.k_trend

        # Initialize the base model
        super(MarkovRegression, self).__init__(endog,
                                               k_regimes,
                                               order=order,
                                               exog_tvtp=exog_tvtp,
                                               exog=exog,
                                               dates=dates,
                                               freq=freq,
                                               missing=missing)

        # Switching options
        if self.switching_trend is True or self.switching_trend is False:
            self.switching_trend = [self.switching_trend] * self.k_trend
        elif not len(self.switching_trend) == self.k_trend:
            raise ValueError('Invalid iterable passed to `switching_trend`.')
        if self.switching_exog is True or self.switching_exog is False:
            self.switching_exog = [self.switching_exog] * self.k_exog
        elif not len(self.switching_exog) == self.k_exog:
            raise ValueError('Invalid iterable passed to `switching_exog`.')

        self.switching_coeffs = (
            np.r_[self.switching_trend,
                  self.switching_exog].astype(bool).tolist())

        # Parameters
        self.parameters['exog'] = self.switching_coeffs
        self.parameters['variance'] = [1] if self.switching_variance else [0]
Example #18
    def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
                 initialization_method='estimated', initial_level=None,
                 initial_trend=None, initial_seasonal=None, bounds=None,
                 concentrate_scale=True, dates=None, freq=None,
                 missing='none'):
        # Model definition
        self.trend = bool_like(trend, 'trend')
        self.damped_trend = bool_like(damped_trend, 'damped_trend')
        self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
        self.seasonal = self.seasonal_periods is not None
        self.initialization_method = string_like(
            initialization_method, 'initialization_method').lower()
        self.concentrate_scale = bool_like(concentrate_scale,
                                           'concentrate_scale')

        # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
        # TODO: add `bounds_method` argument to choose between "usual" and
        # "admissible" as in Hyndman et al. (2008)
        self.bounds = bounds
        if self.bounds is None:
            self.bounds = [(1e-4, 1-1e-4)] * 3 + [(0.8, 0.98)]

        # Validation
        if self.seasonal_periods == 1:
            raise ValueError('Cannot have a seasonal period of 1.')

        if self.seasonal and self.seasonal_periods is None:
            raise NotImplementedError('Unable to detect season automatically;'
                                      ' please specify `seasonal_periods`.')

        if self.initialization_method not in ['concentrated', 'estimated',
                                              'simple', 'heuristic', 'known']:
            raise ValueError('Invalid initialization method "%s".'
                             % initialization_method)

        if self.initialization_method == 'known':
            if initial_level is None:
                raise ValueError('`initial_level` argument must be provided'
                                 ' when initialization method is set to'
                                 ' "known".')
            if initial_trend is None and self.trend:
                raise ValueError('`initial_trend` argument must be provided'
                                 ' for models with a trend component when'
                                 ' initialization method is set to "known".')
            if initial_seasonal is None and self.seasonal:
                raise ValueError('`initial_seasonal` argument must be provided'
                                 ' for models with a seasonal component when'
                                 ' initialization method is set to "known".')

        # Initialize the state space model
        if not self.seasonal or self.seasonal_periods is None:
            self._seasonal_periods = 0
        else:
            self._seasonal_periods = self.seasonal_periods

        k_states = 2 + int(self.trend) + self._seasonal_periods
        k_posdef = 1

        init = ss_init.Initialization(k_states, 'known',
                                      constant=[0] * k_states)
        super(ExponentialSmoothing, self).__init__(
            endog, k_states=k_states, k_posdef=k_posdef,
            initialization=init, dates=dates, freq=freq, missing=missing)

        # Concentrate the scale out of the likelihood function
        if self.concentrate_scale:
            self.ssm.filter_concentrated = True

        # Setup fixed elements of the system matrices
        # Observation error
        self.ssm['design', 0, 0] = 1.
        self.ssm['selection', 0, 0] = 1.
        self.ssm['state_cov', 0, 0] = 1.

        # Level
        self.ssm['design', 0, 1] = 1.
        self.ssm['transition', 1, 1] = 1.

        # Trend
        if self.trend:
            self.ssm['transition', 1:3, 2] = 1.

        # Seasonal
        if self.seasonal:
            k = 2 + int(self.trend)
            self.ssm['design', 0, k] = 1.
            self.ssm['transition', k, -1] = 1.
            self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
                np.eye(self.seasonal_periods - 1))

        # Initialization of the states
        if self.initialization_method != 'known':
            msg = ('Cannot give `%%s` argument when initialization is "%s"'
                   % initialization_method)
            if initial_level is not None:
                raise ValueError(msg % 'initial_level')
            if initial_trend is not None:
                raise ValueError(msg % 'initial_trend')
            if initial_seasonal is not None:
                raise ValueError(msg % 'initial_seasonal')

        if self.initialization_method == 'simple':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_simple(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'heuristic':
            initial_level, initial_trend, initial_seasonal = (
                es_init._initialization_heuristic(
                    self.endog[:, 0], trend='add' if self.trend else None,
                    seasonal='add' if self.seasonal else None,
                    seasonal_periods=self.seasonal_periods))
        elif self.initialization_method == 'known':
            initial_level = float_like(initial_level, 'initial_level')
            if self.trend:
                initial_trend = float_like(initial_trend, 'initial_trend')
            if self.seasonal:
                initial_seasonal = array_like(initial_seasonal,
                                              'initial_seasonal')

                if len(initial_seasonal) == self.seasonal_periods - 1:
                    initial_seasonal = np.r_[initial_seasonal,
                                             0 - np.sum(initial_seasonal)]

                if len(initial_seasonal) != self.seasonal_periods:
                    raise ValueError(
                        'Invalid length of initial seasonal values. Must be'
                        ' one of s or s-1, where s is the number of seasonal'
                        ' periods.')

        # Note that the simple and heuristic methods of computing initial
        # seasonal factors return estimated seasonal factors associated with
        # the first t = 1, 2, ..., `n_seasons` observations. To use these as
        # the initial state, we lag them by `n_seasons`. This yields, for
        # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
        # As described above, the state vector in this model should have
        # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
        # reverse the order of the computed initial seasonal factors from
        # these methods.
        methods = ['simple', 'heuristic']
        if (self.initialization_method in methods
                and initial_seasonal is not None):
            initial_seasonal = initial_seasonal[::-1]

        self._initial_level = initial_level
        self._initial_trend = initial_trend
        self._initial_seasonal = initial_seasonal
        self._initial_state = None

        # Initialize now if possible (if we have a damped trend, then
        # initialization will depend on the phi parameter, and so has to be
        # done at each `update`)
        methods = ['simple', 'heuristic', 'known']
        if not self.damped_trend and self.initialization_method in methods:
            self._initialize_constant_statespace(initial_level, initial_trend,
                                                 initial_seasonal)

        # Save keys for kwarg initialization
        self._init_keys += ['trend', 'damped_trend', 'seasonal',
                            'initialization_method', 'initial_level',
                            'initial_trend', 'initial_seasonal', 'bounds',
                            'concentrate_scale', 'dates', 'freq', 'missing']
Example #19
def adfuller(
    x,
    maxlag=None,
    regression="c",
    autolag="AIC",
    store=False,
    regresults=False,
):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root in a
    univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}.
    regression : {"c","ct","ctt","nc"}
        Constant and trend order to include in regression.

        * "c" : constant only (default).
        * "ct" : constant and trend.
        * "ctt" : constant, and linear and quadratic trend.
        * "nc" : no constant, no trend.

    autolag : {"AIC", "BIC", "t-stat", None}
        Method to use when automatically determining the lag length among the
        values 0, 1, ..., maxlag.

        * If "AIC" (default) or "BIC", then the number of lags is chosen
          to minimize the corresponding information criterion.
        * "t-stat" based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
        * If None, then the number of included lags is set to maxlag.
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False.
    regresults : bool, optional
        If True, the full regression results are returned. Default is False.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used.
    nobs : int
        The number of observations used for the ADF regression and calculation
        of the critical values.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a unit
    root, with the alternative that there is no unit root. If the pvalue is
    above a critical size, then we cannot reject that there is a unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is close
    to significant, then the critical values should be used to judge whether
    to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Greene.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [2] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [3] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution functions for
        unit-root and cointegration tests."  `Journal of Business and Economic
        Statistics` 12, 167-76.

    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."  Queen's
        University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
    """
    x = array_like(x, "x")
    maxlag = int_like(maxlag, "maxlag", optional=True)
    regression = string_like(regression,
                             "regression",
                             options=("c", "ct", "ctt", "nc"))
    autolag = string_like(autolag,
                          "autolag",
                          optional=True,
                          options=("aic", "bic", "t-stat"))
    store = bool_like(store, "store")
    regresults = bool_like(regresults, "regresults")

    if regresults:
        store = True

    trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    nobs = x.shape[0]

    ntrend = len(regression) if regression != "nc" else 0
    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
        # -1 for the diff
        maxlag = min(nobs // 2 - ntrend - 1, maxlag)
        if maxlag < 0:
            raise ValueError("sample size is too short to use selected "
                             "regression component")
    elif maxlag > nobs // 2 - ntrend - 1:
        raise ValueError("maxlag must be no larger than (nobs/2 - 1 - ntrend)"
                         " where ntrend is the number of included "
                         "deterministic regressors")
    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in")
    nobs = xdall.shape[0]

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        from statsmodels.stats.diagnostic import ResultsStore

        resstore = ResultsStore()
    if autolag:
        if regression != "nc":
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1
        # 1 for level
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better

        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, maxlag,
                                       autolag)
        else:
            icbest, bestlag, alres = _autolag(
                OLS,
                xdshort,
                fullRHS,
                startlag,
                maxlag,
                autolag,
                regresults=regresults,
            )
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in")
        nobs = xdall.shape[0]
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != "nc":
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    #    adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are integrated
    # for orders higher than I(1), ie., not ADF but cointegration tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2],
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = "Augmented Dickey-Fuller Test Results"
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
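A short usage sketch of adfuller on a simulated random walk, which has a unit root by construction:

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(0)
y = np.cumsum(rng.standard_normal(250))  # random walk
stat, pval, usedlag, nobs, crit, icbest = adfuller(y, regression="c",
                                                   autolag="AIC")
print(pval > 0.05)  # typically True: cannot reject the unit-root null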
Example #20
    def fit(
        self,
        method="inv",
        cov_type="nonrobust",
        cov_kwds=None,
        reset=None,
        use_t=False,
        params_only=False,
    ):
        """
        Estimate model parameters.

        Parameters
        ----------
        method : {'inv', 'lstsq', 'pinv'}
            Method to use when computing the model parameters.

            * 'inv' - use moving windows inner-products and matrix inversion.
              This method is the fastest, but may be less accurate than the
              other methods.
            * 'lstsq' - Use numpy.linalg.lstsq
            * 'pinv' - Use numpy.linalg.pinv. This method matches the default
              estimator in non-moving regression estimators.
        cov_type : {'nonrobust', 'HCCM', 'HC0'}
            Covariance estimator:

            * nonrobust - The classic OLS covariance estimator
            * HCCM, HC0 - White heteroskedasticity robust covariance
        cov_kwds : dict
            Unused
        reset : int, optional
            Interval to recompute the moving window inner products used to
            estimate the model parameters. Smaller values improve accuracy,
            although in practice this setting rarely needs to be changed.
        use_t : bool, optional
            Flag indicating whether to use the Student's t distribution when
            computing p-values.
        params_only : bool, optional
            Flag indicating that only parameters should be computed. Avoids
            calculating all other statistics or performing inference.

        Returns
        -------
        RollingRegressionResults
            Estimation results where all pre-sample values are nan-filled.
        """
        method = string_like(method,
                             "method",
                             options=("inv", "lstsq", "pinv"))
        reset = int_like(reset, "reset", optional=True)
        reset = self._y.shape[0] if reset is None else reset
        if reset < 1:
            raise ValueError("reset must be a positive integer")

        nobs, k = self._x.shape
        store = RollingStore(
            params=np.full((nobs, k), np.nan),
            ssr=np.full(nobs, np.nan),
            llf=np.full(nobs, np.nan),
            nobs=np.zeros(nobs, dtype=int),
            s2=np.full(nobs, np.nan),
            xpxi=np.full((nobs, k, k), np.nan),
            xeex=np.full((nobs, k, k), np.nan),
            centered_tss=np.full(nobs, np.nan),
            uncentered_tss=np.full(nobs, np.nan),
        )
        w = self._window
        first = self._min_nobs if self._expanding else w
        xpx, xpy, nobs = self._reset(first)
        if not (self._has_nan[first - 1] and self._skip_missing):
            self._fit_single(first, xpx, xpy, nobs, store, params_only, method)
        wx, wy = self._wx, self._wy
        for i in range(first + 1, self._x.shape[0] + 1):
            if self._has_nan[i - 1] and self._skip_missing:
                continue
            if i % reset == 0:
                xpx, xpy, nobs = self._reset(i)
            else:
                # Check i > w first so the index below cannot go negative.
                if i > w and not self._is_nan[i - w - 1]:
                    remove_x = wx[i - w - 1:i - w]
                    xpx -= remove_x.T @ remove_x
                    xpy -= remove_x.T @ wy[i - w - 1:i - w]
                    nobs -= 1
                if not self._is_nan[i - 1]:
                    add_x = wx[i - 1:i]
                    xpx += add_x.T @ add_x
                    xpy += add_x.T @ wy[i - 1:i]
                    nobs += 1

            self._fit_single(i, xpx, xpy, nobs, store, params_only, method)

        return RollingRegressionResults(self, store, self.k_constant, use_t,
                                        cov_type)
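
A short sketch showing how this fit method is typically reached; RollingOLS is importable from statsmodels.regression.rolling, and the data below is synthetic:

import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

rng = np.random.default_rng(0)
x = rng.standard_normal((200, 2))
y = x @ np.array([1.0, -0.5]) + rng.standard_normal(200)

mod = RollingOLS(y, sm.add_constant(x), window=60)
res = mod.fit(method="inv")  # 'inv' is the fastest of the three solvers
print(res.params.shape)      # (200, 3); the first 59 rows are NaN-filled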
Example #21
0
    def __init__(self, data, ncomp=None, standardize=True, demean=True,
                 normalize=True, gls=False, weights=None, method='svd',
                 missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
                 max_em_iter=100):
        self._index = None
        self._columns = []
        if isinstance(data, pd.DataFrame):
            self._index = data.index
            self._columns = data.columns

        self.data = array_like(data, "data", ndim=2)
        # Store inputs
        self._gls = bool_like(gls, "gls")
        self._normalize = bool_like(normalize, "normalize")
        self._tol = float_like(tol, "tol")
        if not 0 < self._tol < 1:
            raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "max_iter")
        self._max_em_iter = int_like(max_em_iter, "max_em_iter")
        self._tol_em = float_like(tol_em, "tol_em")

        # Prepare data
        self._standardize = bool_like(standardize, "standardize")
        self._demean = bool_like(demean, "demean")

        self._nobs, self._nvar = self.data.shape
        weights = array_like(weights, "weights", maxdim=1, optional=True)
        if weights is None:
            weights = np.ones(self._nvar)
        else:
            weights = np.array(weights).flatten()
            if weights.shape[0] != self._nvar:
                raise ValueError('weights should have nvar elements')
            weights = weights / np.sqrt((weights ** 2.0).mean())
        self.weights = weights

        # Check ncomp against maximum
        min_dim = min(self._nobs, self._nvar)
        self._ncomp = min_dim if ncomp is None else ncomp
        if self._ncomp > min_dim:
            import warnings

            warn = 'The requested number of components is more than can be ' \
                   'computed from data. The maximum number of components is ' \
                   'the minimum of the number of observations or variables'
            warnings.warn(warn, ValueWarning)
            self._ncomp = min_dim

        self._method = method
        # Workaround to avoid instance methods in __dict__
        if self._method not in ('eig', 'svd', 'nipals'):
            raise ValueError('method {0} is not known.'.format(method))

        self.rows = np.arange(self._nobs)
        self.cols = np.arange(self._nvar)
        # Handle missing
        self._missing = string_like(missing, "missing", optional=True)
        self._adjusted_data = self.data
        self._adjust_missing()

        # Update size
        self._nobs, self._nvar = self._adjusted_data.shape
        if self._ncomp == np.min(self.data.shape):
            self._ncomp = np.min(self._adjusted_data.shape)
        elif self._ncomp > np.min(self._adjusted_data.shape):
            raise ValueError('When adjusting for missing values, user '
                             'provided ncomp must be no larger than the '
                             'smallest dimension of the '
                             'missing-value-adjusted data size.')

        # Attributes and internal values
        self._tss = 0.0
        self._ess = None
        self.transformed_data = None
        self._mu = None
        self._sigma = None
        self._ess_indiv = None
        self._tss_indiv = None
        self.scores = self.factors = None
        self.loadings = None
        self.coeff = None
        self.eigenvals = None
        self.eigenvecs = None
        self.projection = None
        self.rsquare = None
        self.ic = None

        # Prepare data
        self.transformed_data = self._prepare_data()
        # Perform the PCA
        self._pca()
        if gls:
            self._compute_gls_weights()
            self.transformed_data = self._prepare_data()
            self._pca()

        # Final calculations
        self._compute_rsquare_and_ic()
        if self._index is not None:
            self._to_pandas()
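
A minimal sketch of constructing this class, which is importable as statsmodels.multivariate.pca.PCA (the random data is illustrative):

import numpy as np
from statsmodels.multivariate.pca import PCA

rng = np.random.default_rng(0)
data = rng.standard_normal((100, 5))

pc = PCA(data, ncomp=2, standardize=True, demean=True)
print(pc.factors.shape)   # (100, 2) estimated factors (a.k.a. scores)
print(pc.loadings.shape)  # (5, 2) loadings of each variable on the factors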
Example #22
0
def add_trend(x, trend="c", prepend=False, has_constant='skip'):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}

        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of x.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant already
        exists in x. 'raise' will raise an error. 'add' will duplicate a
        constant. 'skip' will return the data without change. 'skip' is the
        default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        recarray or pandas Series or DataFrame, then the trend column names
        are 'const', 'trend' and 'trend_squared'.

    Notes
    -----
    The added trend columns appear in the order const, trend, trend_squared
    (lowest order first) whenever applicable. There is currently no checking
    for an existing trend.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.
    """
    prepend = bool_like(prepend, 'prepend')
    trend = string_like(trend, 'trend', options=('n', 'c', 't', 'ct', 'ctt'))
    has_constant = string_like(has_constant,
                               'has_constant',
                               options=('raise', 'add', 'skip'))

    # TODO: could be generalized for trend of arbitrary order
    columns = ['const', 'trend', 'trend_squared']
    if trend == 'n':
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    is_recarray = _is_recarray(x)
    is_pandas = _is_using_pandas(x, None) or is_recarray
    if is_pandas or is_recarray:
        if is_recarray:
            descr = x.dtype.descr
            x = pd.DataFrame.from_records(x)
        elif isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas or is_recarray:
            # Mixed type protection
            def safe_is_const(s):
                # np.ptp can fail on mixed/object dtype columns; treat those
                # as non-constant.
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    return False

            col_const = x.apply(safe_is_const, 0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == 'raise':
                msg = "x contains a constant. Adding a constant with " \
                      "trend='{0}' is not allowed.".format(trend)
                raise ValueError(msg)
            elif has_constant == 'skip':
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_recarray or is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    if is_recarray:
        x = x.to_records(index=False)
        new_descr = x.dtype.descr
        extra_col = len(new_descr) - len(descr)
        if prepend:
            descr = new_descr[:extra_col] + descr
        else:
            descr = descr + new_descr[-extra_col:]

        x = x.astype(np.dtype(descr))

    return x
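
A brief usage sketch; add_trend is importable from statsmodels.tsa.tsatools, and the toy frame is made up:

import numpy as np
import pandas as pd
from statsmodels.tsa.tsatools import add_trend

df = pd.DataFrame({"y": np.arange(4.0)})
out = add_trend(df, trend="ct", prepend=True)
print(out.columns.tolist())  # ['const', 'trend', 'y']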
Example #23
0
    def dynamic_coefs(self,
                      y0,
                      y1,
                      n_lags=None,
                      trend='c',
                      normalize=True,
                      reverse=False):
        """
        Dynamic Cointegration Coefficient Calculation.

        This method takes a linear combination of multiple I(1) time
        series to create an I(0), or stationary, time series.

        This is useful when the series share a similar stochastic long-term
        trend, as the combination eliminates that trend and leaves a
        stationary relationship.

        Unlike Engle-Granger, this method uses dynamic regression - an
        equal number of lags and leads of the differences of the series -
        to produce a more accurate parameter vector. It builds the lag-lead
        matrices for the given lag values, or searches for the best number
        of lags using the BIC. Once the optimal lag is found, the
        coefficients are computed and returned. The selected lag is stored
        in the max_val attribute and the fitted regression in the model
        attribute.

        Parameters
        ----------
        y0 : array_like
            The first element of the cointegrated system. Must be 1-d.
        y1 : array_like
            The remaining elements of the cointegrated system.
        n_lags : {int, array_like, None}
            Determines which lag lengths are searched when selecting the
            best vector.

            * int : the calculation is done for only that lag.
            * array : an array of two integers, where the first value is
              the start of the search and the second is the end.
            * None : the function searches from 2 to the ceiling of the
              cube root of the number of observations divided by two, plus
              two, so that at least one value is searched;
              i.e., last_lag = ceil(n_obs**(1/3) / 2) + 2.
        trend : str {'c', 'nc', 'ct', 'ctt'}
            The trend term included in the regression for the cointegrating
            equation.

            * 'c' : constant.
            * 'nc' : no constant.
            * 'ct' : constant and linear trend.
            * 'ctt' : constant plus linear and quadratic trend.
        normalize : bool
            If True, the first entry of the parameter vector is normalized
            to one and the remaining entries are divided by it. This is
            valid because any scalar multiple of a cointegrating vector is
            still a cointegrating vector.
        reverse : bool
            The series must be ordered from the latest observations to the
            earliest in order to calculate the differences. Set this flag
            to reverse the ordering of your data points.

        Returns
        -------
        coefs : array
            A vector that produces an I(0) time series if such a
            combination exists.

        Notes
        -----
        The data must go from the latest observations to the earliest. If
        not, the coefficient vector will have the opposite sign.

        The series should be checked independently for their order of
        integration. They must be I(1) for the results to be consistent;
        this can be checked with the int_order function.

        References
        ----------
        .. [1] Stock, J. H., & Watson, M. W. (1993). A simple estimator of
           cointegrating vectors in higher order integrated systems.
           Econometrica: Journal of the Econometric Society, 783-820.
        .. [2] Hamilton, J. D. (1994). Time series analysis
           (Vol. 2, pp. 690-696). Princeton, NJ: Princeton University Press.
        """
        self.bics = []
        self.max_val = []
        self.model = ''
        self.coefs = []

        trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))
        y1 = add_trend(y1, trend=trend, prepend=True)

        if _is_using_pandas(y0, y1):
            y1 = y1.reset_index(drop=True)
            columns = list(y1.columns)
        else:
            # Only pandas and NumPy inputs are supported; wrap NumPy arrays
            # in DataFrames so the pandas operations below work.
            columns = [f'Var_{i}' for i in range(y1.shape[1])]
            y0, y1 = pd.DataFrame(y0), pd.DataFrame(y1, columns=columns)

        if reverse:
            # Reset the index after reversing so that the label-based drops
            # below still align with positional order.
            y0 = y0[::-1].reset_index(drop=True)
            y1 = y1[::-1].reset_index(drop=True)

        n_obs, k = y1.shape

        if n_lags is None:
            n_obs, k = y1.shape
            dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))
            for lag in range(2, int(np.ceil(n_obs**(1 / 3) / 2) + 2)):

                df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward'))
                cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
                df1 = df1.rename(columns=cols)

                df2 = pd.DataFrame(lagmat(dta, lag, trim='forward'))

                lags_leads = pd.concat([df1, df2], axis=1, join='outer')
                lags_leads = lags_leads.drop(list(range(0, lag)))
                lags_leads = lags_leads.reset_index(drop=True)

                lags_leads = lags_leads.drop(
                    list(range(len(lags_leads) - lag, len(lags_leads))))

                lags_leads = lags_leads.reset_index(drop=True)
                data_y = y0.drop(list(range(0, lag))).reset_index(drop=True)
                data_y = data_y.drop(
                    list(range(len(data_y) - lag - 1, len(data_y))))
                data_y = data_y.reset_index(drop=True)

                self.bics.append([OLS(data_y, lags_leads).fit().bic, lag])

            # BIC: smaller is better, so keep the lag with the minimum BIC.
            self.max_val = min(self.bics, key=lambda item: item[0])[1]

        elif isinstance(n_lags, (int, np.integer)):
            self.max_val = int(n_lags)

        elif len(n_lags) == 2:
            start, end = int(n_lags[0]), int(n_lags[1])
            n_obs, k = y1.shape
            dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))

            for lag in range(start, end + 1):
                df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward'))
                cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
                df1 = df1.rename(columns=cols)

                df2 = pd.DataFrame(lagmat(dta, lag, trim='forward'))

                lags_leads = pd.concat([df1, df2], axis=1, join='outer')
                lags_leads = lags_leads.drop(list(range(0, lag)))
                lags_leads = lags_leads.reset_index(drop=True)
                lags_leads = lags_leads.drop(
                    list(range(len(lags_leads) - lag, len(lags_leads))))
                lags_leads = lags_leads.reset_index(drop=True)

                data_y = y0.drop(list(range(0, lag))).reset_index(drop=True)
                data_y = data_y.drop(
                    list(range(len(data_y) - lag - 1, len(data_y))))
                data_y = data_y.reset_index(drop=True)

                self.bics.append([OLS(data_y, lags_leads).fit().bic, lag])

            # BIC: smaller is better, so keep the lag with the minimum BIC.
            self.max_val = min(self.bics, key=lambda item: item[0])[1]

        elif len(n_lags) == 1:
            self.max_val = int(n_lags[0])

        else:
            raise ValueError(
                'Make sure your lags are in one of the required forms.')

        dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))
        # Create a matrix of the lags; this also retains the original
        # matrix, which is why max_val + 1 is used.
        df1 = pd.DataFrame(lagmat(dta, self.max_val + 1, trim='backward'))

        # Rename the columns so the series can be tracked; the unlagged
        # values occupy the final k columns.
        cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
        df1 = df1.rename(columns=cols)

        # Do the same, but these are leads, this does not keep the
        # original matrix, thus max_val
        df2 = pd.DataFrame(lagmat(dta, self.max_val, trim='forward'))

        # The lags and leads introduce missing values at both ends, so
        # concatenate the frames and drop the affected rows.
        lags_leads = pd.concat([df1, df2], axis=1, join='outer')
        lags_leads = lags_leads.drop(list(range(0, self.max_val)))
        lags_leads = lags_leads.reset_index(drop=True)
        lags_leads = lags_leads.drop(
            list(range(len(lags_leads) - self.max_val, len(lags_leads))))
        lags_leads = lags_leads.reset_index(drop=True)

        # We also need to do this for the endog values, we need to
        # drop 1 extra due to a loss from first differencing.
        # This will be at the end of the matrix.
        data_y = y0.drop(list(range(0, self.max_val))).reset_index(drop=True)
        data_y = data_y.drop(
            list(range(len(data_y) - self.max_val - 1, len(data_y))))
        data_y = data_y.reset_index(drop=True)

        self.model = OLS(data_y, lags_leads).fit()

        self.coefs = self.model.params[list(y1.columns)]

        if normalize:
            self.coefs = self.coefs / self.coefs.iloc[0]

        return self.coefs
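
Since dynamic_coefs is a user-written DOLS helper rather than a statsmodels API, a driver might look like the following; the Coint holder class and the simulated cointegrated pair are assumptions for illustration:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
common = np.cumsum(rng.standard_normal(500))  # shared I(1) trend
y1 = pd.DataFrame({"x": common + rng.standard_normal(500)})
y0 = pd.Series(2.0 * common + rng.standard_normal(500), name="y")

model = Coint()  # hypothetical class exposing dynamic_coefs
coefs = model.dynamic_coefs(y0, y1, n_lags=4, trend="c")
print(coefs)     # normalized cointegrating vector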