def __init__(self, endog, trend=None, damped=False, seasonal=None,
             seasonal_periods=None, dates=None, freq=None, missing='none'):
    super(ExponentialSmoothing, self).__init__(
        endog, None, dates, freq, missing=missing)
    self._y = self._data = array_like(endog, 'endog',
                                      contiguous=True, order='C')
    options = ("add", "mul", "additive", "multiplicative")
    trend = string_like(trend, 'trend', options=options, optional=True)
    if trend in ['additive', 'multiplicative']:
        trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
    self.trend = trend
    self.damped = bool_like(damped, 'damped')
    seasonal = string_like(seasonal, 'seasonal', options=options,
                           optional=True)
    if seasonal in ['additive', 'multiplicative']:
        seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
    self.seasonal = seasonal
    self.trending = trend in ['mul', 'add']
    self.seasoning = seasonal in ['mul', 'add']
    if (self.trend == 'mul' or self.seasonal == 'mul') and \
            not np.all(self._data > 0.0):
        raise ValueError('endog must be strictly positive when using '
                         'multiplicative trend or seasonal components.')
    if self.damped and not self.trending:
        raise ValueError('Can only dampen the trend component')
    if self.seasoning:
        self.seasonal_periods = int_like(seasonal_periods,
                                         'seasonal_periods', optional=True)
        if seasonal_periods is None:
            self.seasonal_periods = freq_to_period(self._index_freq)
        if self.seasonal_periods <= 1:
            raise ValueError('seasonal_periods must be larger than 1.')
    else:
        self.seasonal_periods = 0
    self.nobs = len(self.endog)
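
# Usage sketch for the constructor above (hedged, not part of the source):
# fit an additive Holt-Winters model to a synthetic monthly series using the
# public statsmodels API; the data below are made up for illustration.
import numpy as np
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(10 + 0.05 * np.arange(120)
              + np.sin(2 * np.pi * np.arange(120) / 12), index=idx)
res = ExponentialSmoothing(y, trend="add", seasonal="add",
                           seasonal_periods=12).fit()
print(res.forecast(12))  # 12-month-ahead forecasts
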
def __init__(self, endog, exog, window=None, weights=None, min_nobs=None,
             missing='drop'):
    # Call Model.__init__ twice: use const detection in the first pass,
    # but do not drop observations in the second pass
    missing = string_like(missing, 'missing',
                          options=('drop', 'raise', 'skip'))
    temp_msng = 'drop' if missing != 'raise' else 'raise'
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing='none', hasconst=False)
    self.k_constant = k_const
    self.data.const_idx = const_idx
    self._y = array_like(endog, 'endog')
    nobs = self._y.shape[0]
    self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
    window = int_like(window, 'window', optional=True)
    weights = array_like(weights, 'weights', optional=True, shape=(nobs,))
    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x
    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == 'skip'
    min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError('min_nobs must be at least the number of '
                         'regressors in the model and no larger than '
                         'window')
def __init__(self, freq: str, period: str) -> None:
    freq_options: Set[str] = set()
    freq_options.update(
        *[list(val.keys()) for val in self._supported.values()])
    period_options = list(self._supported.keys())
    freq = string_like(freq, "freq", options=tuple(freq_options),
                       lower=False)
    period = string_like(period, "period", options=period_options,
                         lower=False)
    if freq not in self._supported[period]:
        raise ValueError(f"The combination of freq={freq} and "
                         f"period={period} is not supported.")
    super().__init__(freq)
    self._period = period
    self._freq_str = self._freq.freqstr.split("-")[0]
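
# Hedged usage sketch: this freq/period validator appears to belong to
# CalendarSeasonality in statsmodels.tsa.deterministic (an assumption based
# on the signature and the _supported mapping); adjust the import if the
# class lives elsewhere.
import pandas as pd
from statsmodels.tsa.deterministic import CalendarSeasonality

cs = CalendarSeasonality(freq="H", period="D")  # hourly dummies within a day
index = pd.date_range("2020-01-01", periods=48, freq="H")
print(cs.in_sample(index).head())
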
def test_string():
    out = string_like('apple', 'value')
    assert out == 'apple'
    out = string_like('apple', 'value', options=('apple', 'banana', 'cherry'))
    assert out == 'apple'
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(1, 'value')
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(b'4', 'value')
    with pytest.raises(ValueError, match='value must be one of: \'apple\','
                                         ' \'banana\', \'cherry\''):
        string_like('date', 'value', options=('apple', 'banana', 'cherry'))
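
# Side note (hedged, not from the source tests): string_like lower-cases its
# input by default, so mixed-case values validate against lower-case options.
from statsmodels.tools.validation import string_like

assert string_like('Apple', 'value', options=('apple',)) == 'apple'
assert string_like('APPLE', 'value') == 'apple'
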
def __init__(
    self,
    endog,
    exog,
    window=None,
    *,
    weights=None,
    min_nobs=None,
    missing="drop",
    expanding=False
):
    # Call Model.__init__ twice: use const detection in the first pass,
    # but do not drop observations in the second pass
    missing = string_like(
        missing, "missing", options=("drop", "raise", "skip")
    )
    temp_msng = "drop" if missing != "raise" else "raise"
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing="none", hasconst=False)
    self.k_constant = k_const
    self.data.const_idx = const_idx
    self._y = array_like(endog, "endog")
    nobs = self._y.shape[0]
    self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
    window = int_like(window, "window", optional=True)
    weights = array_like(weights, "weights", optional=True, shape=(nobs,))
    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x
    min_nobs = int_like(min_nobs, "min_nobs", optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError(
            "min_nobs must be at least the number of "
            "regressors in the model and no larger than window"
        )
    self._expanding = expanding
    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == "skip"
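
# Usage sketch (hedged): rolling OLS over a 60-observation window via the
# public statsmodels API; the data are synthetic.
import numpy as np
from statsmodels.regression.rolling import RollingOLS
from statsmodels.tools import add_constant

rng = np.random.default_rng(0)
x = add_constant(rng.standard_normal((500, 2)))
y = x @ np.array([1.0, 0.5, -0.2]) + rng.standard_normal(500)
res = RollingOLS(y, x, window=60).fit()
print(res.params[60:65])  # rows are nan-filled until the window fills
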
def test_string():
    out = string_like("apple", "value")
    assert out == "apple"
    out = string_like("apple", "value", options=("apple", "banana", "cherry"))
    assert out == "apple"
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(1, "value")
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(b"4", "value")
    with pytest.raises(
        ValueError,
        match="value must be one of: 'apple'," " 'banana', 'cherry'",
    ):
        string_like("date", "value", options=("apple", "banana", "cherry"))
def test_optional_string():
    out = string_like("apple", "value")
    assert out == "apple"
    out = string_like("apple", "value", options=("apple", "banana", "cherry"))
    assert out == "apple"
    out = string_like(None, "value", optional=True)
    assert out is None
    out = string_like(None, "value", optional=True,
                      options=("apple", "banana", "cherry"))
    assert out is None
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(1, "value", optional=True)
    with pytest.raises(TypeError, match="value must be a string"):
        string_like(b"4", "value", optional=True)
def test_optional_string():
    out = string_like('apple', 'value')
    assert out == 'apple'
    out = string_like('apple', 'value', options=('apple', 'banana', 'cherry'))
    assert out == 'apple'
    out = string_like(None, 'value', optional=True)
    assert out is None
    out = string_like(None, 'value', optional=True,
                      options=('apple', 'banana', 'cherry'))
    assert out is None
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(1, 'value', optional=True)
    with pytest.raises(TypeError, match='value must be a string'):
        string_like(b'4', 'value', optional=True)
def __init__(
    self,
    endog,
    *,
    period: Optional[int] = None,
    deseasonalize: bool = True,
    use_test: bool = True,
    method: str = "auto",
    difference: bool = False
) -> None:
    self._y = array_like(endog, "endog", ndim=1)
    if isinstance(endog, pd.DataFrame):
        self.endog_orig = endog.iloc[:, 0]
    else:
        self.endog_orig = endog
    self._period = int_like(period, "period", optional=True)
    self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
    self._use_test = (
        bool_like(use_test, "use_test") and self._deseasonalize
    )
    self._diff = bool_like(difference, "difference")
    self._method = string_like(
        method,
        "method",
        options=("auto", "additive", "multiplicative", "mul", "add"),
    )
    if self._period is None and self._deseasonalize:
        idx = getattr(endog, "index", None)
        pfreq = None
        if idx is not None:
            pfreq = getattr(idx, "freq", None)
            if pfreq is None:
                pfreq = getattr(idx, "inferred_freq", None)
        if pfreq is not None:
            self._period = freq_to_period(pfreq)
        else:
            raise ValueError(
                "You must specify a period or endog must be a "
                "pandas object with a DatetimeIndex with "
                "a freq not set to None"
            )
    self._has_seasonality = self._deseasonalize
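
# Usage sketch (hedged): fit the Theta model to a synthetic monthly series
# and forecast, assuming the public statsmodels API.
import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel

idx = pd.date_range("2000-01-31", periods=96, freq="M")
y = pd.Series(50 + 0.3 * np.arange(96)
              + 5 * np.sin(2 * np.pi * np.arange(96) / 12), index=idx)
res = ThetaModel(y, period=12).fit()
print(res.forecast(6))
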
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim="forward",
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables.

    Parameters
    ----------
    x : array_like
        Data, 2d. Observations in rows and variables in columns.
    maxlag0 : int
        The maximum lag for the first variable; all lags from zero to
        maxlag0 are included.
    maxlagex : {None, int}
        The maximum lag for all other variables; all lags from zero to
        maxlagex are included. If None, maxlag0 is used.
    dropex : int
        Exclude the first dropex lags from the other variables. For all
        variables except the first, lags from dropex to maxlagex are
        included.
    trim : str
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none' : no trimming of observations.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas Series or
        DataFrame. If false, return numpy ndarrays.

    Returns
    -------
    ndarray
        The array with lagged observations, columns ordered by variable.

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for
    convenience.
    """
    maxlag0 = int_like(maxlag0, "maxlag0")
    maxlagex = int_like(maxlagex, "maxlagex", optional=True)
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError("Only supports 1 and 2-dimensional data.")

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original="in", use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original="in", use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original="in")[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
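
# Usage sketch (hedged): a two-variable system where the first variable
# keeps lags 0..2 and the second keeps lags 1..2 (dropex=1); synthetic data.
import numpy as np
from statsmodels.tsa.tsatools import lagmat2ds

x = np.arange(12.0).reshape(6, 2)
out = lagmat2ds(x, maxlag0=2, dropex=1)
print(out.shape)  # (6, 5): 3 columns for variable 0, 2 for variable 1
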
def add_trend(x, trend="c", prepend=False, has_constant="skip"):
    """
    Add a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        The trend to add.

        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant column
        already exists in x. 'raise' will raise an error. 'add' will add a
        column of 1s. 'skip' will return the data without change. 'skip'
        is the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns. If x is a
        pandas Series or DataFrame, then the trend column names are
        'const', 'trend' and 'trend_squared'.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is
    currently no checking for an existing trend.
    """
    prepend = bool_like(prepend, "prepend")
    trend = string_like(trend, "trend", options=("n", "c", "t", "ct", "ctt"))
    has_constant = string_like(has_constant, "has_constant",
                               options=("raise", "add", "skip"))

    # TODO: could be generalized for trend of arbitrary order
    columns = ["const", "trend", "trend_squared"]
    if trend == "n":
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    if _is_recarray(x):
        from statsmodels.tools.sm_exceptions import recarray_exception
        raise NotImplementedError(recarray_exception)

    is_pandas = _is_using_pandas(x, None)
    if is_pandas:
        if isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    return False

            col_const = x.apply(safe_is_const, 0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == "raise":
                if x.ndim == 1:
                    base_err = "x is constant."
                else:
                    columns = np.arange(x.shape[1])[col_const]
                    if isinstance(x, pd.DataFrame):
                        columns = x.columns[col_const]
                    const_cols = ", ".join([str(c) for c in columns])
                    base_err = (
                        "x contains one or more constant columns. Column(s) "
                        f"{const_cols} are constant."
                    )
                msg = (f"{base_err} Adding a constant with trend='{trend}' "
                       "is not allowed.")
                raise ValueError(msg)
            elif has_constant == "skip":
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])
    return x
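
# Usage sketch (hedged): add constant and linear trend columns to synthetic
# data, assuming the public statsmodels API.
import numpy as np
from statsmodels.tsa.tsatools import add_trend

x = np.random.standard_normal((5, 2))
xt = add_trend(x, trend="ct")
print(xt.shape)    # (5, 4): original 2 columns plus const and trend
print(xt[:, -2:])  # appended columns: ones and 1..5
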
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = "forward",
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False,
) -> (NDArray | DataFrame | tuple[NDArray, NDArray]
      | tuple[DataFrame, DataFrame]):
    """
    Create 2d array of lags.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observation in rows and variables in columns.
    maxlag : int
        All lags from zero to maxlag are included.
    trim : {'forward', 'backward', 'both', 'none', None}
        The trimming method to use.

        * 'forward' : trim invalid observations in front.
        * 'backward' : trim invalid initial observations.
        * 'both' : trim invalid observations on both sides.
        * 'none', None : no trimming of observations.
    original : {'ex','sep','in'}
        How the original is treated.

        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a
          single array.
        * 'sep' : returns a tuple (original array, lagged values). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool
        If true, returns a DataFrame when the input is a pandas Series or
        DataFrame. If false, return numpy ndarrays.

    Returns
    -------
    lagmat : ndarray
        The array with lagged observations.
    y : ndarray, optional
        Only returned if original == 'sep'.

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can
    only be 'forward' or 'both' since it is not possible to consistently
    extend index values.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO: allow list of lags additional to maxlag
    orig = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = "none" if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ["ex", "sep"]:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ("none", "forward"):
        startobs = 0
    elif trim in ("backward", "both"):
        startobs = maxlag
    else:
        raise ValueError("trim option not valid")

    if trim in ("none", "backward"):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original == "sep":
        return lags, leads
    else:
        return lags
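
# Usage sketch (hedged): lagmat with pandas output keeps named '.L.n'
# columns; synthetic data, public statsmodels API assumed.
import pandas as pd
from statsmodels.tsa.tsatools import lagmat

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0], name="y")
lags = lagmat(s, maxlag=2, trim="both", original="ex", use_pandas=True)
print(lags.columns.tolist())  # ['y.L.1', 'y.L.2']
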
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward',
              use_pandas=False):
    """
    Generate lagmatrix for 2d array, columns arranged by variables

    Parameters
    ----------
    x : array_like, 2d
        2d data, observation in rows and variables in columns
    maxlag0 : int
        for first variable all lags from zero to maxlag are included
    maxlagex : None or int
        max lag for all other variables all lags from zero to maxlag are
        included
    dropex : int (default is 0)
        exclude first dropex lags from other variables
        for all variables, except the first, lags from dropex to maxlagex
        are included
    trim : str
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none' : no trimming of observations
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations, columns ordered by variable

    Notes
    -----
    Inefficient implementation for unequal lags, implemented for
    convenience
    """
    maxlag0 = int_like(maxlag0, 'maxlag0')
    maxlagex = int_like(maxlagex, 'maxlagex', optional=True)
    trim = string_like(trim, 'trim', optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    if maxlagex is None:
        maxlagex = maxlag0
    maxlag = max(maxlag0, maxlagex)
    is_pandas = _is_using_pandas(x, None)

    if x.ndim == 1:
        if is_pandas:
            x = pd.DataFrame(x)
        else:
            x = x[:, None]
    elif x.ndim == 0 or x.ndim > 2:
        raise ValueError('Only supports 1 and 2-dimensional data.')

    nobs, nvar = x.shape

    if is_pandas and use_pandas:
        lags = lagmat(x.iloc[:, 0], maxlag, trim=trim,
                      original='in', use_pandas=True)
        lagsli = [lags.iloc[:, :maxlag0 + 1]]
        for k in range(1, nvar):
            lags = lagmat(x.iloc[:, k], maxlag, trim=trim,
                          original='in', use_pandas=True)
            lagsli.append(lags.iloc[:, dropex:maxlagex + 1])
        return pd.concat(lagsli, axis=1)
    elif is_pandas:
        x = np.asanyarray(x)

    lagsli = [
        lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1]
    ]
    for k in range(1, nvar):
        lagsli.append(
            lagmat(x[:, k], maxlag, trim=trim,
                   original='in')[:, dropex:maxlagex + 1])
    return np.column_stack(lagsli)
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        * 'forward' : trim invalid observations in front
        * 'backward' : trim invalid initial observations
        * 'both' : trim invalid observations on both sides
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a
          single array.
        * 'sep' : returns a tuple (original array, lagged values). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
           [ 3.,  4.,  1.,  2.,  0.,  0.],
           [ 5.,  6.,  3.,  4.,  1.,  2.],
           [ 0.,  0.,  5.,  6.,  3.,  4.],
           [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can
    only be 'forward' or 'both' since it is not possible to consistently
    extend index values.
    """
    maxlag = int_like(maxlag, 'maxlag')
    use_pandas = bool_like(use_pandas, 'use_pandas')
    trim = string_like(trim, 'trim', optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    original = string_like(original, 'original', options=('ex', 'sep', 'in'))

    # TODO: allow list of lags additional to maxlag
    orig = x
    x = array_like(x, 'x', ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
def engle_granger_coef(self, y0, y1, trend='c', method='aeg', maxlag=None,
                       autolag='aic', normalize=True, debug=True):
    """
    Engle-Granger Cointegration Coefficient Calculations.

    This method takes a linear combination of two I(1) time series to
    create an I(0), or stationary, time series. This is useful if the two
    series have a similar stochastic long-term trend, as it eliminates the
    common trend and allows you to estimate the long-run relationship.

    Parameters
    ----------
    y0 : array_like
        The first element in cointegrated system. Must be 1-d.
    y1 : array_like
        The remaining elements in cointegrated system.
    trend : str {'c', 'ct'}
        The trend term included in regression for cointegrating equation.

        * 'c' : constant.
        * 'ct' : constant and linear trend.
        * also available quadratic trend 'ctt', and no constant 'nc'.
    method : {'aeg'}
        Only 'aeg' (augmented Engle-Granger) is available.
    maxlag : None or int
        Argument for `adfuller`, largest or given number of lags.
    autolag : str
        Argument for `adfuller`, lag selection criterion.

        * If None, then maxlag lags are used without lag search.
        * If 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion.
        * 't-stat' based choice of maxlag. Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
    normalize : bool, optional
        As there are infinitely many scalar multiples that will produce
        the factor, this normalizes the first entry to be 1.
    debug : bool, optional
        Checks if the series has a possible cointegration factor using the
        Engle-Granger Cointegration Test.

    Returns
    -------
    coefs : array
        A vector that will create an I(0) time series if a combination
        exists.

    Notes
    -----
    The series should be checked independently for their integration
    order. The series must be I(1) to get consistent results. You can
    check this by using the int_order function.

    References
    ----------
    .. [1] MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution
       Functions for Unit-Root and Cointegration Tests." Journal of
       Business & Economics Statistics, 12.2, 167-76.
    .. [2] MacKinnon, J.G. 2010. "Critical Values for Cointegration
       Tests." Queen's University, Dept of Economics Working Papers 1227.
       http://ideas.repec.org/p/qed/wpaper/1227.html
    .. [3] Hamilton, J. D. (1994). Time series analysis (Vol. 2,
       pp. 690-696). Princeton, NJ: Princeton university press.
    """
    trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))

    if debug:
        coint_t, pvalue, crit_value = coint(y0, y1, trend, method,
                                            maxlag, autolag)
        if pvalue >= .10:
            print('The null hypothesis cannot be rejected')

    nobs, k_vars = y1.shape
    y1 = add_trend(y1, trend=trend, prepend=False)

    eg_model = OLS(y0, y1).fit()
    coefs = eg_model.params[0:k_vars]

    if normalize:
        coefs = coefs / coefs[0]

    return coefs
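
# Hedged standalone sketch of the first-stage Engle-Granger regression the
# method above performs, using only public statsmodels pieces; the class
# owning the method is not part of statsmodels, so this illustrates the
# computation on synthetic cointegrated data.
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.tsatools import add_trend

rng = np.random.default_rng(0)
x = np.cumsum(rng.standard_normal((500, 2)), axis=0)   # two I(1) series
y = 2.0 * x[:, 0] - 1.0 * x[:, 1] + rng.standard_normal(500)
rhs = add_trend(x, trend='c', prepend=False)
coefs = OLS(y, rhs).fit().params[:2]
print(coefs / coefs[0])  # normalized, approximately [1.0, -0.5]
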
def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated
    parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large
        simulation. 'approx' is only valid for normality. If
        `dist = 'exp'`, 'table' is always used. 'approx' uses the
        approximation formula of Dalal and Wilkinson, valid for pvalues
        < 0.1. If the pvalue is larger than 0.1, then the result of
        `table` is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and
        variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using

        log(cv_alpha) = b_alpha + c[0] * log(n) + c[1] * log(n)**2

    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[0] and c[1] are
    coefficients that are shared across all alphas.

    Values in the table are linearly interpolated. Values outside the
    range are returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see
    lilliefors_critical_value_simulation.py in the test directory.
    """
    pvalmethod = string_like(pvalmethod, "pvalmethod",
                             options=("approx", "table"))
    x = np.asarray(x)
    nobs = len(x)

    if dist == 'norm':
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {0} requires at least {1} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # check pval is in desired range
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval
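
# Usage sketch (hedged): this function is exposed publicly as `lilliefors`
# in statsmodels.stats.diagnostic; synthetic data for illustration.
import numpy as np
from statsmodels.stats.diagnostic import lilliefors

x = np.random.default_rng(0).standard_normal(100)
stat, pval = lilliefors(x, dist="norm", pvalmethod="table")
print(stat, pval)  # a large pval gives no evidence against normality
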
def __init__(self, endog, k_regimes, trend='c', exog=None, order=0,
             exog_tvtp=None, switching_trend=True, switching_exog=True,
             switching_variance=False, dates=None, freq=None,
             missing='none'):
    # Properties
    from statsmodels.tools.validation import string_like
    self.trend = string_like(trend, "trend", options=("n", "c", "ct", "t"))
    self.switching_trend = switching_trend
    self.switching_exog = switching_exog
    self.switching_variance = switching_variance

    # Exogenous data
    self.k_exog, exog = markov_switching.prepare_exog(exog)

    # Trend
    nobs = len(endog)
    self.k_trend = 0
    self._k_exog = self.k_exog
    trend_exog = None
    if trend == 'c':
        trend_exog = np.ones((nobs, 1))
        self.k_trend = 1
    elif trend == 't':
        trend_exog = (np.arange(nobs) + 1)[:, np.newaxis]
        self.k_trend = 1
    elif trend == 'ct':
        trend_exog = np.c_[np.ones((nobs, 1)),
                           (np.arange(nobs) + 1)[:, np.newaxis]]
        self.k_trend = 2
    if trend_exog is not None:
        exog = trend_exog if exog is None else np.c_[trend_exog, exog]
        self._k_exog += self.k_trend

    # Initialize the base model
    super(MarkovRegression, self).__init__(
        endog, k_regimes, order=order, exog_tvtp=exog_tvtp, exog=exog,
        dates=dates, freq=freq, missing=missing)

    # Switching options
    if self.switching_trend is True or self.switching_trend is False:
        self.switching_trend = [self.switching_trend] * self.k_trend
    elif not len(self.switching_trend) == self.k_trend:
        raise ValueError('Invalid iterable passed to `switching_trend`.')
    if self.switching_exog is True or self.switching_exog is False:
        self.switching_exog = [self.switching_exog] * self.k_exog
    elif not len(self.switching_exog) == self.k_exog:
        raise ValueError('Invalid iterable passed to `switching_exog`.')

    self.switching_coeffs = (
        np.r_[self.switching_trend,
              self.switching_exog].astype(bool).tolist())

    # Parameters
    self.parameters['exog'] = self.switching_coeffs
    self.parameters['variance'] = [1] if self.switching_variance else [0]
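
# Usage sketch (hedged): a two-regime mean/variance switching model via the
# public statsmodels API; synthetic regime-switching data.
import numpy as np
from statsmodels.tsa.regime_switching.markov_regression import (
    MarkovRegression)

rng = np.random.default_rng(0)
y = np.r_[rng.normal(0.0, 1.0, 150), rng.normal(3.0, 2.0, 150)]
res = MarkovRegression(y, k_regimes=2, trend='c',
                       switching_variance=True).fit()
print(res.summary())
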
def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
             initialization_method='estimated', initial_level=None,
             initial_trend=None, initial_seasonal=None, bounds=None,
             concentrate_scale=True, dates=None, freq=None,
             missing='none'):
    # Model definition
    self.trend = bool_like(trend, 'trend')
    self.damped_trend = bool_like(damped_trend, 'damped_trend')
    self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
    self.seasonal = self.seasonal_periods is not None
    self.initialization_method = string_like(
        initialization_method, 'initialization_method').lower()
    self.concentrate_scale = bool_like(concentrate_scale,
                                       'concentrate_scale')

    # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
    # TODO: add `bounds_method` argument to choose between "usual" and
    # "admissible" as in Hyndman et al. (2008)
    self.bounds = bounds
    if self.bounds is None:
        self.bounds = [(1e-4, 1 - 1e-4)] * 3 + [(0.8, 0.98)]

    # Validation
    if self.seasonal_periods == 1:
        raise ValueError('Cannot have a seasonal period of 1.')

    if self.seasonal and self.seasonal_periods is None:
        raise NotImplementedError('Unable to detect season automatically;'
                                  ' please specify `seasonal_periods`.')

    if self.initialization_method not in ['concentrated', 'estimated',
                                          'simple', 'heuristic', 'known']:
        raise ValueError('Invalid initialization method "%s".'
                         % initialization_method)

    if self.initialization_method == 'known':
        if initial_level is None:
            raise ValueError('`initial_level` argument must be provided'
                             ' when initialization method is set to'
                             ' "known".')
        if initial_trend is None and self.trend:
            raise ValueError('`initial_trend` argument must be provided'
                             ' for models with a trend component when'
                             ' initialization method is set to "known".')
        if initial_seasonal is None and self.seasonal:
            raise ValueError('`initial_seasonal` argument must be provided'
                             ' for models with a seasonal component when'
                             ' initialization method is set to "known".')

    # Initialize the state space model
    if not self.seasonal or self.seasonal_periods is None:
        self._seasonal_periods = 0
    else:
        self._seasonal_periods = self.seasonal_periods

    k_states = 2 + int(self.trend) + self._seasonal_periods
    k_posdef = 1

    init = ss_init.Initialization(k_states, 'known',
                                  constant=[0] * k_states)
    super(ExponentialSmoothing, self).__init__(
        endog, k_states=k_states, k_posdef=k_posdef,
        initialization=init, dates=dates, freq=freq, missing=missing)

    # Concentrate the scale out of the likelihood function
    if self.concentrate_scale:
        self.ssm.filter_concentrated = True

    # Setup fixed elements of the system matrices
    # Observation error
    self.ssm['design', 0, 0] = 1.
    self.ssm['selection', 0, 0] = 1.
    self.ssm['state_cov', 0, 0] = 1.

    # Level
    self.ssm['design', 0, 1] = 1.
    self.ssm['transition', 1, 1] = 1.

    # Trend
    if self.trend:
        self.ssm['transition', 1:3, 2] = 1.

    # Seasonal
    if self.seasonal:
        k = 2 + int(self.trend)
        self.ssm['design', 0, k] = 1.
        self.ssm['transition', k, -1] = 1.
        self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
            np.eye(self.seasonal_periods - 1))

    # Initialization of the states
    if self.initialization_method != 'known':
        msg = ('Cannot give `%%s` argument when initialization is "%s"'
               % initialization_method)

        if initial_level is not None:
            raise ValueError(msg % 'initial_level')
        if initial_trend is not None:
            raise ValueError(msg % 'initial_trend')
        if initial_seasonal is not None:
            raise ValueError(msg % 'initial_seasonal')

    if self.initialization_method == 'simple':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_simple(
                self.endog[:, 0],
                trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'heuristic':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_heuristic(
                self.endog[:, 0],
                trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'known':
        initial_level = float_like(initial_level, 'initial_level')
        if self.trend:
            initial_trend = float_like(initial_trend, 'initial_trend')
        if self.seasonal:
            initial_seasonal = array_like(initial_seasonal,
                                          'initial_seasonal')
            if len(initial_seasonal) == self.seasonal_periods - 1:
                initial_seasonal = np.r_[initial_seasonal,
                                         0 - np.sum(initial_seasonal)]
            if len(initial_seasonal) != self.seasonal_periods:
                raise ValueError(
                    'Invalid length of initial seasonal values. Must be'
                    ' one of s or s-1, where s is the number of seasonal'
                    ' periods.')

    # Note that the simple and heuristic methods of computing initial
    # seasonal factors return estimated seasonal factors associated with
    # the first t = 1, 2, ..., `n_seasons` observations. To use these as
    # the initial state, we lag them by `n_seasons`. This yields, for
    # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
    # As described above, the state vector in this model should have
    # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
    # reverse the order of the computed initial seasonal factors from
    # these methods.
    methods = ['simple', 'heuristic']
    if (self.initialization_method in methods
            and initial_seasonal is not None):
        initial_seasonal = initial_seasonal[::-1]

    self._initial_level = initial_level
    self._initial_trend = initial_trend
    self._initial_seasonal = initial_seasonal
    self._initial_state = None

    # Initialize now if possible (if we have a damped trend, then
    # initialization will depend on the phi parameter, and so has to be
    # done at each `update`)
    methods = ['simple', 'heuristic', 'known']
    if not self.damped_trend and self.initialization_method in methods:
        self._initialize_constant_statespace(initial_level, initial_trend,
                                             initial_seasonal)

    # Save keys for kwarg initialization
    self._init_keys += ['trend', 'damped_trend', 'seasonal',
                        'initialization_method', 'initial_level',
                        'initial_trend', 'initial_seasonal', 'bounds',
                        'concentrate_scale', 'dates', 'freq', 'missing']
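
# Usage sketch (hedged): the state space form of exponential smoothing via
# the public statsmodels API; synthetic positive monthly data.
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.exponential_smoothing import (
    ExponentialSmoothing)

idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(100 + 0.5 * np.arange(120)
              + 10 * np.sin(2 * np.pi * np.arange(120) / 12), index=idx)
res = ExponentialSmoothing(y, trend=True, seasonal=12).fit()
print(res.forecast(12))
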
def adfuller(
    x,
    maxlag=None,
    regression="c",
    autolag="AIC",
    store=False,
    regresults=False,
):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root
    in a univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default
        12*(nobs/100)^{1/4}.
    regression : {"c","ct","ctt","nc"}
        Constant and trend order to include in regression.

        * "c" : constant only (default).
        * "ct" : constant and trend.
        * "ctt" : constant, and linear and quadratic trend.
        * "nc" : no constant, no trend.
    autolag : {"AIC", "BIC", "t-stat", None}
        Method to use when automatically determining the lag length among
        the values 0, 1, ..., maxlag.

        * If "AIC" (default) or "BIC", then the number of lags is chosen
          to minimize the corresponding information criterion.
        * "t-stat" based choice of maxlag. Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
        * If None, then the number of included lags is set to maxlag.
    store : bool
        If True, then a result instance is returned additionally to the
        adf statistic. Default is False.
    regresults : bool, optional
        If True, the full regression results are returned. Default is
        False.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used.
    nobs : int
        The number of observations used for the ADF regression and
        calculation of the critical values.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a
    unit root, with the alternative that there is no unit root. If the
    pvalue is above a critical size, then we cannot reject that there is
    a unit root.

    The p-values are obtained through regression surface approximation
    from MacKinnon 1994, but using the updated 2010 tables. If the
    p-value is close to significant, then the critical values should be
    used to judge whether to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Green. "Econometric Analysis," 5th ed., Pearson, 2003.
    .. [2] Hamilton, J.D. "Time Series Analysis". Princeton, 1994.
    .. [3] MacKinnon, J.G. 1994. "Approximate asymptotic distribution
       functions for unit-root and cointegration tests." `Journal of
       Business and Economic Statistics` 12, 167-76.
    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration
       Tests." Queen's University, Dept of Economics, Working Papers.
       Available at http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
    """
    x = array_like(x, "x")
    maxlag = int_like(maxlag, "maxlag", optional=True)
    regression = string_like(regression, "regression",
                             options=("c", "ct", "ctt", "nc"))
    autolag = string_like(autolag, "autolag", optional=True,
                          options=("aic", "bic", "t-stat"))
    store = bool_like(store, "store")
    regresults = bool_like(regresults, "regresults")

    if regresults:
        store = True

    trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    nobs = x.shape[0]

    ntrend = len(regression) if regression != "nc" else 0
    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
        # -1 for the diff
        maxlag = min(nobs // 2 - ntrend - 1, maxlag)
        if maxlag < 0:
            raise ValueError("sample size is too short to use selected "
                             "regression component")
    elif maxlag > nobs // 2 - ntrend - 1:
        raise ValueError("maxlag must be less than (nobs/2 - 1 - ntrend) "
                         "where ntrend is the number of included "
                         "deterministic regressors")

    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in")
    nobs = xdall.shape[0]

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        from statsmodels.stats.diagnostic import ResultsStore
        resstore = ResultsStore()
    if autolag:
        if regression != "nc":
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1  # 1 for level
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better
        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
                                       maxlag, autolag)
        else:
            icbest, bestlag, alres = _autolag(
                OLS,
                xdshort,
                fullRHS,
                startlag,
                maxlag,
                autolag,
                regresults=regresults,
            )
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index
        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in")
        nobs = xdall.shape[0]
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None

    if regression != "nc":
        resols = OLS(xdshort,
                     add_trend(xdall[:, :usedlag + 1], regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    # adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)]))*(resols.params[0]-1)
    # I think this is the statistic that is used for series that are
    # integrated for orders higher than I(1), ie., not ADF but
    # cointegration tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2],
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = "Augmented Dickey-Fuller Test Results"
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
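
# Usage sketch (hedged): test a random walk for a unit root; synthetic
# data, public statsmodels API assumed.
import numpy as np
from statsmodels.tsa.stattools import adfuller

y = np.cumsum(np.random.default_rng(0).standard_normal(250))
stat, pval, usedlag, nobs, crit, icbest = adfuller(y, regression="c",
                                                   autolag="AIC")
print(pval)  # a large p-value: cannot reject the unit-root null
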
def fit(
    self,
    method="inv",
    cov_type="nonrobust",
    cov_kwds=None,
    reset=None,
    use_t=False,
    params_only=False,
):
    """
    Estimate model parameters.

    Parameters
    ----------
    method : {'inv', 'lstsq', 'pinv'}
        Method to use when computing the model parameters.

        * 'inv' - use moving windows inner-products and matrix inversion.
          This method is the fastest, but may be less accurate than the
          other methods.
        * 'lstsq' - Use numpy.linalg.lstsq
        * 'pinv' - Use numpy.linalg.pinv. This method matches the default
          estimator in non-moving regression estimators.
    cov_type : {'nonrobust', 'HCCM', 'HC0'}
        Covariance estimator:

        * nonrobust - The classic OLS covariance estimator
        * HCCM, HC0 - White heteroskedasticity robust covariance
    cov_kwds : dict
        Unused
    reset : int, optional
        Interval to recompute the moving window inner products used to
        estimate the model parameters. Smaller values improve accuracy,
        although in practice this setting is not required to be set.
    use_t : bool, optional
        Flag indicating to use the Student's t distribution when computing
        p-values.
    params_only : bool, optional
        Flag indicating that only parameters should be computed. Avoids
        calculating all other statistics or performing inference.

    Returns
    -------
    RollingRegressionResults
        Estimation results where all pre-sample values are nan-filled.
    """
    method = string_like(method, "method",
                         options=("inv", "lstsq", "pinv"))
    reset = int_like(reset, "reset", optional=True)
    reset = self._y.shape[0] if reset is None else reset
    if reset < 1:
        raise ValueError("reset must be a positive integer")

    nobs, k = self._x.shape
    store = RollingStore(
        params=np.full((nobs, k), np.nan),
        ssr=np.full(nobs, np.nan),
        llf=np.full(nobs, np.nan),
        nobs=np.zeros(nobs, dtype=int),
        s2=np.full(nobs, np.nan),
        xpxi=np.full((nobs, k, k), np.nan),
        xeex=np.full((nobs, k, k), np.nan),
        centered_tss=np.full(nobs, np.nan),
        uncentered_tss=np.full(nobs, np.nan),
    )
    w = self._window
    first = self._min_nobs if self._expanding else w
    xpx, xpy, nobs = self._reset(first)
    if not (self._has_nan[first - 1] and self._skip_missing):
        self._fit_single(first, xpx, xpy, nobs, store, params_only, method)
    wx, wy = self._wx, self._wy
    for i in range(first + 1, self._x.shape[0] + 1):
        if self._has_nan[i - 1] and self._skip_missing:
            continue
        if i % reset == 0:
            xpx, xpy, nobs = self._reset(i)
        else:
            if not self._is_nan[i - w - 1] and i > w:
                remove_x = wx[i - w - 1:i - w]
                xpx -= remove_x.T @ remove_x
                xpy -= remove_x.T @ wy[i - w - 1:i - w]
                nobs -= 1
            if not self._is_nan[i - 1]:
                add_x = wx[i - 1:i]
                xpx += add_x.T @ add_x
                xpy += add_x.T @ wy[i - 1:i]
                nobs += 1
        self._fit_single(i, xpx, xpy, nobs, store, params_only, method)

    return RollingRegressionResults(self, store, self.k_constant, use_t,
                                    cov_type)
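
# Usage sketch (hedged): fit options on a RollingOLS model; 'lstsq' trades
# speed for accuracy, and params_only skips inference entirely.
import numpy as np
from statsmodels.regression.rolling import RollingOLS

rng = np.random.default_rng(1)
x = rng.standard_normal((300, 2))
y = x @ np.array([0.5, -0.25]) + rng.standard_normal(300)
res = RollingOLS(y, x, window=50).fit(method="lstsq", params_only=True)
print(res.params.shape)  # (300, 2), nan-filled before the window fills
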
def __init__(self, data, ncomp=None, standardize=True, demean=True,
             normalize=True, gls=False, weights=None, method='svd',
             missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
             max_em_iter=100):
    self._index = None
    self._columns = []
    if isinstance(data, pd.DataFrame):
        self._index = data.index
        self._columns = data.columns

    self.data = array_like(data, "data", ndim=2)
    # Store inputs
    self._gls = bool_like(gls, "gls")
    self._normalize = bool_like(normalize, "normalize")
    self._tol = float_like(tol, "tol")
    if not 0 < self._tol < 1:
        raise ValueError('tol must be strictly between 0 and 1')
    self._max_iter = int_like(max_iter, "max_iter")
    self._max_em_iter = int_like(max_em_iter, "max_em_iter")
    self._tol_em = float_like(tol_em, "tol_em")

    # Prepare data
    self._standardize = bool_like(standardize, "standardize")
    self._demean = bool_like(demean, "demean")

    self._nobs, self._nvar = self.data.shape
    weights = array_like(weights, "weights", maxdim=1, optional=True)
    if weights is None:
        weights = np.ones(self._nvar)
    else:
        weights = np.array(weights).flatten()
        if weights.shape[0] != self._nvar:
            raise ValueError('weights should have nvar elements')
    weights = weights / np.sqrt((weights ** 2.0).mean())
    self.weights = weights

    # Check ncomp against maximum
    min_dim = min(self._nobs, self._nvar)
    self._ncomp = min_dim if ncomp is None else ncomp
    if self._ncomp > min_dim:
        import warnings
        warn = 'The requested number of components is more than can be ' \
               'computed from data. The maximum number of components is ' \
               'the minimum of the number of observations or variables'
        warnings.warn(warn, ValueWarning)
        self._ncomp = min_dim

    self._method = method
    # Workaround to avoid instance methods in __dict__
    if self._method not in ('eig', 'svd', 'nipals'):
        raise ValueError('method {0} is not known.'.format(method))

    self.rows = np.arange(self._nobs)
    self.cols = np.arange(self._nvar)
    # Handle missing
    self._missing = string_like(missing, "missing", optional=True)
    self._adjusted_data = self.data
    self._adjust_missing()

    # Update size
    self._nobs, self._nvar = self._adjusted_data.shape
    if self._ncomp == np.min(self.data.shape):
        self._ncomp = np.min(self._adjusted_data.shape)
    elif self._ncomp > np.min(self._adjusted_data.shape):
        raise ValueError('When adjusting for missing values, user '
                         'provided ncomp must be no larger than the '
                         'smallest dimension of the '
                         'missing-value-adjusted data size.')

    # Attributes and internal values
    self._tss = 0.0
    self._ess = None
    self.transformed_data = None
    self._mu = None
    self._sigma = None
    self._ess_indiv = None
    self._tss_indiv = None
    self.scores = self.factors = None
    self.loadings = None
    self.coeff = None
    self.eigenvals = None
    self.eigenvecs = None
    self.projection = None
    self.rsquare = None
    self.ic = None

    # Prepare data
    self.transformed_data = self._prepare_data()
    # Perform the PCA
    self._pca()
    if gls:
        self._compute_gls_weights()
        self.transformed_data = self._prepare_data()
        self._pca()

    # Final calculations
    self._compute_rsquare_and_ic()
    if self._index is not None:
        self._to_pandas()
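
# Usage sketch (hedged): PCA with three components on synthetic data,
# assuming the public statsmodels API.
import numpy as np
from statsmodels.multivariate.pca import PCA

data = np.random.default_rng(0).standard_normal((100, 10))
pc = PCA(data, ncomp=3, standardize=True)
print(pc.factors.shape, pc.loadings.shape)  # (100, 3) and (10, 3)
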
def add_trend(x, trend="c", prepend=False, has_constant='skip'):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend is 'c' and a constant already
        exists in x. 'raise' will raise an error. 'add' will duplicate a
        constant. 'skip' will return the data without change. 'skip' is
        the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        recarray or pandas Series or DataFrame, then the trend column
        names are 'const', 'trend' and 'trend_squared'.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is
    currently no checking for an existing trend.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.
    """
    prepend = bool_like(prepend, 'prepend')
    trend = string_like(trend, 'trend',
                        options=('n', 'c', 't', 'ct', 'ctt'))
    has_constant = string_like(has_constant, 'has_constant',
                               options=('raise', 'add', 'skip'))

    # TODO: could be generalized for trend of arbitrary order
    columns = ['const', 'trend', 'trend_squared']
    if trend == 'n':
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    is_recarray = _is_recarray(x)
    is_pandas = _is_using_pandas(x, None) or is_recarray
    if is_pandas or is_recarray:
        if is_recarray:
            descr = x.dtype.descr
            x = pd.DataFrame.from_records(x)
        elif isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas or is_recarray:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    return False

            col_const = x.apply(safe_is_const, 0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == 'raise':
                msg = "x contains a constant. Adding a constant with " \
                      "trend='{0}' is not allowed.".format(trend)
                raise ValueError(msg)
            elif has_constant == 'skip':
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_recarray or is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    if is_recarray:
        x = x.to_records(index=False)
        new_descr = x.dtype.descr
        extra_col = len(new_descr) - len(descr)
        if prepend:
            descr = new_descr[:extra_col] + descr
        else:
            descr = descr + new_descr[-extra_col:]
        x = x.astype(np.dtype(descr))

    return x
def dynamic_coefs(self, y0, y1, n_lags=None, trend='c', normalize=True,
                  reverse=False):
    """
    Dynamic Cointegration Coefficient Calculation.

    This method takes a linear combination of multiple I(1) time series
    to create an I(0), or stationary, time series. This is useful if the
    series have a similar stochastic long-term trend, as it eliminates
    the common trend and allows you to estimate the long-run
    relationship. Unlike Engle-Granger, this method uses dynamic
    regression - taking an equal combination of lags and leads of the
    differences of the series - to create a more accurate parameter
    vector. This method calculates the lag-lead matrices for the given
    lag values or searches for the best number of lags using BIC
    calculations. Once the optimal value is found, the calculation is
    done and returned. The optimal lag can be found by using dot notation
    and finding max_val. You can also find the model by using .model.

    Parameters
    ----------
    y0 : array_like
        The first element in cointegrated system. Must be 1-d.
    y1 : array_like
        The remaining elements in cointegrated system.
    n_lags : int, array, None
        This determines which values the function should search for the
        best vector.

        * int: If an int, the calculation is done for only that lag.
        * array: If an array of two integers, the first value is where
          the search begins and the second is where it ends.
        * None: If None is given, the function searches from 2 to the
          ceiling of the cube root of the number of observations divided
          by two, plus two in order to ensure at least one value is
          searched. I.e. last_lag = (n_obs**(1/3) / 2) + 2.
    trend : str {'c', 'ct'}
        The trend term included in regression for cointegrating equation.

        * 'c' : constant.
        * 'ct' : constant and linear trend.
        * also available quadratic trend 'ctt', and no constant 'nc'.
    normalize : bool
        If true, the first entry in the parameter vector is normalized to
        one and everything else is divided by the first entry. This is
        because any cointegrating vector could be multiplied by a scalar
        and still be a cointegrating vector.
    reverse : bool
        The series must be ordered from the latest data points to the
        last. This is in order to calculate the differences. Using this,
        you can reverse the ordering of your data points.

    Returns
    -------
    coefs : array
        A vector that will create an I(0) time series if a combination
        exists.

    Notes
    -----
    The data must go from the latest observations to the earliest. If
    not, the coef vector will be the opposite sign.

    The series should be checked independently for their integration
    order. The series must be I(1) to get consistent results. You can
    check this by using the int_order function.

    References
    ----------
    .. [1] Stock, J. H., & Watson, M. W. (1993). A simple estimator of
       cointegrating vectors in higher order integrated systems.
       Econometrica: Journal of the Econometric Society, 783-820.
    .. [2] Hamilton, J. D. (1994). Time series analysis (Vol. 2,
       pp. 690-696). Princeton, NJ: Princeton university press.
    """
    self.bics = []
    self.max_val = []
    self.model = ''
    self.coefs = []

    trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))
    y1 = add_trend(y1, trend=trend, prepend=True)
    y1 = y1.reset_index(drop=True)

    if reverse:
        y0, y1 = y0[::-1], y1[::-1]

    n_obs, k = y1.shape
    if _is_using_pandas(y0, y1):
        columns = list(y1.columns)
    else:
        # Need to check if NumPy, because I can only support those two
        columns = [f'Var_{x}' for x in range(k)]
        y0, y1 = pd.DataFrame(y0), pd.DataFrame(y1)

    if n_lags is None:
        dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))
        for lag in range(2, int(np.ceil(n_obs**(1 / 3) / 2) + 2)):
            df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward'))
            cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
            df1 = df1.rename(columns=cols)
            df2 = pd.DataFrame(lagmat(dta, lag, trim='forward'))

            lags_leads = pd.concat([df1, df2], axis=1, join='outer')
            lags_leads = lags_leads.drop(list(range(0, lag)))
            lags_leads = lags_leads.reset_index(drop=True)
            lags_leads = lags_leads.drop(
                list(range(len(lags_leads) - lag, len(lags_leads))))
            lags_leads = lags_leads.reset_index(drop=True)

            data_y = y0.drop(list(range(0, lag))).reset_index(drop=True)
            data_y = data_y.drop(
                list(range(len(data_y) - lag - 1, len(data_y))))
            data_y = data_y.reset_index(drop=True)

            self.bics.append([OLS(data_y, lags_leads).fit().bic, lag])

        # A smaller BIC indicates a better model, so select the minimum
        self.max_val = min(self.bics, key=lambda item: item[0])
        self.max_val = self.max_val[1]

    elif isinstance(n_lags, (int, np.integer)):
        self.max_val = int(n_lags)

    elif len(n_lags) == 2:
        start, end = int(n_lags[0]), int(n_lags[1])
        dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))
        for lag in range(start, end + 1):
            df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward'))
            cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
            df1 = df1.rename(columns=cols)
            df2 = pd.DataFrame(lagmat(dta, lag, trim='forward'))

            lags_leads = pd.concat([df1, df2], axis=1, join='outer')
            lags_leads = lags_leads.drop(list(range(0, lag)))
            lags_leads = lags_leads.reset_index(drop=True)
            lags_leads = lags_leads.drop(
                list(range(len(lags_leads) - lag, len(lags_leads))))
            lags_leads = lags_leads.reset_index(drop=True)

            data_y = y0.drop(list(range(0, lag))).reset_index(drop=True)
            data_y = data_y.drop(
                list(range(len(data_y) - lag - 1, len(data_y))))
            data_y = data_y.reset_index(drop=True)

            self.bics.append([OLS(data_y, lags_leads).fit().bic, lag])

        # A smaller BIC indicates a better model, so select the minimum
        self.max_val = min(self.bics, key=lambda item: item[0])
        self.max_val = self.max_val[1]

    elif len(n_lags) == 1:
        self.max_val = int(n_lags[0])

    else:
        raise ValueError(
            'Make sure your lags are in one of the required forms.')

    dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0))

    # Create a matrix of the lags; this also retains the original matrix,
    # which is why max_val + 1
    df1 = pd.DataFrame(lagmat(dta, self.max_val + 1, trim='backward'))

    # Rename the columns, as we need to keep track of them. We know the
    # original will be the final values
    cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns))
    df1 = df1.rename(columns=cols)

    # Do the same, but these are leads; this does not keep the
    # original matrix, thus max_val
    df2 = pd.DataFrame(lagmat(dta, self.max_val, trim='forward'))

    # There are missing data due to the lags and leads; we concat
    # the frames and drop the values which are missing
    lags_leads = pd.concat([df1, df2], axis=1, join='outer')
    lags_leads = lags_leads.drop(list(range(0, self.max_val)))
    lags_leads = lags_leads.reset_index(drop=True)
    lags_leads = lags_leads.drop(
        list(range(len(lags_leads) - self.max_val, len(lags_leads))))
    lags_leads = lags_leads.reset_index(drop=True)

    # We also need to do this for the endog values; we need to
    # drop 1 extra due to a loss from first differencing.
    # This will be at the end of the matrix.
    data_y = y0.drop(list(range(0, self.max_val))).reset_index(drop=True)
    data_y = data_y.drop(
        list(range(len(data_y) - self.max_val - 1, len(data_y))))
    data_y = data_y.reset_index(drop=True)

    self.model = OLS(data_y, lags_leads).fit()
    self.coefs = self.model.params[list(y1.columns)]

    if normalize:
        self.coefs = self.coefs / self.coefs[0]

    return self.coefs
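
# Hedged standalone sketch of the dynamic (Stock-Watson DOLS) idea the
# method above implements: regress y_t on the levels x_t plus p leads and
# lags of diff(x). This is an independent illustration under the stated
# assumptions (fixed lead/lag count p, synthetic data), not the author's
# exact routine.
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.tsatools import add_trend


def dols_coefs(y, x, p=2):
    # Align levels at t = p+1 .. n-1-p with dx_{t+j}, j = -p..p, where
    # dx[t-1] = x[t] - x[t-1]
    x = np.asarray(x, dtype=float).reshape(len(x), -1)
    y = np.asarray(y, dtype=float)
    n, k = x.shape
    dx = np.diff(x, axis=0)
    cols = [dx[p + j: n - 1 - p + j] for j in range(-p, p + 1)]
    rows = slice(p + 1, n - p)
    rhs = add_trend(np.column_stack([x[rows]] + cols), trend="c")
    return OLS(y[rows], rhs).fit().params[:k]


rng = np.random.default_rng(0)
x = np.cumsum(rng.standard_normal((400, 1)), axis=0)  # I(1) regressor
y = 2.0 * x[:, 0] + rng.standard_normal(400)          # cointegrated
print(dols_coefs(y, x, p=2))  # approximately [2.0]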