def __init__(
    self,
    index: Union[Sequence[Hashable], pd.Index],
    *,
    period: Optional[Union[float, int]] = None,
    constant: bool = False,
    order: int = 0,
    seasonal: bool = False,
    fourier: int = 0,
    additional_terms: Sequence[DeterministicTerm] = (),
    drop: bool = False,
):
    """
    Validate the inputs and assemble the list of deterministic terms.

    Parameters
    ----------
    index : {Sequence[Hashable], pd.Index}
        The index of the process. Converted to a ``pd.Index`` when it is
        not one already.
    period : {float, int}, optional
        The period used by the seasonal or Fourier term. When omitted and
        either ``seasonal`` or ``fourier`` is set, it is derived from the
        index frequency using ``freq_to_period``.
    constant : bool
        Passed, together with ``order``, to ``TimeTrend``. The term is
        added when either ``constant`` or ``order`` is truthy.
    order : int
        Passed, together with ``constant``, to ``TimeTrend``.
    seasonal : bool
        If True, a ``Seasonality(period)`` term is added. The period must
        then be integer-like.
    fourier : int
        If non-zero (and ``seasonal`` is False), a
        ``Fourier(period, order=fourier)`` term is added.
    additional_terms : Sequence[DeterministicTerm]
        Extra terms appended after the ones built from the arguments
        above. Each must be a ``DeterministicTerm`` that is not already
        present.
    drop : bool
        Stored for later use; it is not consumed by the constructor.

    Raises
    ------
    ValueError
        If both ``seasonal`` and ``fourier`` are requested, or if an
        additional term duplicates a term created from the arguments.
    TypeError
        If an additional term is not a ``DeterministicTerm``.
    """
    if not isinstance(index, pd.Index):
        index = pd.Index(index)
    self._index = index
    self._deterministic_terms: List[DeterministicTerm] = []
    self._extendable = False
    self._index_freq = None
    self._validate_index()
    period = float_like(period, "period", optional=True)
    self._constant = constant = bool_like(constant, "constant")
    self._order = required_int_like(order, "order")
    self._seasonal = seasonal = bool_like(seasonal, "seasonal")
    self._fourier = required_int_like(fourier, "fourier")
    additional_terms = tuple(additional_terms)
    self._cached_in_sample = None
    self._drop = bool_like(drop, "drop")
    self._additional_terms = additional_terms
    if constant or order:
        self._deterministic_terms.append(TimeTrend(constant, order))
    if seasonal and fourier:
        raise ValueError(
            "seasonal and fourier cannot both be initialized through the "
            "constructor since these will be necessarily perfectly "
            "collinear. Instead, you can pass additional components using "
            "the additional_terms input."
        )
    if (seasonal or fourier) and period is None:
        # No explicit period: fall back to the one implied by the index.
        self._period = period = freq_to_period(self._index_freq)
    if seasonal:
        period = required_int_like(period, "period")
        self._deterministic_terms.append(Seasonality(period))
    elif fourier:
        period = float_like(period, "period")
        assert period is not None
        self._deterministic_terms.append(Fourier(period, order=fourier))
    for term in additional_terms:
        if not isinstance(term, DeterministicTerm):
            raise TypeError(
                "All additional terms must be instances of subclasses "
                "of DeterministicTerm"
            )
        if term in self._deterministic_terms:
            raise ValueError(
                "One or more terms in additional_terms has been added "
                "through the parameters of the constructor. Terms must "
                "be unique."
            )
        self._deterministic_terms.append(term)
    self._period = period
    self._retain_cols: Optional[List[Hashable]] = None
def test_bool_like(boolean):
    """Check bool_like's default, optional and strict handling of a value."""
    coerced = bool_like(boolean, "boolean")
    assert isinstance(coerced, bool)

    missing = bool_like(None, "boolean", optional=True)
    assert missing is None

    if not isinstance(boolean, bool):
        # Anything that is not a genuine bool must be rejected when strict.
        with pytest.raises(TypeError):
            bool_like(boolean, "boolean", strict=True)
        return

    strict_value = bool_like(boolean, "boolean", strict=True)
    assert isinstance(strict_value, bool)
def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0):
    """
    Calculate the Anderson-Darling a2 statistic.

    Parameters
    ----------
    x : array_like
        The data to test.
    dist : {'norm', callable}
        The assumed distribution under the null of test statistic. A
        non-string ``dist`` must pass ``callable`` and provide ``cdf``
        (and ``fit`` when ``fit`` is True).
    fit : bool
        If True, then the distribution parameters are estimated.
        Currently only for 1d data x, except in case dist='norm'.
    params : tuple
        The optional distribution parameters if fit is False.
    axis : int
        If dist is 'norm' or fit is False, then data can be an
        n-dimensional and axis specifies the axis of a variable.

    Returns
    -------
    {float, ndarray}
        The Anderson-Darling statistic.

    Raises
    ------
    ValueError
        If ``dist`` is neither 'norm' nor callable when ``fit`` is True,
        or is not callable when ``fit`` is False.
    """
    x = array_like(x, 'x', ndim=None)
    fit = bool_like(fit, 'fit')
    axis = int_like(axis, 'axis')
    y = np.sort(x, axis=axis)
    nobs = y.shape[axis]
    if fit:
        if dist == 'norm':
            # Standardize with the sample mean and (ddof=1) std along axis.
            xbar = np.expand_dims(np.mean(x, axis=axis), axis)
            s = np.expand_dims(np.std(x, ddof=1, axis=axis), axis)
            w = (y - xbar) / s
            z = stats.norm.cdf(w)
        elif callable(dist):
            params = dist.fit(x)
            z = dist.cdf(y, *params)
        else:
            raise ValueError("dist must be 'norm' or a Callable")
    else:
        if callable(dist):
            z = dist.cdf(y, *params)
        else:
            raise ValueError('if fit is false, then dist must be callable')

    i = np.arange(1, nobs + 1)
    # Indexer that broadcasts ``i`` along ``axis`` only.
    sl1 = [None] * x.ndim
    sl1[axis] = slice(None)
    sl1 = tuple(sl1)
    # Indexer that reverses ``z`` along ``axis``.
    sl2 = [slice(None)] * x.ndim
    sl2[axis] = slice(None, None, -1)
    sl2 = tuple(sl2)
    s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])),
               axis=axis)
    a2 = -nobs - s
    return a2
def __init__(
    self,
    endog,
    *,
    period: Optional[int] = None,
    deseasonalize: bool = True,
    use_test: bool = True,
    method: str = "auto",
    difference: bool = False
) -> None:
    """
    Validate and store the model configuration.

    Parameters
    ----------
    endog : array_like
        The data, converted to a 1-d array. The original object is kept
        as ``endog_orig`` (the first column when a DataFrame is passed).
    period : int, optional
        The seasonal period. When omitted and ``deseasonalize`` is True,
        it is derived from the ``freq`` (or, failing that, the
        ``inferred_freq``) of ``endog.index`` using ``freq_to_period``.
    deseasonalize : bool
        Stored flag; also gates ``use_test`` and period inference.
    use_test : bool
        Stored flag; only effective when ``deseasonalize`` is True.
    method : str
        One of "auto", "additive", "multiplicative", "mul" or "add".
    difference : bool
        Stored flag.

    Raises
    ------
    ValueError
        If a period is needed but neither given nor inferable from the
        index of ``endog``.
    """
    self._y = array_like(endog, "endog", ndim=1)
    if isinstance(endog, pd.DataFrame):
        self.endog_orig = endog.iloc[:, 0]
    else:
        self.endog_orig = endog
    self._period = int_like(period, "period", optional=True)
    self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
    self._use_test = (
        bool_like(use_test, "use_test") and self._deseasonalize
    )
    self._diff = bool_like(difference, "difference")
    # Validate under the real argument name so error messages point the
    # caller at ``method``.
    self._method = string_like(
        method,
        "method",
        options=("auto", "additive", "multiplicative", "mul", "add"),
    )
    if self._period is None and self._deseasonalize:
        idx = getattr(endog, "index", None)
        pfreq = None
        if idx is not None:
            pfreq = getattr(idx, "freq", None)
            if pfreq is None:
                pfreq = getattr(idx, "inferred_freq", None)
        if pfreq is not None:
            self._period = freq_to_period(pfreq)
        else:
            raise ValueError(
                "You must specify a period or endog must be a "
                "pandas object with a DatetimeIndex with "
                "a freq not set to None"
            )

    self._has_seasonality = self._deseasonalize
def __init__(self, endog, trend=None, damped=False, seasonal=None,
             seasonal_periods=None, dates=None, freq=None, missing='none'):
    """
    Validate the component specification and store the data.

    Parameters
    ----------
    endog : array_like
        The time series. Must be strictly positive when a multiplicative
        trend or seasonal component is requested.
    trend : {"add", "mul", "additive", "multiplicative", None}
        Type of trend component. The long names are mapped to "add" and
        "mul".
    damped : bool
        Whether the trend is damped. Requires a trend component.
    seasonal : {"add", "mul", "additive", "multiplicative", None}
        Type of seasonal component. The long names are mapped to "add"
        and "mul".
    seasonal_periods : int, optional
        Number of periods in a season. When omitted and a seasonal
        component is requested, it is derived from the index frequency
        using ``freq_to_period``. Set to 0 without a seasonal component.
    dates, freq, missing
        Forwarded unchanged to the parent class constructor.

    Raises
    ------
    ValueError
        If the data are not strictly positive with a multiplicative
        component, if ``damped`` is set without a trend, or if the
        seasonal period is not larger than 1.
    """
    super(ExponentialSmoothing, self).__init__(endog, None, dates,
                                               freq, missing=missing)
    # NOTE(review): looks like a no-op self-assignment; kept in case
    # ``endog`` is a property with side effects in the parent class.
    self.endog = self.endog
    self._y = self._data = array_like(endog, 'endog',
                                      contiguous=True, order='C')
    options = ("add", "mul", "additive", "multiplicative")
    trend = string_like(trend, 'trend', options=options, optional=True)
    if trend in ['additive', 'multiplicative']:
        trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
    self.trend = trend
    self.damped = bool_like(damped, 'damped')
    seasonal = string_like(seasonal, 'seasonal', options=options,
                           optional=True)
    if seasonal in ['additive', 'multiplicative']:
        seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
    self.seasonal = seasonal
    self.trending = trend in ['mul', 'add']
    self.seasoning = seasonal in ['mul', 'add']
    if (self.trend == 'mul' or self.seasonal == 'mul') and \
            not np.all(self._data > 0.0):
        raise ValueError('endog must be strictly positive when using '
                         'multiplicative trend or seasonal components.')
    if self.damped and not self.trending:
        raise ValueError('Can only dampen the trend component')
    if self.seasoning:
        self.seasonal_periods = int_like(seasonal_periods,
                                         'seasonal_periods', optional=True)
        if seasonal_periods is None:
            self.seasonal_periods = freq_to_period(self._index_freq)
        if self.seasonal_periods <= 1:
            raise ValueError('seasonal_periods must be larger than 1.')
    else:
        self.seasonal_periods = 0
    self.nobs = len(self.endog)
def __init__(
    self,
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    stats: Sequence[str] = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
):
    """
    Validate the inputs and select the columns and statistics to describe.

    Parameters
    ----------
    data : {ndarray, Series, DataFrame}
        The data. Non-pandas input is converted with ``array_like`` (at
        most 2-d); 1-d arrays are wrapped in a Series.
    stats : Sequence[str], optional
        Statistics to compute. Each must be in ``DEFAULT_STATISTICS``;
        all default statistics are used when omitted.
    numeric : bool
        Whether to include numeric columns.
    categorical : bool
        Whether to include categorical columns.
    alpha : float
        Must be strictly between 0 and 1.
    use_t : bool
        Stored flag.
    percentiles : Sequence[{int, float}]
        Distinct values strictly between 0 and 100.
    ntop : int
        Number of "top"/"freq" entries to expand into ``top_i`` and
        ``freq_i`` statistics.

    Raises
    ------
    ValueError
        If neither column type is selected, the selection is empty, an
        unknown statistic is requested, or ``ntop``, ``percentiles`` or
        ``alpha`` are out of range.
    """
    data_arr = data
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data_arr = array_like(data, "data", maxdim=2)
    if data_arr.ndim == 1:
        data = pd.Series(data)
    numeric = bool_like(numeric, "numeric")
    categorical = bool_like(categorical, "categorical")
    include = []
    col_types = ""
    if numeric:
        include.append(np.number)
        col_types = "numeric"
    if categorical:
        include.append("category")
        # Separate the two type names so the message reads
        # "numeric and categorical".
        col_types += " and " if col_types != "" else ""
        col_types += "categorical"
    if not numeric and not categorical:
        raise ValueError(
            "At least one of numeric and categorical must be True"
        )
    self._data = pd.DataFrame(data).select_dtypes(include)
    if self._data.shape[1] == 0:
        raise ValueError(
            f"Selecting {col_types} results in an empty DataFrame"
        )
    self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
    self._is_cat_like = [
        is_categorical_dtype(dt) for dt in self._data.dtypes
    ]

    if stats is not None:
        undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
        if undef:
            raise ValueError(
                f"{', '.join(undef)} are not known statistics"
            )
    self._stats = (
        list(DEFAULT_STATISTICS) if stats is None else list(stats)
    )
    self._ntop = int_like(ntop, "ntop")
    self._compute_top = "top" in self._stats
    self._compute_freq = "freq" in self._stats
    # Only enforce ntop when "top" is requested and at least one
    # categorical column is present.
    if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
        raise ValueError("top must be a non-negative integer")
    self._compute_perc = "percentiles" in self._stats
    self._percentiles = array_like(
        percentiles, "percentiles", maxdim=1, dtype="d"
    )
    self._percentiles = np.sort(self._percentiles)
    if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
        raise ValueError("percentiles must be distinct")
    if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
        raise ValueError("percentiles must be strictly between 0 and 100")

    # Expand special stats
    replacements = {
        "mode": ["mode", "mode_freq"],
        "ci": ["upper_ci", "lower_ci"],
        "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
        "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
        "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
        "percentiles": [f"{i}%" for i in percentiles],
    }

    for key in replacements:
        if key in self._stats:
            idx = self._stats.index(key)
            self._stats = (
                self._stats[:idx]
                + replacements[key]
                + self._stats[idx + 1 :]
            )

    self._alpha = float_like(alpha, "alpha")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    self._use_t = bool_like(use_t, "use_t")
def test_not_bool_like():
    """Check that bool_like rejects an array input with a TypeError."""
    with pytest.raises(TypeError):
        # The second argument is the argument *name*; pass it as a string
        # so the test exercises the array rejection and nothing else.
        bool_like(np.array([True, True]), "boolean")
def add_trend(x, trend="c", prepend=False, has_constant="skip"):
    """
    Add a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        The trend to add.

        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend includes a constant and a
        non-zero constant column already exists in x. 'raise' will raise
        an error. 'add' will add a column of 1s. 'skip' omits the new
        constant column (other requested trend columns are still added).
        'skip' is the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        pandas Series or DataFrame, then the trend column names are
        'const', 'trend' and 'trend_squared'.

    Raises
    ------
    NotImplementedError
        If x is a recarray.
    ValueError
        If x already contains a constant and has_constant is 'raise'.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is
    currently no checking for an existing trend.
    """
    prepend = bool_like(prepend, "prepend")
    trend = string_like(trend, "trend", options=("n", "c", "t", "ct", "ctt"))
    has_constant = string_like(has_constant, "has_constant",
                               options=("raise", "add", "skip"))

    # TODO: could be generalized for trend of aribitrary order
    columns = ["const", "trend", "trend_squared"]
    if trend == "n":
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    if _is_recarray(x):
        from statsmodels.tools.sm_exceptions import recarray_exception
        raise NotImplementedError(recarray_exception)

    is_pandas = _is_using_pandas(x, None)
    if is_pandas:
        if isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    # Columns that cannot be compared numerically are
                    # treated as non-constant (best effort).
                    return False

            col_const = x.apply(safe_is_const, axis=0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == "raise":
                if x.ndim == 1:
                    base_err = "x is constant."
                else:
                    # Report only the columns that are constant.
                    const_mask = np.asarray(col_const, dtype=bool)
                    if isinstance(x, pd.DataFrame):
                        const_labels = x.columns[const_mask]
                    else:
                        const_labels = np.arange(x.shape[1])[const_mask]
                    const_cols = ", ".join([str(c) for c in const_labels])
                    base_err = (
                        "x contains one or more constant columns. Column(s) "
                        f"{const_cols} are constant.")
                msg = (f"{base_err} Adding a constant with trend='{trend}' "
                       "is not allowed.")
                raise ValueError(msg)
            elif has_constant == "skip":
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    return x
def lagmat(
    x,
    maxlag: int,
    trim: Literal["forward", "backward", "both", "none"] = 'forward',
    original: Literal["ex", "sep", "in"] = "ex",
    use_pandas: bool = False
) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame, DataFrame]:
    """
    Build a 2d array holding the data and its lags side by side.

    Parameters
    ----------
    x : array_like
        Data; if 2d, observations are in rows and variables in columns.
    maxlag : int
        Largest lag; lags 0 (the data itself) through maxlag are built.
        Must be smaller than the number of observations.
    trim : {'forward', 'backward', 'both', 'none', None}
        Which rows of the zero-padded (nobs + maxlag)-row lag matrix to
        keep.

        * 'forward' : keep the first nobs rows, dropping the trailing
          maxlag rows.
        * 'backward' : drop the leading maxlag rows.
        * 'both' : drop the leading maxlag rows and the trailing maxlag
          rows.
        * 'none', None : keep every row.
    original : {'ex','sep','in'}
        What to do with the lag-0 (original) columns.

        * 'ex' : exclude them, returning only the lagged values.
        * 'in' : keep them in front of the lagged values.
        * 'sep' : return them separately, as the second element of a
          tuple whose first element is the lagged values.
    use_pandas : bool
        If True and the input is a pandas Series or DataFrame, return
        DataFrames; otherwise return ndarrays.

    Returns
    -------
    lagmat : {ndarray, DataFrame}
        The lagged observations.
    y : {ndarray, DataFrame}, optional
        The original columns, trimmed like ``lagmat``. Only returned if
        original == 'sep'.

    Notes
    -----
    With pandas output, trim may only be 'forward' or 'both' since the
    index cannot be extended consistently.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])
    """
    maxlag = int_like(maxlag, "maxlag")
    use_pandas = bool_like(use_pandas, "use_pandas")
    trim = string_like(
        trim,
        "trim",
        optional=True,
        options=("forward", "backward", "both", "none"),
    )
    original = string_like(original, "original", options=("ex", "sep", "in"))

    # TODO: allow list of lags additional to maxlag
    source = x
    x = array_like(x, "x", ndim=2, dtype=None)
    is_pandas = _is_using_pandas(source, None) and use_pandas
    trim = ("none" if trim is None else trim).lower()
    if is_pandas and trim in ("none", "backward"):
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    nobs, nvar = x.shape
    # Number of leading (lag-0) columns to split off or discard.
    dropidx = nvar if original in ["ex", "sep"] else 0
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")

    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    # Column block ``lag`` holds the data shifted down by ``lag`` rows.
    for lag in range(int(maxlag + 1)):
        lm[lag:nobs + lag, nvar * lag:nvar * (lag + 1)] = x

    row_bounds = {
        "none": (0, len(lm)),
        "forward": (0, nobs),
        "backward": (maxlag, len(lm)),
        "both": (maxlag, nobs),
    }
    if trim not in row_bounds:
        raise ValueError("trim option not valid")
    startobs, stopobs = row_bounds[trim]

    if is_pandas:
        x = source
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + ".L." + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ("sep", "ex"):
            leads = lags[x_columns]
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == "sep":
            leads = lm[startobs:stopobs, :dropidx]

    if original != "sep":
        return lags
    return lags, leads
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Return a copy of an array with lags of one column inserted.

    Parameters
    ----------
    x : array_like
        The data, with variables in columns.
    col : int or None
        Zero-based index of the column to lag. Negative values count
        from the last column. None selects column 0.
    lags : int
        The number of lags to create.
    drop : bool
        If True, the contemporaneous column ``col`` is removed from the
        output.
    insert : bool or int
        If True, the lags are inserted right after ``col``. If False,
        they are appended after the last column. If an int, they are
        inserted at that column position; negative values count from the
        end, and values past the last column are clipped with a warning.

    Returns
    -------
    array : ndarray
        The data with the lagged columns included.

    Notes
    -----
    The first ``lags`` rows are dropped, so the result has
    len(`x`) - lags rows. The lag columns come from ``lagmat`` called
    with trim="Both".

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = np.arange(20.0).reshape(10, 2)
    >>> data = sm.tsa.add_lag(data, 1, lags=2)
    """
    lags = int_like(lags, "lags")
    drop = bool_like(drop, "drop")
    x = array_like(x, "x", ndim=2)
    if col is None:
        col = 0

    # handle negative index
    if col < 0:
        col = x.shape[1] + col
    if x.ndim == 1:
        x = x[:, None]
    n_cols = x.shape[1]
    contemp = x[:, col]

    if insert is True:
        position = col + 1
    elif insert is False:
        position = n_cols
    else:
        position = insert
        if position < 0:  # handle negative index
            position = n_cols + position + 1
        if position > n_cols:
            position = n_cols

            warnings.warn(
                "insert > number of variables, inserting at the"
                " last position",
                ValueWarning,
            )

    ndlags = lagmat(contemp, lags, trim="Both")
    leading = lrange(position)
    trailing = lrange(position, n_cols)
    if drop:
        # Remove the contemporaneous column from whichever side holds it.
        holder = leading if col in leading else trailing
        holder.remove(col)
    left_part = x[lags:, leading]
    right_part = x[lags:, trailing]
    return np.column_stack((left_part, ndlags, right_part))
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False):
    """
    Create 2d array of lags

    Parameters
    ----------
    x : array_like, 1d or 2d
        data; if 2d, observation in rows and variables in columns
    maxlag : int
        all lags from zero to maxlag are included
    trim : str {'forward', 'backward', 'both', 'none'} or None
        Which rows of the zero-padded (nobs + maxlag)-row lag matrix to
        keep.

        * 'forward' : keep the first nobs rows, dropping the trailing
          maxlag rows
        * 'backward' : drop the leading maxlag rows
        * 'both' : drop the leading and the trailing maxlag rows
        * 'none', None : no trimming of observations
    original : str {'ex','sep','in'}
        * 'ex' : drops the original array returning only the lagged values.
        * 'in' : returns the original array and the lagged values as a single
          array.
        * 'sep' : returns a tuple (lagged values, original array). The
          original array is truncated to have the same number of rows as
          the returned lagmat.
    use_pandas : bool, optional
        If true, returns a DataFrame when the input is a pandas
        Series or DataFrame.  If false, return numpy ndarrays.

    Returns
    -------
    lagmat : 2d array
        array with lagged observations
    y : 2d array, optional
        Only returned if original == 'sep'

    Raises
    ------
    ValueError
        If maxlag is not smaller than the number of observations, or if
        trim is 'none' or 'backward' while returning pandas objects.

    Examples
    --------
    >>> from statsmodels.tsa.tsatools import lagmat
    >>> import numpy as np
    >>> X = np.arange(1,7).reshape(-1,2)
    >>> lagmat(X, maxlag=2, trim="forward", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="backward", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    >>> lagmat(X, maxlag=2, trim="both", original='in')
    array([[ 5.,  6.,  3.,  4.,  1.,  2.]])

    >>> lagmat(X, maxlag=2, trim="none", original='in')
    array([[ 1.,  2.,  0.,  0.,  0.,  0.],
       [ 3.,  4.,  1.,  2.,  0.,  0.],
       [ 5.,  6.,  3.,  4.,  1.,  2.],
       [ 0.,  0.,  5.,  6.,  3.,  4.],
       [ 0.,  0.,  0.,  0.,  5.,  6.]])

    Notes
    -----
    When using a pandas DataFrame or Series with use_pandas=True, trim can only
    be 'forward' or 'both' since it is not possible to consistently extend
    index values.
    """
    maxlag = int_like(maxlag, 'maxlag')
    use_pandas = bool_like(use_pandas, 'use_pandas')
    trim = string_like(trim, 'trim', optional=True,
                       options=('forward', 'backward', 'both', 'none'))
    original = string_like(original, 'original', options=('ex', 'sep', 'in'))

    # TODO: allow list of lags additional to maxlag
    orig = x
    x = array_like(x, 'x', ndim=2, dtype=None)
    is_pandas = _is_using_pandas(orig, None) and use_pandas
    trim = 'none' if trim is None else trim
    trim = trim.lower()
    if is_pandas and trim in ('none', 'backward'):
        # The message now names the options that are actually rejected.
        raise ValueError("trim cannot be 'none' or 'backward' when used on "
                         "Series or DataFrames")

    dropidx = 0
    nobs, nvar = x.shape
    if original in ['ex', 'sep']:
        dropidx = nvar
    if maxlag >= nobs:
        raise ValueError("maxlag should be < nobs")
    lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1)))
    for k in range(0, int(maxlag + 1)):
        lm[maxlag - k:nobs + maxlag - k,
           nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x

    if trim in ('none', 'forward'):
        startobs = 0
    elif trim in ('backward', 'both'):
        startobs = maxlag
    else:
        raise ValueError('trim option not valid')

    if trim in ('none', 'backward'):
        stopobs = len(lm)
    else:
        stopobs = nobs

    if is_pandas:
        x = orig
        x_columns = x.columns if isinstance(x, DataFrame) else [x.name]
        columns = [str(col) for col in x_columns]
        for lag in range(maxlag):
            lag_str = str(lag + 1)
            columns.extend([str(col) + '.L.' + lag_str for col in x_columns])
        lm = DataFrame(lm[:stopobs], index=x.index, columns=columns)
        lags = lm.iloc[startobs:]
        if original in ('sep', 'ex'):
            leads = lags[x_columns]
            # ``axis`` must be passed by keyword on current pandas.
            lags = lags.drop(x_columns, axis=1)
    else:
        lags = lm[startobs:stopobs, dropidx:]
        if original == 'sep':
            leads = lm[startobs:stopobs, :dropidx]

    if original == 'sep':
        return lags, leads
    else:
        return lags
def add_trend(x, trend="c", prepend=False, has_constant='skip'):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    x : array_like
        Original array of data.
    trend : str {'n', 'c', 't', 'ct', 'ctt'}
        * 'n' add no trend.
        * 'c' add constant only.
        * 't' add trend only.
        * 'ct' add constant and linear trend.
        * 'ctt' add constant and linear and quadratic trend.
    prepend : bool
        If True, prepends the new data to the columns of X.
    has_constant : str {'raise', 'add', 'skip'}
        Controls what happens when trend includes a constant and a
        non-zero constant column already exists in x. 'raise' will raise
        an error. 'add' will duplicate a constant. 'skip' omits the new
        constant column (other requested trend columns are still added).
        'skip' is the default.

    Returns
    -------
    array_like
        The original data with the additional trend columns.  If x is a
        recarray or pandas Series or DataFrame, then the trend column names
        are 'const', 'trend' and 'trend_squared'.

    Raises
    ------
    ValueError
        If x already contains a constant and has_constant is 'raise'.

    Notes
    -----
    Returns columns as ['ctt','ct','c'] whenever applicable. There is currently
    no checking for an existing trend.

    See Also
    --------
    statsmodels.tools.tools.add_constant
        Add a constant column to an array.
    """
    prepend = bool_like(prepend, 'prepend')
    trend = string_like(trend, 'trend', options=('n', 'c', 't', 'ct', 'ctt'))
    has_constant = string_like(has_constant, 'has_constant',
                               options=('raise', 'add', 'skip'))

    # TODO: could be generalized for trend of aribitrary order
    columns = ['const', 'trend', 'trend_squared']
    if trend == 'n':
        return x.copy()
    elif trend == "c":  # handles structured arrays
        columns = columns[:1]
        trendorder = 0
    elif trend == "ct" or trend == "t":
        columns = columns[:2]
        if trend == "t":
            columns = columns[1:2]
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2

    is_recarray = _is_recarray(x)
    # Recarrays are routed through the pandas code path and converted
    # back at the end.
    is_pandas = _is_using_pandas(x, None) or is_recarray
    if is_pandas:
        if is_recarray:
            # Remember the original field layout so it can be restored.
            descr = x.dtype.descr
            x = pd.DataFrame.from_records(x)
        elif isinstance(x, pd.Series):
            x = pd.DataFrame(x)
        else:
            x = x.copy()
    else:
        x = np.asanyarray(x)

    nobs = len(x)
    trendarr = np.vander(np.arange(1, nobs + 1, dtype=np.float64),
                         trendorder + 1)
    # put in order ctt
    trendarr = np.fliplr(trendarr)
    if trend == "t":
        trendarr = trendarr[:, 1]

    if "c" in trend:
        if is_pandas:
            # Mixed type protection
            def safe_is_const(s):
                try:
                    return np.ptp(s) == 0.0 and np.any(s != 0.0)
                except Exception:
                    # Columns that cannot be compared numerically are
                    # treated as non-constant (best effort).
                    return False

            col_const = x.apply(safe_is_const, axis=0)
        else:
            ptp0 = np.ptp(np.asanyarray(x), axis=0)
            col_is_const = ptp0 == 0
            nz_const = col_is_const & (x[0] != 0)
            col_const = nz_const

        if np.any(col_const):
            if has_constant == 'raise':
                msg = "x contains a constant. Adding a constant with " \
                      "trend='{0}' is not allowed.".format(trend)
                raise ValueError(msg)
            elif has_constant == 'skip':
                columns = columns[1:]
                trendarr = trendarr[:, 1:]

    order = 1 if prepend else -1
    if is_pandas:
        trendarr = pd.DataFrame(trendarr, index=x.index, columns=columns)
        x = [trendarr, x]
        # ``axis`` must be passed by keyword on current pandas.
        x = pd.concat(x[::order], axis=1)
    else:
        x = [trendarr, x]
        x = np.column_stack(x[::order])

    if is_recarray:
        x = x.to_records(index=False)
        new_descr = x.dtype.descr
        extra_col = len(new_descr) - len(descr)
        # Splice the new trend fields onto the original field description.
        if prepend:
            descr = new_descr[:extra_col] + descr
        else:
            descr = descr + new_descr[-extra_col:]

        x = x.astype(np.dtype(descr))

    return x
def add_lag(x, col=None, lags=1, drop=False, insert=True):
    """
    Returns an array with lags included given an array.

    Structured/record arrays (those with ``x.dtype.names``) and plain
    ndarrays are handled by two separate branches.

    Parameters
    ----------
    x : array
        An array or NumPy ndarray subclass. Can be either a 1d or 2d array
        with variables in columns.
    col : 'string', int, or None
        If data is a structured array or a recarray, `col` can be a string
        that is the name of the column containing the variable. Or `col` can
        be an int of the zero-based column index. If it's a 1d array `col`
        can be None.
    lags : int
        The number of lags desired.
    drop : bool
        If True, the contemporaneous variable `col` is removed from the
        returned data.
    insert : bool or int
        If True, inserts the lagged values after `col`. If False, appends
        the data. If int inserts the lags at int.

    Returns
    -------
    array : ndarray
        Array with lags

    Examples
    --------

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.macrodata.load(as_pandas=False)
    >>> data = data.data[['year','quarter','realgdp','cpi']]
    >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2)

    Notes
    -----
    The first `lags` rows are dropped, so the length of the returned array
    is len(`X`) - lags. The lags are returned in increasing order, ie.,
    t-1,t-2,...,t-lags
    """
    lags = int_like(lags, 'lags')
    drop = bool_like(drop, 'drop')

    if x.dtype.names:
        # Structured array: columns are addressed by field name.
        names = x.dtype.names
        if not col and np.squeeze(x).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        elif len(names) == 1:
            col = names[0]
        if isinstance(col, int):
            # Translate a positional index into the field name.
            col = x.dtype.names[col]

        contemp = x[col]

        # make names for lags
        tmp_names = [col + '_' + 'L(%i)' % i for i in range(1, lags + 1)]
        ndlags = lagmat(contemp, maxlag=lags, trim='Both')

        # get index for return
        if insert is True:
            ins_idx = list(names).index(col) + 1
        elif insert is False:
            ins_idx = len(names) + 1
        else:  # insert is an int
            if insert > len(names):
                import warnings

                warnings.warn(
                    "insert > number of variables, inserting at the"
                    " last position", ValueWarning)
            ins_idx = insert

        # Fields that end up before / after the inserted lag columns.
        first_names = list(names[:ins_idx])
        last_names = list(names[ins_idx:])

        if drop:
            if col in first_names:
                first_names.pop(first_names.index(col))
            else:
                last_names.pop(last_names.index(col))

        if first_names:  # only do this if x is not "empty"
            # Workaround to avoid NumPy FutureWarning
            _x = recarray_select(x, first_names)
            first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T,
                                           usemask=False)

        else:
            # No leading fields: build a fresh structured array that holds
            # only the lag columns, typed like the lagged field.
            first_arr = np.zeros(len(x) - lags,
                                 dtype=lzip(tmp_names, (x[col].dtype, ) * lags))
            for i, name in enumerate(tmp_names):
                first_arr[name] = ndlags[:, i]
        if last_names:
            return nprf.append_fields(first_arr, last_names,
                                      [x[name][lags:] for name in last_names],
                                      usemask=False)
        else:  # lags for last variable
            return first_arr

    else:  # we have an ndarray

        if x.ndim == 1:  # make 2d if 1d
            x = x[:, None]
        if col is None:
            col = 0

        # handle negative index
        if col < 0:
            col = x.shape[1] + col

        contemp = x[:, col]

        if insert is True:
            ins_idx = col + 1
        elif insert is False:
            ins_idx = x.shape[1]
        else:
            if insert < 0:  # handle negative index
                insert = x.shape[1] + insert + 1
            if insert > x.shape[1]:
                # Clip to the last position and tell the caller.
                insert = x.shape[1]
                import warnings

                warnings.warn(
                    "insert > number of variables, inserting at the"
                    " last position", ValueWarning)
            ins_idx = insert

        ndlags = lagmat(contemp, lags, trim='Both')
        # Column indices placed before / after the inserted lag columns.
        first_cols = lrange(ins_idx)
        last_cols = lrange(ins_idx, x.shape[1])
        if drop:
            if col in first_cols:
                first_cols.pop(first_cols.index(col))
            else:
                last_cols.pop(last_cols.index(col))
        return np.column_stack((x[lags:, first_cols], ndlags,
                                x[lags:, last_cols]))
def __init__(self, data, ncomp=None, standardize=True, demean=True,
             normalize=True, gls=False, weights=None, method='svd',
             missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
             max_em_iter=100):
    """
    Validate the inputs, run the decomposition and store the results.

    Parameters
    ----------
    data : array_like
        2-d data. When a DataFrame is passed its index and columns are
        remembered and the results are converted back to pandas.
    ncomp : int, optional
        Number of components. Defaults to, and is capped (with a
        warning) at, the smaller of the number of rows and columns.
    standardize, demean, normalize : bool
        Stored data-preparation flags.
    gls : bool
        If True, GLS weights are computed after a first pass and the
        decomposition is repeated on the re-prepared data.
    weights : array_like, optional
        One weight per column, rescaled to have unit mean square.
        Defaults to ones.
    method : {'svd', 'eig', 'nipals'}
        The decomposition method.
    missing : str, optional
        Option controlling the missing-value adjustment.
    tol : float
        Must be strictly between 0 and 1.
    max_iter : int
        Stored iteration limit.
    tol_em : float
        Stored tolerance.
    max_em_iter : int
        Stored iteration limit.

    Raises
    ------
    ValueError
        If ``tol`` is out of range, ``weights`` has the wrong length,
        ``method`` is unknown, or a user-supplied ``ncomp`` exceeds the
        smallest dimension of the missing-value-adjusted data.
    """
    self._index = None
    self._columns = []
    if isinstance(data, pd.DataFrame):
        self._index = data.index
        self._columns = data.columns

    self.data = array_like(data, "data", ndim=2)
    # Store inputs
    self._gls = bool_like(gls, "gls")
    self._normalize = bool_like(normalize, "normalize")
    self._tol = float_like(tol, "tol")
    if not 0 < self._tol < 1:
        raise ValueError('tol must be strictly between 0 and 1')
    # Validate under the real argument name so error messages point the
    # caller at ``max_iter``.
    self._max_iter = int_like(max_iter, "max_iter")
    self._max_em_iter = int_like(max_em_iter, "max_em_iter")
    self._tol_em = float_like(tol_em, "tol_em")

    # Prepare data
    self._standardize = bool_like(standardize, "standardize")
    self._demean = bool_like(demean, "demean")

    self._nobs, self._nvar = self.data.shape
    weights = array_like(weights, "weights", maxdim=1, optional=True)
    if weights is None:
        weights = np.ones(self._nvar)
    else:
        weights = np.array(weights).flatten()
        if weights.shape[0] != self._nvar:
            raise ValueError('weights should have nvar elements')
        weights = weights / np.sqrt((weights ** 2.0).mean())
    self.weights = weights

    # Check ncomp against maximum
    min_dim = min(self._nobs, self._nvar)
    self._ncomp = min_dim if ncomp is None else ncomp
    if self._ncomp > min_dim:
        import warnings

        warn = 'The requested number of components is more than can be ' \
               'computed from data. The maximum number of components is ' \
               'the minimum of the number of observations or variables'
        warnings.warn(warn, ValueWarning)
        self._ncomp = min_dim

    self._method = method
    # Workaround to avoid instance methods in __dict__
    if self._method not in ('eig', 'svd', 'nipals'):
        raise ValueError('method {0} is not known.'.format(method))

    self.rows = np.arange(self._nobs)
    self.cols = np.arange(self._nvar)

    # Handle missing
    self._missing = string_like(missing, "missing", optional=True)
    self._adjusted_data = self.data
    self._adjust_missing()

    # Update size
    self._nobs, self._nvar = self._adjusted_data.shape
    if self._ncomp == np.min(self.data.shape):
        # ncomp was at the maximum: track the adjusted data size.
        self._ncomp = np.min(self._adjusted_data.shape)
    elif self._ncomp > np.min(self._adjusted_data.shape):
        raise ValueError('When adjusting for missing values, user '
                         'provided ncomp must be no larger than the '
                         'smallest dimension of the '
                         'missing-value-adjusted data size.')

    # Attributes and internal values
    self._tss = 0.0
    self._ess = None
    self.transformed_data = None
    self._mu = None
    self._sigma = None
    self._ess_indiv = None
    self._tss_indiv = None
    self.scores = self.factors = None
    self.loadings = None
    self.coeff = None
    self.eigenvals = None
    self.eigenvecs = None
    self.projection = None
    self.rsquare = None
    self.ic = None

    # Prepare data
    self.transformed_data = self._prepare_data()
    # Perform the PCA
    self._pca()
    if gls:
        self._compute_gls_weights()
        self.transformed_data = self._prepare_data()
        self._pca()

    # Final calculations
    self._compute_rsquare_and_ic()
    if self._index is not None:
        self._to_pandas()
def __init__(self, endog, trend=False, damped_trend=False, seasonal=None,
             initialization_method='estimated', initial_level=None,
             initial_trend=None, initial_seasonal=None, bounds=None,
             concentrate_scale=True, dates=None, freq=None,
             missing='none'):
    """
    Define the model, validate the arguments and build the state space form.

    Parameters
    ----------
    endog : array_like
        Observed series; forwarded to the parent constructor.
    trend : bool
        Whether a trend component is included.
    damped_trend : bool
        Whether the trend is damped. With a damped trend the initial state
        is not fixed here, even for the non-estimated methods.
    seasonal : int, optional
        Number of seasonal periods. ``None`` (default) means no seasonal
        component. Must be at least 2 when given.
    initialization_method : str
        One of 'concentrated', 'estimated', 'simple', 'heuristic' or
        'known' (case-insensitive).
    initial_level : float, optional
        Required when the method is 'known'; not allowed otherwise.
    initial_trend : float, optional
        Required when the method is 'known' and ``trend`` is true; not
        allowed for any other method.
    initial_seasonal : array_like, optional
        Required when the method is 'known' and the model is seasonal; not
        allowed for any other method. Must have ``s`` or ``s - 1``
        elements, where ``s`` is the number of seasonal periods. With
        ``s - 1`` elements, a final element is appended so that the
        values sum to zero.
    bounds : list of tuple, optional
        Parameter bounds. Defaults to ``[(1e-4, 1 - 1e-4)] * 3 +
        [(0.8, 0.98)]``. Not validated.
    concentrate_scale : bool
        When true, ``filter_concentrated`` is enabled on the state space
        representation.
    dates, freq, missing
        Forwarded unchanged to the parent constructor.

    Raises
    ------
    ValueError
        If the number of seasonal periods is less than 2, the
        initialization method is unknown, required initial values are
        missing for 'known', initial values are given for another method,
        or ``initial_seasonal`` has the wrong length.
    """
    # Model definition
    self.trend = bool_like(trend, 'trend')
    self.damped_trend = bool_like(damped_trend, 'damped_trend')
    self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True)
    self.seasonal = self.seasonal_periods is not None
    self.initialization_method = string_like(
        initialization_method, 'initialization_method').lower()
    self.concentrate_scale = bool_like(concentrate_scale,
                                       'concentrate_scale')

    # TODO: add validation for bounds (e.g. have all bounds, upper > lower)
    # TODO: add `bounds_method` argument to choose between "usual" and
    # "admissible" as in Hyndman et al. (2008)
    self.bounds = bounds
    if self.bounds is None:
        self.bounds = [(1e-4, 1-1e-4)] * 3 + [(0.8, 0.98)]

    # Validation
    if self.seasonal_periods == 1:
        raise ValueError('Cannot have a seasonal period of 1.')
    if self.seasonal and self.seasonal_periods < 1:
        # Fail early: a non-positive value would otherwise only surface
        # later as an opaque error when the seasonal transition block
        # (an identity of size `seasonal_periods - 1`) is built.
        raise ValueError('The number of seasonal periods must be an'
                         ' integer greater than 1.')

    if self.initialization_method not in ['concentrated', 'estimated',
                                          'simple', 'heuristic', 'known']:
        raise ValueError('Invalid initialization method "%s".'
                         % initialization_method)

    if self.initialization_method == 'known':
        if initial_level is None:
            raise ValueError('`initial_level` argument must be provided'
                             ' when initialization method is set to'
                             ' "known".')
        if initial_trend is None and self.trend:
            raise ValueError('`initial_trend` argument must be provided'
                             ' for models with a trend component when'
                             ' initialization method is set to "known".')
        if initial_seasonal is None and self.seasonal:
            raise ValueError('`initial_seasonal` argument must be provided'
                             ' for models with a seasonal component when'
                             ' initialization method is set to "known".')

    # Initialize the state space model
    # Number of seasonal states (zero for a non-seasonal model)
    self._seasonal_periods = self.seasonal_periods if self.seasonal else 0

    # States: observation error, level, optional trend, seasonal factors
    k_states = 2 + int(self.trend) + self._seasonal_periods
    k_posdef = 1

    init = ss_init.Initialization(k_states, 'known',
                                  constant=[0] * k_states)
    super(ExponentialSmoothing, self).__init__(
        endog, k_states=k_states, k_posdef=k_posdef,
        initialization=init, dates=dates, freq=freq, missing=missing)

    # Concentrate the scale out of the likelihood function
    if self.concentrate_scale:
        self.ssm.filter_concentrated = True

    # Setup fixed elements of the system matrices
    # Observation error
    self.ssm['design', 0, 0] = 1.
    self.ssm['selection', 0, 0] = 1.
    self.ssm['state_cov', 0, 0] = 1.

    # Level
    self.ssm['design', 0, 1] = 1.
    self.ssm['transition', 1, 1] = 1.

    # Trend
    if self.trend:
        self.ssm['transition', 1:3, 2] = 1.

    # Seasonal
    if self.seasonal:
        k = 2 + int(self.trend)
        self.ssm['design', 0, k] = 1.
        self.ssm['transition', k, -1] = 1.
        self.ssm['transition', k + 1:k_states, k:k_states - 1] = (
            np.eye(self.seasonal_periods - 1))

    # Initialization of the states
    if self.initialization_method != 'known':
        msg = ('Cannot give `%%s` argument when initialization is "%s"'
               % initialization_method)
        if initial_level is not None:
            raise ValueError(msg % 'initial_level')
        if initial_trend is not None:
            raise ValueError(msg % 'initial_trend')
        if initial_seasonal is not None:
            raise ValueError(msg % 'initial_seasonal')

    if self.initialization_method == 'simple':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_simple(
                self.endog[:, 0], trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'heuristic':
        initial_level, initial_trend, initial_seasonal = (
            es_init._initialization_heuristic(
                self.endog[:, 0], trend='add' if self.trend else None,
                seasonal='add' if self.seasonal else None,
                seasonal_periods=self.seasonal_periods))
    elif self.initialization_method == 'known':
        initial_level = float_like(initial_level, 'initial_level')
        if self.trend:
            initial_trend = float_like(initial_trend, 'initial_trend')
        if self.seasonal:
            initial_seasonal = array_like(initial_seasonal,
                                          'initial_seasonal')

            if len(initial_seasonal) == self.seasonal_periods - 1:
                # Complete the vector so the seasonal values sum to zero
                initial_seasonal = np.r_[initial_seasonal,
                                         0 - np.sum(initial_seasonal)]

            if len(initial_seasonal) != self.seasonal_periods:
                raise ValueError(
                    'Invalid length of initial seasonal values. Must be'
                    ' one of s or s-1, where s is the number of seasonal'
                    ' periods.')

    # Note that the simple and heuristic methods of computing initial
    # seasonal factors return estimated seasonal factors associated with
    # the first t = 1, 2, ..., `n_seasons` observations. To use these as
    # the initial state, we lag them by `n_seasons`. This yields, for
    # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0.
    # As described above, the state vector in this model should have
    # seasonal factors ordered L0, L1, L2, L3, and as a result we need to
    # reverse the order of the computed initial seasonal factors from
    # these methods.
    methods = ['simple', 'heuristic']
    if (self.initialization_method in methods
            and initial_seasonal is not None):
        initial_seasonal = initial_seasonal[::-1]

    self._initial_level = initial_level
    self._initial_trend = initial_trend
    self._initial_seasonal = initial_seasonal
    self._initial_state = None

    # Initialize now if possible (if we have a damped trend, then
    # initialization will depend on the phi parameter, and so has to be
    # done at each `update`)
    methods = ['simple', 'heuristic', 'known']
    if not self.damped_trend and self.initialization_method in methods:
        self._initialize_constant_statespace(initial_level, initial_trend,
                                             initial_seasonal)

    # Save keys for kwarg initialization
    self._init_keys += ['trend', 'damped_trend', 'seasonal',
                        'initialization_method', 'initial_level',
                        'initial_trend', 'initial_seasonal', 'bounds',
                        'concentrate_scale', 'dates', 'freq', 'missing']
def simple_rur(self, x, store=False):
    """
    Compute the range unit-root (RUR) statistic with a table-based p-value.

    The statistic is the number of times the series sets a new running
    maximum or a new running minimum, divided by the square root of the
    number of observations.

    Parameters
    ----------
    x : array_like
        Series to test. Its size must equal the length of its first axis.
    store : bool
        If true, a results-store object is returned as a fourth element.

    Returns
    -------
    rur_stat : float
        The test statistic.
    p_value : float
        One of 0.01, 0.025, 0.05, 0.10, 0.90 or 0.95, taken from the
        look-up table.
    crit_dict : dict
        Interpolated critical values keyed "10%", "5%", "2.5%" and "1%".
    rstore : ResultsStore
        Only when ``store`` is true; carries ``nobs``, ``H0`` and ``HA``.

    Raises
    ------
    ValueError
        If ``x.size`` differs from ``x.shape[0]``.

    Warns
    -----
    InterpolationWarning
        When the returned p-value is the first or the last table entry.

    Notes
    -----
    The critical values are interpolated with ``interp1d`` at its default
    settings over sample sizes 25 to 5000; by default SciPy rejects query
    points outside the tabulated range.
    """
    x = array_like(x, "x")
    store = bool_like(store, "store")

    nobs = x.shape[0]

    # if m is not one, n != m * n
    if x.size != nobs:
        raise ValueError("x of shape {0} not understood".format(x.shape))

    # Table from [1] has been replicated using 200,000 samples
    # Critical values for new n_obs values have been identified
    pvals = [0.01, 0.025, 0.05, 0.10, 0.90, 0.95]
    n = np.array(
        [25, 50, 100, 150, 200, 250, 500, 1000, 2000, 3000, 4000, 5000])
    # NOTE(review): the last entry of the n=250 row (3.2482) repeats the
    # n=200 value, and the third entries of the n=3000 and n=4000 rows are
    # both 1.3318 -- possibly transcription slips; confirm against the
    # source table before relying on them.
    crit = np.array([
        [0.6626, 0.8126, 0.9192, 1.0712, 2.4863, 2.7312],
        [0.7977, 0.9274, 1.0478, 1.1964, 2.6821, 2.9613],
        [0.907, 1.0243, 1.1412, 1.2888, 2.8317, 3.1393],
        [0.9543, 1.0768, 1.1869, 1.3294, 2.8915, 3.2049],
        [0.9833, 1.0984, 1.2101, 1.3494, 2.9308, 3.2482],
        [0.9982, 1.1137, 1.2242, 1.3632, 2.9571, 3.2482],
        [1.0494, 1.1643, 1.2712, 1.4076, 3.0207, 3.3584],
        [1.0846, 1.1959, 1.2988, 1.4344, 3.0653, 3.4073],
        [1.1121, 1.2200, 1.3230, 1.4556, 3.0948, 3.4439],
        [1.1204, 1.2295, 1.3318, 1.4656, 3.1054, 3.4632],
        [1.1309, 1.2347, 1.3318, 1.4693, 3.1165, 3.4717],
        [1.1377, 1.2402, 1.3408, 1.4729, 3.1252, 3.4807],
    ])

    # Interpolate every significance-level column at the sample size
    n_levels = crit.shape[1]
    inter_crit = np.zeros((1, n_levels))
    for col in range(n_levels):
        interpolator = interp1d(n, crit[:, col])
        inter_crit[0, col] = interpolator(nobs)

    # Count how often the series sets a new running maximum or minimum
    running_max = x[0]
    running_min = x[0]
    n_records = 0
    for value in x[1:]:
        if value > running_max:
            running_max = value
            n_records = n_records + 1
        if value < running_min:
            running_min = value
            n_records = n_records + 1

    rur_stat = n_records / np.sqrt(len(x))

    # Walk down from the largest tabulated level while the statistic stays
    # below the critical value; stop at the first level where it does not.
    k = len(pvals) - 1
    level = k
    while level >= 0 and rur_stat < inter_crit[0, level]:
        k = level
        level -= 1

    p_value = pvals[k]

    warn_msg = """\
The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is {direction} than the p-value returned.
"""
    if p_value == pvals[-1]:
        direction = "smaller"
    elif p_value == pvals[0]:
        direction = "larger"
    else:
        direction = ""

    if direction:
        warnings.warn(warn_msg.format(direction=direction),
                      InterpolationWarning)

    crit_dict = {
        "10%": inter_crit[0, 3],
        "5%": inter_crit[0, 2],
        "2.5%": inter_crit[0, 1],
        "1%": inter_crit[0, 0],
    }

    if not store:
        return rur_stat, p_value, crit_dict

    from statsmodels.stats.diagnostic import ResultsStore

    rstore = ResultsStore()
    rstore.nobs = nobs
    rstore.H0 = "The series is not stationary"
    rstore.HA = "The series is stationary"
    return rur_stat, p_value, crit_dict, rstore
def __init__(self, constant: bool = True, order: int = 0) -> None:
    """
    Validate and store the trend specification.

    Parameters
    ----------
    constant : bool
        Checked with ``bool_like`` and stored as ``_constant``.
    order : int
        Checked with ``required_int_like`` and stored as ``_order``.
    """
    # Validate in signature order so the first invalid argument is the one
    # that gets reported.
    include_constant = bool_like(constant, "constant")
    self._constant = include_constant
    trend_order = required_int_like(order, "order")
    self._order = trend_order
def adfuller(
    x,
    maxlag=None,
    regression="c",
    autolag="AIC",
    store=False,
    regresults=False,
):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root in a
    univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4},
        capped at nobs // 2 - ntrend - 1.
    regression : {"c","ct","ctt","nc"}
        Constant and trend order to include in regression.

        * "c" : constant only (default).
        * "ct" : constant and trend.
        * "ctt" : constant, and linear and quadratic trend.
        * "nc" : no constant, no trend.

    autolag : {"AIC", "BIC", "t-stat", None}
        Method to use when automatically determining the lag length among the
        values 0, 1, ..., maxlag.

        * If "AIC" (default) or "BIC", then the number of lags is chosen
          to minimize the corresponding information criterion.
        * "t-stat" based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
        * If None, then the number of included lags is set to maxlag.
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False.
    regresults : bool, optional
        If True, the full regression results are returned. Default is False.
        Setting this also turns ``store`` on.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used. Not returned when ``store`` is True.
    nobs : int
        The number of observations used for the ADF regression and calculation
        of the critical values. Not returned when ``store`` is True.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None. Not
        returned when ``store`` is True.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes. Returned only
        when ``store`` is True, in which case the return value is
        ``(adf, pvalue, critical values, resstore)``.

    Raises
    ------
    ValueError
        If the default maxlag would be negative for the sample size, or a
        user-supplied maxlag exceeds nobs // 2 - ntrend - 1.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a unit
    root, with the alternative that there is no unit root. If the pvalue is
    above a critical size, then we cannot reject that there is a unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is close
    to significant, then the critical values should be used to judge whether
    to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Greene.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [2] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [3] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution
        functions for unit-root and cointegration tests."  `Journal of
        Business and Economic Statistics` 12, 167-76.

    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
    """
    x = array_like(x, "x")
    maxlag = int_like(maxlag, "maxlag", optional=True)
    regression = string_like(regression, "regression",
                             options=("c", "ct", "ctt", "nc"))
    autolag = string_like(autolag, "autolag", optional=True,
                          options=("aic", "bic", "t-stat"))
    store = bool_like(store, "store")
    regresults = bool_like(regresults, "regresults")

    # The autolag regression results are attached to the results store, so
    # asking for them forces the store to be created.
    if regresults:
        store = True

    # Map legacy None/integer trend codes to their string form
    trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    nobs = x.shape[0]

    # Number of deterministic terms: the length of the code ("c" -> 1,
    # "ct" -> 2, "ctt" -> 3), or zero for "nc"
    ntrend = len(regression) if regression != "nc" else 0
    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
        # -1 for the diff
        maxlag = min(nobs // 2 - ntrend - 1, maxlag)
        if maxlag < 0:
            raise ValueError("sample size is too short to use selected "
                             "regression component")
    elif maxlag > nobs // 2 - ntrend - 1:
        raise ValueError("maxlag must be less than (nobs/2 - 1 - ntrend) "
                         "where n trend is the number of included "
                         "deterministic regressors")
    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in")
    # From here on nobs is the number of rows left after building the lags
    nobs = xdall.shape[0]

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        from statsmodels.stats.diagnostic import ResultsStore

        resstore = ResultsStore()
    if autolag:
        if regression != "nc":
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1
        # 1 for level
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better

        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
                                       maxlag, autolag)
        else:
            icbest, bestlag, alres = _autolag(
                OLS,
                xdshort,
                fullRHS,
                startlag,
                maxlag,
                autolag,
                regresults=regresults,
            )
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in")
        nobs = xdall.shape[0]
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    # Final regression on the level plus the first `usedlag` lagged
    # differences, with the deterministic terms added unless "nc"
    if regression != "nc":
        resols = OLS(xdshort,
                     add_trend(xdall[:, :usedlag + 1], regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    # Column 0 holds the lagged level, so its t-value is the ADF statistic
    adfstat = resols.tvalues[0]
    #    adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are integrated
    # for orders higher than I(1), ie., not ADF but cointegration tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2],
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = "Augmented Dickey-Fuller Test Results"
        return adfstat, pvalue, critvalues, resstore
    else:
        # icbest is only part of the result when a lag search was run
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest