def test_int_like(integer):
    assert isinstance(int_like(integer, "integer"), int)
    assert isinstance(int_like(integer, "integer", optional=True), int)
    assert int_like(None, "floating", optional=True) is None
    if isinstance(integer, (int, np.integer)):
        assert isinstance(int_like(integer, "integer", strict=True), int)
        assert int_like(None, "floating", optional=True, strict=True) is None
def __init__(self, endog, exog, window=None, weights=None, min_nobs=None,
             missing='drop'):
    # Call Model.__init__ twice to use const detection in first pass
    # But to not drop in the second pass
    missing = string_like(missing, 'missing',
                          options=('drop', 'raise', 'skip'))
    temp_msng = 'drop' if missing != 'raise' else 'raise'
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing='none', hasconst=False)
    self.k_constant = k_const
    self.data.const_idx = const_idx
    self._y = array_like(endog, 'endog')
    nobs = self._y.shape[0]
    self._x = array_like(exog, 'exog', ndim=2, shape=(nobs, None))
    window = int_like(window, 'window', optional=True)
    weights = array_like(weights, 'weights', optional=True, shape=(nobs,))

    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x

    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == 'skip'

    min_nobs = int_like(min_nobs, 'min_nobs', optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError('min_nobs must be at least the number of '
                         'regressors in the model and no larger than window')
def test_int_like(integer):
    assert isinstance(int_like(integer, 'integer'), int)
    assert isinstance(int_like(integer, 'integer', optional=True), int)
    assert int_like(None, 'floating', optional=True) is None
    if isinstance(integer, (int, np.integer)):
        assert isinstance(int_like(integer, 'integer', strict=True), int)
        assert int_like(None, 'floating', optional=True, strict=True) is None
def __init__(
    self,
    endog,
    exog,
    window=None,
    *,
    weights=None,
    min_nobs=None,
    missing="drop",
    expanding=False
):
    # Call Model.__init__ twice to use const detection in first pass
    # But to not drop in the second pass
    missing = string_like(
        missing, "missing", options=("drop", "raise", "skip")
    )
    temp_msng = "drop" if missing != "raise" else "raise"
    Model.__init__(self, endog, exog, missing=temp_msng, hasconst=None)
    k_const = self.k_constant
    const_idx = self.data.const_idx
    Model.__init__(self, endog, exog, missing="none", hasconst=False)
    self.k_constant = k_const
    self.data.const_idx = const_idx
    self._y = array_like(endog, "endog")
    nobs = self._y.shape[0]
    self._x = array_like(exog, "exog", ndim=2, shape=(nobs, None))
    window = int_like(window, "window", optional=True)
    weights = array_like(weights, "weights", optional=True, shape=(nobs,))

    self._window = window if window is not None else self._y.shape[0]
    self._weighted = weights is not None
    self._weights = np.ones(nobs) if weights is None else weights
    w12 = np.sqrt(self._weights)
    self._wy = w12 * self._y
    self._wx = w12[:, None] * self._x

    min_nobs = int_like(min_nobs, "min_nobs", optional=True)
    self._min_nobs = min_nobs if min_nobs is not None else self._x.shape[1]
    if self._min_nobs < self._x.shape[1] or self._min_nobs > self._window:
        raise ValueError(
            "min_nobs must be at least the number of "
            "regressors in the model and no larger than window"
        )

    self._expanding = expanding

    self._is_nan = np.zeros_like(self._y, dtype=bool)
    self._has_nan = self._find_nans()
    self.const_idx = self.data.const_idx
    self._skip_missing = missing == "skip"
def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0):
    """
    Calculate the Anderson-Darling a2 statistic.

    Parameters
    ----------
    x : array_like
        The data to test.
    dist : {'norm', callable}
        The assumed distribution under the null of the test statistic.
    fit : bool
        If True, then the distribution parameters are estimated.
        Currently only for 1d data x, except in case dist='norm'.
    params : tuple
        The optional distribution parameters if fit is False.
    axis : int
        If dist is 'norm' or fit is False, then the data can be
        n-dimensional and axis specifies the axis of a variable.

    Returns
    -------
    {float, ndarray}
        The Anderson-Darling statistic.
    """
    x = array_like(x, 'x', ndim=None)
    fit = bool_like(fit, 'fit')
    axis = int_like(axis, 'axis')
    y = np.sort(x, axis=axis)
    nobs = y.shape[axis]
    if fit:
        if dist == 'norm':
            xbar = np.expand_dims(np.mean(x, axis=axis), axis)
            s = np.expand_dims(np.std(x, ddof=1, axis=axis), axis)
            w = (y - xbar) / s
            z = stats.norm.cdf(w)
        elif callable(dist):
            params = dist.fit(x)
            z = dist.cdf(y, *params)
        else:
            raise ValueError("dist must be 'norm' or a Callable")
    else:
        if callable(dist):
            z = dist.cdf(y, *params)
        else:
            raise ValueError('if fit is false, then dist must be callable')

    i = np.arange(1, nobs + 1)
    sl1 = [None] * x.ndim
    sl1[axis] = slice(None)
    sl1 = tuple(sl1)
    sl2 = [slice(None)] * x.ndim
    sl2[axis] = slice(None, None, -1)
    sl2 = tuple(sl2)
    s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])),
               axis=axis)
    a2 = -nobs - s
    return a2
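# Hedged usage sketch (illustrative, not part of the module): applying
# anderson_statistic, as defined above, column-wise to normal draws. It
# assumes the function and its dependencies (numpy, scipy.stats, the
# validators) are in scope; the seed and sample size are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
sample = rng.standard_normal((500, 3))
# One A-D statistic per column, with mean and std estimated from the data
a2 = anderson_statistic(sample, dist="norm", fit=True, axis=0)
print(a2.shape)  # (3,)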
def forecast_components(self, steps: int = 1) -> pd.DataFrame:
    r"""
    Compute the three components of the Theta model forecast

    Parameters
    ----------
    steps : int
        The number of steps ahead to compute the forecast components.

    Returns
    -------
    DataFrame
        A DataFrame with three columns: trend, ses and seasonal containing
        the forecast values of each of the three components.

    Notes
    -----
    For a given value of :math:`\theta`, the deseasonalized forecast is
    `fcast = w * trend + ses` where :math:`w = \frac{\theta - 1}{\theta}`.
    The reseasonalized forecasts are then `seasonal * fcast` if the
    seasonality is multiplicative or `seasonal + fcast` if the seasonality
    is additive.
    """
    steps = int_like(steps, "steps")
    if steps < 1:
        raise ValueError("steps must be a positive integer")
    alpha = self._alpha
    b0 = self._b0
    nobs = self._nobs
    h = np.arange(1, steps + 1, dtype=np.float64) - 1
    if alpha > 0:
        h += 1 / alpha - ((1 - alpha) ** nobs / alpha)
    trend = b0 * h
    ses = self._one_step * np.ones(steps)
    if self.model.method.startswith("add"):
        season = np.zeros(steps)
    else:
        season = np.ones(steps)
    # Re-seasonalize
    if self.model.deseasonalize:
        seasonal = self._seasonal
        period = self.model.period
        oos_idx = nobs + np.arange(steps)
        seasonal_locs = oos_idx % period
        if seasonal.shape[0]:
            season[:] = seasonal[seasonal_locs]
    index = getattr(self.model.endog_orig, "index", None)
    if index is None:
        index = pd.RangeIndex(0, self.model.endog_orig.shape[0])
    index = extend_index(steps, index)

    df = pd.DataFrame(
        {"trend": trend, "ses": ses, "seasonal": season}, index=index
    )
    return df
def detrend(x, order=1, axis=0):
    """
    Detrend an array with a trend of given order along axis 0 or 1.

    Parameters
    ----------
    x : array_like, 1d or 2d
        Data, if 2d, then each row or column is independently detrended with
        the same trend order, but independent trend estimates.
    order : int
        The polynomial order of the trend, zero is constant, one is linear
        trend, two is quadratic trend.
    axis : int
        Axis can be either 0, observations by rows, or 1, observations by
        columns.

    Returns
    -------
    ndarray
        The detrended series is the residual of the linear regression of the
        data on the trend of given order.
    """
    order = int_like(order, "order")
    axis = int_like(axis, "axis")

    if x.ndim == 2 and int(axis) == 1:
        x = x.T
    elif x.ndim > 2:
        raise NotImplementedError(
            "x.ndim > 2 is not implemented until it is needed")

    nobs = x.shape[0]
    if order == 0:
        # Special case demean
        resid = x - x.mean(axis=0)
    else:
        trends = np.vander(np.arange(float(nobs)), N=order + 1)
        beta = np.linalg.pinv(trends).dot(x)
        resid = x - np.dot(trends, beta)

    if x.ndim == 2 and int(axis) == 1:
        resid = resid.T

    return resid
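# Hedged example (illustrative only): removing a linear trend from a noisy
# series with the detrend function defined above. The slope and noise are
# arbitrary choices for the sake of the demonstration.
import numpy as np

t = np.arange(200.0)
series = 0.5 * t + np.random.standard_normal(200)
resid = detrend(series, order=1)
# The residual should have (near) zero mean and no remaining linear trend
print(resid.mean())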
def commutation_matrix(p, q):
    """
    Create the commutation matrix K_{p,q} satisfying
    vec(A') = K_{p,q} vec(A)

    Parameters
    ----------
    p : int
    q : int

    Returns
    -------
    K : ndarray (pq x pq)
    """
    p = int_like(p, "p")
    q = int_like(q, "q")
    K = np.eye(p * q)
    indices = np.arange(p * q).reshape((p, q), order="F")
    return K.take(indices.ravel(), axis=0)
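# Hedged check (illustrative only): verify the defining property
# vec(A') = K_{p,q} vec(A), where vec stacks columns (Fortran-order ravel),
# using the commutation_matrix function defined above.
import numpy as np

p, q = 3, 2
A = np.arange(float(p * q)).reshape(p, q)
K = commutation_matrix(p, q)
assert np.allclose(K @ A.ravel(order="F"), A.T.ravel(order="F"))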
def duplication_matrix(n):
    """
    Create duplication matrix D_n which satisfies vec(S) = D_n vech(S) for
    symmetric matrix S

    Returns
    -------
    D_n : ndarray
    """
    n = int_like(n, "n")
    tmp = np.eye(n * (n + 1) // 2)
    return np.array([unvech(x).ravel() for x in tmp]).T
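# Hedged check (illustrative only): vec(S) = D_n vech(S) for a symmetric S,
# where vech stacks the lower triangle column by column. For a symmetric S
# this ordering coincides with S[np.triu_indices(n)], used below as a
# stand-in for the module's vech helper.
import numpy as np

n = 3
S = np.arange(1.0, 10.0).reshape(n, n)
S = (S + S.T) / 2  # make symmetric
vech_S = S[np.triu_indices(n)]
D = duplication_matrix(n)
assert np.allclose(D @ vech_S, S.ravel(order="F"))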
def __init__(self, endog, trend=None, damped=False, seasonal=None,
             seasonal_periods=None, dates=None, freq=None, missing='none'):
    super(ExponentialSmoothing, self).__init__(
        endog, None, dates, freq, missing=missing)
    self.endog = self.endog
    self._y = self._data = array_like(endog, 'endog',
                                      contiguous=True, order='C')
    options = ("add", "mul", "additive", "multiplicative")
    trend = string_like(trend, 'trend', options=options, optional=True)
    if trend in ['additive', 'multiplicative']:
        trend = {'additive': 'add', 'multiplicative': 'mul'}[trend]
    self.trend = trend
    self.damped = bool_like(damped, 'damped')
    seasonal = string_like(seasonal, 'seasonal', options=options,
                           optional=True)
    if seasonal in ['additive', 'multiplicative']:
        seasonal = {'additive': 'add', 'multiplicative': 'mul'}[seasonal]
    self.seasonal = seasonal
    self.trending = trend in ['mul', 'add']
    self.seasoning = seasonal in ['mul', 'add']
    if (self.trend == 'mul' or self.seasonal == 'mul') and \
            not np.all(self._data > 0.0):
        raise ValueError('endog must be strictly positive when using '
                         'multiplicative trend or seasonal components.')
    if self.damped and not self.trending:
        raise ValueError('Can only dampen the trend component')
    if self.seasoning:
        self.seasonal_periods = int_like(seasonal_periods,
                                         'seasonal_periods', optional=True)
        if seasonal_periods is None:
            self.seasonal_periods = freq_to_period(self._index_freq)
        if self.seasonal_periods <= 1:
            raise ValueError('seasonal_periods must be larger than 1.')
    else:
        self.seasonal_periods = 0
    self.nobs = len(self.endog)
def elimination_matrix(n):
    """
    Create the elimination matrix L_n which satisfies
    vech(M) = L_n vec(M) for any matrix M

    Parameters
    ----------
    n : int

    Returns
    -------
    L_n : ndarray
    """
    n = int_like(n, "n")
    vech_indices = vec(np.tril(np.ones((n, n))))
    return np.eye(n * n)[vech_indices != 0]
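# Hedged check (illustrative only): vech(M) = L_n vec(M) for an arbitrary
# square M, with vec in column-major order and vech stacking the lower
# triangle column by column (built here from np.triu_indices rather than the
# module's vech helper).
import numpy as np

n = 3
M = np.arange(1.0, 10.0).reshape(n, n)
L = elimination_matrix(n)
iu = np.triu_indices(n)
vech_M = M[iu[1], iu[0]]  # lower triangle, column-major order
assert np.allclose(L @ M.ravel(order="F"), vech_M)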
def __init__(
    self,
    endog,
    *,
    period: Optional[int] = None,
    deseasonalize: bool = True,
    use_test: bool = True,
    method: str = "auto",
    difference: bool = False
) -> None:
    self._y = array_like(endog, "endog", ndim=1)
    if isinstance(endog, pd.DataFrame):
        self.endog_orig = endog.iloc[:, 0]
    else:
        self.endog_orig = endog
    self._period = int_like(period, "period", optional=True)
    self._deseasonalize = bool_like(deseasonalize, "deseasonalize")
    self._use_test = (
        bool_like(use_test, "use_test") and self._deseasonalize
    )
    self._diff = bool_like(difference, "difference")
    self._method = string_like(
        method,
        "method",
        options=("auto", "additive", "multiplicative", "mul", "add"),
    )
    if self._period is None and self._deseasonalize:
        idx = getattr(endog, "index", None)
        pfreq = None
        if idx is not None:
            pfreq = getattr(idx, "freq", None)
            if pfreq is None:
                pfreq = getattr(idx, "inferred_freq", None)
        if pfreq is not None:
            self._period = freq_to_period(pfreq)
        else:
            raise ValueError(
                "You must specify a period or endog must be a "
                "pandas object with a DatetimeIndex with "
                "a freq not set to None"
            )

    self._has_seasonality = self._deseasonalize
def unintegrate_levels(x, d):
    """
    Returns the successive differences needed to unintegrate the series.

    Parameters
    ----------
    x : array_like
        The original series
    d : int
        The number of differences of the differenced series.

    Returns
    -------
    y : array_like
        The increasing differences from 0 to d-1 of the first d elements
        of x.

    See Also
    --------
    unintegrate
    """
    d = int_like(d, "d")
    x = x[:d]
    return np.asarray([np.diff(x, d - i)[0] for i in range(d, 0, -1)])
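# Hedged example (illustrative only): for x = [1, 3, 6, 10] differenced
# twice, the first d elements determine the values needed to undo the
# differencing, namely x[0] and diff(x)[0].
import numpy as np

x = np.array([1.0, 3.0, 6.0, 10.0])
print(unintegrate_levels(x, 2))  # [1., 2.]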
def add_lag(x, col=None, lags=1, drop=False, insert=True): """ Returns an array with lags included given an array. Parameters ---------- x : array An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : 'string', int, or None If data is a structured array or a recarray, `col` can be a string that is the name of the column containing the variable. Or `col` can be an int of the zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool Whether to keep the contemporaneous variable for the data. insert : bool or int If True, inserts the lagged values after `col`. If False, appends the data. If int inserts the lags at int. Returns ------- array : ndarray Array with lags Examples -------- >>> import statsmodels.api as sm >>> data = sm.datasets.macrodata.load(as_pandas=False) >>> data = data.data[['year','quarter','realgdp','cpi']] >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2) Notes ----- Trims the array both forward and backward, so that the array returned so that the length of the returned array is len(`X`) - lags. The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ lags = int_like(lags, 'lags') drop = bool_like(drop, 'drop') if x.dtype.names: names = x.dtype.names if not col and np.squeeze(x).ndim > 1: raise IndexError("col is None and the input array is not 1d") elif len(names) == 1: col = names[0] if isinstance(col, int): col = x.dtype.names[col] contemp = x[col] # make names for lags tmp_names = [col + '_' + 'L(%i)' % i for i in range(1, lags + 1)] ndlags = lagmat(contemp, maxlag=lags, trim='Both') # get index for return if insert is True: ins_idx = list(names).index(col) + 1 elif insert is False: ins_idx = len(names) + 1 else: # insert is an int if insert > len(names): import warnings warnings.warn( "insert > number of variables, inserting at the" " last position", ValueWarning) ins_idx = insert first_names = list(names[:ins_idx]) last_names = list(names[ins_idx:]) if drop: if col in first_names: first_names.pop(first_names.index(col)) else: last_names.pop(last_names.index(col)) if first_names: # only do this if x is not "empty" # Workaround to avoid NumPy FutureWarning _x = recarray_select(x, first_names) first_arr = nprf.append_fields(_x[lags:], tmp_names, ndlags.T, usemask=False) else: first_arr = np.zeros(len(x) - lags, dtype=lzip(tmp_names, (x[col].dtype, ) * lags)) for i, name in enumerate(tmp_names): first_arr[name] = ndlags[:, i] if last_names: return nprf.append_fields(first_arr, last_names, [x[name][lags:] for name in last_names], usemask=False) else: # lags for last variable return first_arr else: # we have an ndarray if x.ndim == 1: # make 2d if 1d x = x[:, None] if col is None: col = 0 # handle negative index if col < 0: col = x.shape[1] + col contemp = x[:, col] if insert is True: ins_idx = col + 1 elif insert is False: ins_idx = x.shape[1] else: if insert < 0: # handle negative index insert = x.shape[1] + insert + 1 if insert > x.shape[1]: insert = x.shape[1] import warnings warnings.warn( "insert > number of variables, inserting at the" " last position", ValueWarning) ins_idx = insert ndlags = lagmat(contemp, lags, trim='Both') first_cols = lrange(ins_idx) last_cols = lrange(ins_idx, x.shape[1]) if drop: if col in first_cols: first_cols.pop(first_cols.index(col)) else: last_cols.pop(last_cols.index(col)) return np.column_stack((x[lags:, first_cols], ndlags, x[lags:, last_cols]))
def adfuller( x, maxlag=None, regression="c", autolag="AIC", store=False, regresults=False, ): """ Augmented Dickey-Fuller unit root test. The Augmented Dickey-Fuller test can be used to test for a unit root in a univariate process in the presence of serial correlation. Parameters ---------- x : array_like, 1d The data series to test. maxlag : int Maximum lag which is included in test, default 12*(nobs/100)^{1/4}. regression : {"c","ct","ctt","nc"} Constant and trend order to include in regression. * "c" : constant only (default). * "ct" : constant and trend. * "ctt" : constant, and linear and quadratic trend. * "nc" : no constant, no trend. autolag : {"AIC", "BIC", "t-stat", None} Method to use when automatically determining the lag length among the values 0, 1, ..., maxlag. * If "AIC" (default) or "BIC", then the number of lags is chosen to minimize the corresponding information criterion. * "t-stat" based choice of maxlag. Starts with maxlag and drops a lag until the t-statistic on the last lag length is significant using a 5%-sized test. * If None, then the number of included lags is set to maxlag. store : bool If True, then a result instance is returned additionally to the adf statistic. Default is False. regresults : bool, optional If True, the full regression results are returned. Default is False. Returns ------- adf : float The test statistic. pvalue : float MacKinnon's approximate p-value based on MacKinnon (1994, 2010). usedlag : int The number of lags used. nobs : int The number of observations used for the ADF regression and calculation of the critical values. critical values : dict Critical values for the test statistic at the 1 %, 5 %, and 10 % levels. Based on MacKinnon (2010). icbest : float The maximized information criterion if autolag is not None. resstore : ResultStore, optional A dummy class with results attached as attributes. Notes ----- The null hypothesis of the Augmented Dickey-Fuller is that there is a unit root, with the alternative that there is no unit root. If the pvalue is above a critical size, then we cannot reject that there is a unit root. The p-values are obtained through regression surface approximation from MacKinnon 1994, but using the updated 2010 tables. If the p-value is close to significant, then the critical values should be used to judge whether to reject the null. The autolag option and maxlag for it are described in Greene. References ---------- .. [1] W. Green. "Econometric Analysis," 5th ed., Pearson, 2003. .. [2] Hamilton, J.D. "Time Series Analysis". Princeton, 1994. .. [3] MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for unit-root and cointegration tests. `Journal of Business and Economic Statistics` 12, 167-76. .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen"s University, Dept of Economics, Working Papers. 
Available at http://ideas.repec.org/p/qed/wpaper/1227.html Examples -------- See example notebook """ x = array_like(x, "x") maxlag = int_like(maxlag, "maxlag", optional=True) regression = string_like(regression, "regression", options=("c", "ct", "ctt", "nc")) autolag = string_like(autolag, "autolag", optional=True, options=("aic", "bic", "t-stat")) store = bool_like(store, "store") regresults = bool_like(regresults, "regresults") if regresults: store = True trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"} if regression is None or isinstance(regression, int): regression = trenddict[regression] regression = regression.lower() nobs = x.shape[0] ntrend = len(regression) if regression != "nc" else 0 if maxlag is None: # from Greene referencing Schwert 1989 maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0))) # -1 for the diff maxlag = min(nobs // 2 - ntrend - 1, maxlag) if maxlag < 0: raise ValueError("sample size is too short to use selected " "regression component") elif maxlag > nobs // 2 - ntrend - 1: raise ValueError("maxlag must be less than (nobs/2 - 1 - ntrend) " "where n trend is the number of included " "deterministic regressors") xdiff = np.diff(x) xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in") nobs = xdall.shape[0] xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x xdshort = xdiff[-nobs:] if store: from statsmodels.stats.diagnostic import ResultsStore resstore = ResultsStore() if autolag: if regression != "nc": fullRHS = add_trend(xdall, regression, prepend=True) else: fullRHS = xdall startlag = fullRHS.shape[1] - xdall.shape[1] + 1 # 1 for level # search for lag length with smallest information criteria # Note: use the same number of observations to have comparable IC # aic and bic: smaller is better if not regresults: icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag, maxlag, autolag) else: icbest, bestlag, alres = _autolag( OLS, xdshort, fullRHS, startlag, maxlag, autolag, regresults=regresults, ) resstore.autolag_results = alres bestlag -= startlag # convert to lag not column index # rerun ols with best autolag xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in") nobs = xdall.shape[0] xdall[:, 0] = x[-nobs - 1:-1] # replace 0 xdiff with level of x xdshort = xdiff[-nobs:] usedlag = bestlag else: usedlag = maxlag icbest = None if regression != "nc": resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1], regression)).fit() else: resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit() adfstat = resols.tvalues[0] # adfstat = (resols.params[0]-1.0)/resols.bse[0] # the "asymptotically correct" z statistic is obtained as # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1) # I think this is the statistic that is used for series that are integrated # for orders higher than I(1), ie., not ADF but cointegration tests. 
# Get approx p-value and critical values pvalue = mackinnonp(adfstat, regression=regression, N=1) critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs) critvalues = { "1%": critvalues[0], "5%": critvalues[1], "10%": critvalues[2], } if store: resstore.resols = resols resstore.maxlag = maxlag resstore.usedlag = usedlag resstore.adfstat = adfstat resstore.critvalues = critvalues resstore.nobs = nobs resstore.H0 = ("The coefficient on the lagged level equals 1 - " "unit root") resstore.HA = "The coefficient on the lagged level < 1 - stationary" resstore.icbest = icbest resstore._str = "Augmented Dickey-Fuller Test Results" return adfstat, pvalue, critvalues, resstore else: if not autolag: return adfstat, pvalue, usedlag, nobs, critvalues else: return adfstat, pvalue, usedlag, nobs, critvalues, icbest
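# Hedged usage sketch (illustrative only): running the ADF test on a
# simulated random walk, which should not reject the unit-root null. The
# seed and sample size are arbitrary.
import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(0)
y = np.cumsum(rng.standard_normal(250))
stat, pvalue, usedlag, nobs, crit, icbest = adfuller(y, regression="c")
print(stat, pvalue)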
def varsim(coefs, intercept, sig_u, steps=100, initial_values=None, seed=None, nsimulations=None): """ Simulate VAR(p) process, given coefficients and assuming Gaussian noise Parameters ---------- coefs : ndarray Coefficients for the VAR lags of endog. intercept : None or ndarray 1-D (neqs,) or (steps, neqs) This can be either the intercept for each equation or an offset. If None, then the VAR process has a zero intercept. If intercept is 1-D, then the same (endog specific) intercept is added to all observations. If intercept is 2-D, then it is treated as an offset and is added as an observation specific intercept to the autoregression. In this case, the intercept/offset should have same number of rows as steps, and the same number of columns as endogenous variables (neqs). sig_u : ndarray Covariance matrix of the residuals or innovations. If sig_u is None, then an identity matrix is used. steps : {None, int} number of observations to simulate, this includes the initial observations to start the autoregressive process. If offset is not None, then exog of the model are used if they were provided in the model initial_values : array_like, optional Initial values for use in the simulation. Shape should be (nlags, neqs) or (neqs,). Values should be ordered from less to most recent. Note that this values will be returned by the simulation as the first values of `endog_simulated` and they will count for the total number of steps. seed : {None, int} If seed is not None, then it will be used with for the random variables generated by numpy.random. nsimulations : {None, int} Number of simulations to perform. If `nsimulations` is None it will perform one simulation and return value will have shape (steps, neqs). Returns ------- endog_simulated : nd_array Endog of the simulated VAR process. Shape will be (nsimulations, steps, neqs) or (steps, neqs) if `nsimulations` is None. """ rs = np.random.RandomState(seed=seed) rmvnorm = rs.multivariate_normal p, k, k = coefs.shape nsimulations = int_like(nsimulations, "nsimulations", optional=True) if isinstance(nsimulations, int) and nsimulations <= 0: raise ValueError("nsimulations must be a positive integer if provided") if nsimulations is None: result_shape = (steps, k) nsimulations = 1 else: result_shape = (nsimulations, steps, k) if sig_u is None: sig_u = np.eye(k) ugen = rmvnorm(np.zeros(len(sig_u)), sig_u, steps * nsimulations).reshape(nsimulations, steps, k) result = np.zeros((nsimulations, steps, k)) if intercept is not None: # intercept can be 2-D like an offset variable if np.ndim(intercept) > 1: if not len(intercept) == ugen.shape[1]: raise ValueError('2-D intercept needs to have length `steps`') # add intercept/offset also to intial values result += intercept result[:, p:] += ugen[:, p:] else: result[:, p:] = ugen[:, p:] initial_values = array_like(initial_values, "initial_values", optional=True, maxdim=2) if initial_values is not None: if not (initial_values.shape == (p, k) or initial_values.shape == (k, )): raise ValueError( "initial_values should have shape (p, k) or (k,) where p is the number of lags and k is the number of equations." ) result[:, :p] = initial_values # add in AR terms for t in range(p, steps): ygen = result[:, t] for j in range(p): ygen += np.dot(coefs[j], result[:, t - j - 1].T).T return result.reshape(result_shape)
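# Hedged usage sketch (illustrative only): simulating a bivariate VAR(1)
# with the varsim function defined above. The coefficients, intercept and
# innovation covariance below are made up for the example.
import numpy as np

coefs = np.array([[[0.5, 0.1],
                   [0.0, 0.4]]])     # shape (p=1, k=2, k=2)
intercept = np.array([1.0, -0.5])    # one intercept per equation
sig_u = np.eye(2)                    # innovation covariance
sims = varsim(coefs, intercept, sig_u, steps=200, seed=0, nsimulations=3)
print(sims.shape)                    # (3, 200, 2)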
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim="forward", use_pandas=False): """ Generate lagmatrix for 2d array, columns arranged by variables. Parameters ---------- x : array_like Data, 2d. Observations in rows and variables in columns. maxlag0 : int The first variable all lags from zero to maxlag are included. maxlagex : {None, int} The max lag for all other variables all lags from zero to maxlag are included. dropex : int Exclude first dropex lags from other variables. For all variables, except the first, lags from dropex to maxlagex are included. trim : str The trimming method to use. * 'forward' : trim invalid observations in front. * 'backward' : trim invalid initial observations. * 'both' : trim invalid observations on both sides. * 'none' : no trimming of observations. use_pandas : bool If true, returns a DataFrame when the input is a pandas Series or DataFrame. If false, return numpy ndarrays. Returns ------- ndarray The array with lagged observations, columns ordered by variable. Notes ----- Inefficient implementation for unequal lags, implemented for convenience. """ maxlag0 = int_like(maxlag0, "maxlag0") maxlagex = int_like(maxlagex, "maxlagex", optional=True) trim = string_like( trim, "trim", optional=True, options=("forward", "backward", "both", "none"), ) if maxlagex is None: maxlagex = maxlag0 maxlag = max(maxlag0, maxlagex) is_pandas = _is_using_pandas(x, None) if x.ndim == 1: if is_pandas: x = pd.DataFrame(x) else: x = x[:, None] elif x.ndim == 0 or x.ndim > 2: raise ValueError("Only supports 1 and 2-dimensional data.") nobs, nvar = x.shape if is_pandas and use_pandas: lags = lagmat(x.iloc[:, 0], maxlag, trim=trim, original="in", use_pandas=True) lagsli = [lags.iloc[:, :maxlag0 + 1]] for k in range(1, nvar): lags = lagmat(x.iloc[:, k], maxlag, trim=trim, original="in", use_pandas=True) lagsli.append(lags.iloc[:, dropex:maxlagex + 1]) return pd.concat(lagsli, axis=1) elif is_pandas: x = np.asanyarray(x) lagsli = [ lagmat(x[:, 0], maxlag, trim=trim, original="in")[:, :maxlag0 + 1] ] for k in range(1, nvar): lagsli.append( lagmat(x[:, k], maxlag, trim=trim, original="in")[:, dropex:maxlagex + 1]) return np.column_stack(lagsli)
def __init__(self, endog, trend=False, damped_trend=False, seasonal=None, initialization_method='estimated', initial_level=None, initial_trend=None, initial_seasonal=None, bounds=None, concentrate_scale=True, dates=None, freq=None, missing='none'): # Model definition self.trend = bool_like(trend, 'trend') self.damped_trend = bool_like(damped_trend, 'damped_trend') self.seasonal_periods = int_like(seasonal, 'seasonal', optional=True) self.seasonal = self.seasonal_periods is not None self.initialization_method = string_like( initialization_method, 'initialization_method').lower() self.concentrate_scale = bool_like(concentrate_scale, 'concentrate_scale') # TODO: add validation for bounds (e.g. have all bounds, upper > lower) # TODO: add `bounds_method` argument to choose between "usual" and # "admissible" as in Hyndman et al. (2008) self.bounds = bounds if self.bounds is None: self.bounds = [(1e-4, 1-1e-4)] * 3 + [(0.8, 0.98)] # Validation if self.seasonal_periods == 1: raise ValueError('Cannot have a seasonal period of 1.') if self.seasonal and self.seasonal_periods is None: raise NotImplementedError('Unable to detect season automatically;' ' please specify `seasonal_periods`.') if self.initialization_method not in ['concentrated', 'estimated', 'simple', 'heuristic', 'known']: raise ValueError('Invalid initialization method "%s".' % initialization_method) if self.initialization_method == 'known': if initial_level is None: raise ValueError('`initial_level` argument must be provided' ' when initialization method is set to' ' "known".') if initial_trend is None and self.trend: raise ValueError('`initial_trend` argument must be provided' ' for models with a trend component when' ' initialization method is set to "known".') if initial_seasonal is None and self.seasonal: raise ValueError('`initial_seasonal` argument must be provided' ' for models with a seasonal component when' ' initialization method is set to "known".') # Initialize the state space model if not self.seasonal or self.seasonal_periods is None: self._seasonal_periods = 0 else: self._seasonal_periods = self.seasonal_periods k_states = 2 + int(self.trend) + self._seasonal_periods k_posdef = 1 init = ss_init.Initialization(k_states, 'known', constant=[0] * k_states) super(ExponentialSmoothing, self).__init__( endog, k_states=k_states, k_posdef=k_posdef, initialization=init, dates=dates, freq=freq, missing=missing) # Concentrate the scale out of the likelihood function if self.concentrate_scale: self.ssm.filter_concentrated = True # Setup fixed elements of the system matrices # Observation error self.ssm['design', 0, 0] = 1. self.ssm['selection', 0, 0] = 1. self.ssm['state_cov', 0, 0] = 1. # Level self.ssm['design', 0, 1] = 1. self.ssm['transition', 1, 1] = 1. # Trend if self.trend: self.ssm['transition', 1:3, 2] = 1. # Seasonal if self.seasonal: k = 2 + int(self.trend) self.ssm['design', 0, k] = 1. self.ssm['transition', k, -1] = 1. 
self.ssm['transition', k + 1:k_states, k:k_states - 1] = ( np.eye(self.seasonal_periods - 1)) # Initialization of the states if self.initialization_method != 'known': msg = ('Cannot give `%%s` argument when initialization is "%s"' % initialization_method) if initial_level is not None: raise ValueError(msg % 'initial_level') if initial_trend is not None: raise ValueError(msg % 'initial_trend') if initial_seasonal is not None: raise ValueError(msg % 'initial_seasonal') if self.initialization_method == 'simple': initial_level, initial_trend, initial_seasonal = ( es_init._initialization_simple( self.endog[:, 0], trend='add' if self.trend else None, seasonal='add' if self.seasonal else None, seasonal_periods=self.seasonal_periods)) elif self.initialization_method == 'heuristic': initial_level, initial_trend, initial_seasonal = ( es_init._initialization_heuristic( self.endog[:, 0], trend='add' if self.trend else None, seasonal='add' if self.seasonal else None, seasonal_periods=self.seasonal_periods)) elif self.initialization_method == 'known': initial_level = float_like(initial_level, 'initial_level') if self.trend: initial_trend = float_like(initial_trend, 'initial_trend') if self.seasonal: initial_seasonal = array_like(initial_seasonal, 'initial_seasonal') if len(initial_seasonal) == self.seasonal_periods - 1: initial_seasonal = np.r_[initial_seasonal, 0 - np.sum(initial_seasonal)] if len(initial_seasonal) != self.seasonal_periods: raise ValueError( 'Invalid length of initial seasonal values. Must be' ' one of s or s-1, where s is the number of seasonal' ' periods.') # Note that the simple and heuristic methods of computing initial # seasonal factors return estimated seasonal factors associated with # the first t = 1, 2, ..., `n_seasons` observations. To use these as # the initial state, we lag them by `n_seasons`. This yields, for # example for `n_seasons = 4`, the seasons lagged L3, L2, L1, L0. # As described above, the state vector in this model should have # seasonal factors ordered L0, L1, L2, L3, and as a result we need to # reverse the order of the computed initial seasonal factors from # these methods. methods = ['simple', 'heuristic'] if (self.initialization_method in methods and initial_seasonal is not None): initial_seasonal = initial_seasonal[::-1] self._initial_level = initial_level self._initial_trend = initial_trend self._initial_seasonal = initial_seasonal self._initial_state = None # Initialize now if possible (if we have a damped trend, then # initialization will depend on the phi parameter, and so has to be # done at each `update`) methods = ['simple', 'heuristic', 'known'] if not self.damped_trend and self.initialization_method in methods: self._initialize_constant_statespace(initial_level, initial_trend, initial_seasonal) # Save keys for kwarg initialization self._init_keys += ['trend', 'damped_trend', 'seasonal', 'initialization_method', 'initial_level', 'initial_trend', 'initial_seasonal', 'bounds', 'concentrate_scale', 'dates', 'freq', 'missing']
def lagmat( x, maxlag: int, trim: Literal["forward", "backward", "both", "none"] = 'forward', original: Literal["ex", "sep", "in"] = "ex", use_pandas: bool = False ) -> NDArray | DataFrame | tuple[NDArray, NDArray] | tuple[DataFrame, DataFrame]: """ Create 2d array of lags. Parameters ---------- x : array_like Data; if 2d, observation in rows and variables in columns. maxlag : int All lags from zero to maxlag are included. trim : {'forward', 'backward', 'both', 'none', None} The trimming method to use. * 'forward' : trim invalid observations in front. * 'backward' : trim invalid initial observations. * 'both' : trim invalid observations on both sides. * 'none', None : no trimming of observations. original : {'ex','sep','in'} How the original is treated. * 'ex' : drops the original array returning only the lagged values. * 'in' : returns the original array and the lagged values as a single array. * 'sep' : returns a tuple (original array, lagged values). The original array is truncated to have the same number of rows as the returned lagmat. use_pandas : bool If true, returns a DataFrame when the input is a pandas Series or DataFrame. If false, return numpy ndarrays. Returns ------- lagmat : ndarray The array with lagged observations. y : ndarray, optional Only returned if original == 'sep'. Notes ----- When using a pandas DataFrame or Series with use_pandas=True, trim can only be 'forward' or 'both' since it is not possible to consistently extend index values. Examples -------- >>> from statsmodels.tsa.tsatools import lagmat >>> import numpy as np >>> X = np.arange(1,7).reshape(-1,2) >>> lagmat(X, maxlag=2, trim="forward", original='in') array([[ 1., 2., 0., 0., 0., 0.], [ 3., 4., 1., 2., 0., 0.], [ 5., 6., 3., 4., 1., 2.]]) >>> lagmat(X, maxlag=2, trim="backward", original='in') array([[ 5., 6., 3., 4., 1., 2.], [ 0., 0., 5., 6., 3., 4.], [ 0., 0., 0., 0., 5., 6.]]) >>> lagmat(X, maxlag=2, trim="both", original='in') array([[ 5., 6., 3., 4., 1., 2.]]) >>> lagmat(X, maxlag=2, trim="none", original='in') array([[ 1., 2., 0., 0., 0., 0.], [ 3., 4., 1., 2., 0., 0.], [ 5., 6., 3., 4., 1., 2.], [ 0., 0., 5., 6., 3., 4.], [ 0., 0., 0., 0., 5., 6.]]) """ maxlag = int_like(maxlag, "maxlag") use_pandas = bool_like(use_pandas, "use_pandas") trim = string_like( trim, "trim", optional=True, options=("forward", "backward", "both", "none"), ) original = string_like(original, "original", options=("ex", "sep", "in")) # TODO: allow list of lags additional to maxlag orig = x x = array_like(x, "x", ndim=2, dtype=None) is_pandas = _is_using_pandas(orig, None) and use_pandas trim = "none" if trim is None else trim trim = trim.lower() if is_pandas and trim in ("none", "backward"): raise ValueError("trim cannot be 'none' or 'backward' when used on " "Series or DataFrames") dropidx = 0 nobs, nvar = x.shape if original in ["ex", "sep"]: dropidx = nvar if maxlag >= nobs: raise ValueError("maxlag should be < nobs") lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1))) for k in range(0, int(maxlag + 1)): lm[maxlag - k:nobs + maxlag - k, nvar * (maxlag - k):nvar * (maxlag - k + 1), ] = x if trim in ("none", "forward"): startobs = 0 elif trim in ("backward", "both"): startobs = maxlag else: raise ValueError("trim option not valid") if trim in ("none", "backward"): stopobs = len(lm) else: stopobs = nobs if is_pandas: x = orig x_columns = x.columns if isinstance(x, DataFrame) else [x.name] columns = [str(col) for col in x_columns] for lag in range(maxlag): lag_str = str(lag + 1) columns.extend([str(col) + ".L." 
+ lag_str for col in x_columns]) lm = DataFrame(lm[:stopobs], index=x.index, columns=columns) lags = lm.iloc[startobs:] if original in ("sep", "ex"): leads = lags[x_columns] lags = lags.drop(x_columns, axis=1) else: lags = lm[startobs:stopobs, dropidx:] if original == "sep": leads = lm[startobs:stopobs, :dropidx] if original == "sep": return lags, leads else: return lags
def forecast(self, steps: int = 1, theta: float = 2) -> pd.Series: r""" Forecast the model for a given theta Parameters ---------- steps : int The number of steps ahead to compute the forecast components. theta : float The theta value to use when computing the weight to combine the trend and the SES forecasts. Returns ------- Series A Series containing the forecasts Notes ----- The forecast is computed as .. math:: \hat{X}_{T+h|T} = \frac{\theta-1}{\theta} b_0 \left[h - 1 + \frac{1}{\alpha} - \frac{(1-\alpha)^T}{\alpha} \right] + \tilde{X}_{T+h|T} where :math:`\tilde{X}_{T+h|T}` is the SES forecast of the endogenous variable using the parameter :math:`\alpha`. :math:`b_0` is the slope of a time trend line fitted to X using the terms 0, 1, ..., T-1. This expression follows from [1]_ and [2]_ when the combination weights are restricted to be (theta-1)/theta and 1/theta. This nests the original implementation when theta=2 and the two weights are both 1/2. References ---------- .. [1] Hyndman, R. J., & Billah, B. (2003). Unmasking the Theta method. International Journal of Forecasting, 19(2), 287-290. .. [2] Fioruci, J. A., Pellegrini, T. R., Louzada, F., & Petropoulos, F. (2015). The optimized theta method. arXiv preprint arXiv:1503.03529. """ steps = int_like(steps, "steps") if steps < 1: raise ValueError("steps must be a positive integer") theta = float_like(theta, "theta") if theta < 1: raise ValueError("theta must be a float >= 1") thresh = 4.0 / np.finfo(np.double).eps trend_weight = (theta - 1) / theta if theta < thresh else 1.0 comp = self.forecast_components(steps=steps) fcast = trend_weight * comp.trend + np.asarray(comp.ses) # Re-seasonalize if needed if self.model.deseasonalize: seasonal = np.asarray(comp.seasonal) if self.model.method.startswith("mul"): fcast *= seasonal else: fcast += seasonal fcast.name = "forecast" return fcast
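# Hedged usage sketch (illustrative only): fitting a ThetaModel on a monthly
# series and forecasting 12 steps ahead. The import path and the synthetic
# data are assumptions made for illustration.
import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel

rng = np.random.default_rng(0)
idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(100 + np.arange(120) + rng.standard_normal(120), index=idx)
res = ThetaModel(y).fit()
print(res.forecast(steps=12, theta=2))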
def add_lag(x, col=None, lags=1, drop=False, insert=True): """ Returns an array with lags included given an array. Parameters ---------- x : array_like An array or NumPy ndarray subclass. Can be either a 1d or 2d array with observations in columns. col : int or None `col` can be an int of the zero-based column index. If it's a 1d array `col` can be None. lags : int The number of lags desired. drop : bool Whether to keep the contemporaneous variable for the data. insert : bool or int If True, inserts the lagged values after `col`. If False, appends the data. If int inserts the lags at int. Returns ------- array : ndarray Array with lags Examples -------- >>> import statsmodels.api as sm >>> data = sm.datasets.macrodata.load() >>> data = data.data[['year','quarter','realgdp','cpi']] >>> data = sm.tsa.add_lag(data, 'realgdp', lags=2) Notes ----- Trims the array both forward and backward, so that the array returned so that the length of the returned array is len(`X`) - lags. The lags are returned in increasing order, ie., t-1,t-2,...,t-lags """ lags = int_like(lags, "lags") drop = bool_like(drop, "drop") x = array_like(x, "x", ndim=2) if col is None: col = 0 # handle negative index if col < 0: col = x.shape[1] + col if x.ndim == 1: x = x[:, None] contemp = x[:, col] if insert is True: ins_idx = col + 1 elif insert is False: ins_idx = x.shape[1] else: if insert < 0: # handle negative index insert = x.shape[1] + insert + 1 if insert > x.shape[1]: insert = x.shape[1] warnings.warn( "insert > number of variables, inserting at the" " last position", ValueWarning, ) ins_idx = insert ndlags = lagmat(contemp, lags, trim="Both") first_cols = lrange(ins_idx) last_cols = lrange(ins_idx, x.shape[1]) if drop: if col in first_cols: first_cols.pop(first_cols.index(col)) else: last_cols.pop(last_cols.index(col)) return np.column_stack((x[lags:, first_cols], ndlags, x[lags:, last_cols]))
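# Hedged example (illustrative only): adding two lags of the first column of
# a small 2-D array with the ndarray-based add_lag defined above.
import numpy as np

x = np.column_stack([np.arange(1.0, 7.0), np.arange(10.0, 16.0)])
lagged = add_lag(x, col=0, lags=2)
print(lagged.shape)  # (4, 4): original columns plus two lags, trimmed by 2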
def __init__(self, data, ncomp=None, standardize=True, demean=True,
             normalize=True, gls=False, weights=None, method='svd',
             missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
             max_em_iter=100):
    self._index = None
    self._columns = []
    if isinstance(data, pd.DataFrame):
        self._index = data.index
        self._columns = data.columns

    self.data = array_like(data, "data", ndim=2)
    # Store inputs
    self._gls = bool_like(gls, "gls")
    self._normalize = bool_like(normalize, "normalize")
    self._tol = float_like(tol, "tol")
    if not 0 < self._tol < 1:
        raise ValueError('tol must be strictly between 0 and 1')
    self._max_iter = int_like(max_iter, "max_iter")
    self._max_em_iter = int_like(max_em_iter, "max_em_iter")
    self._tol_em = float_like(tol_em, "tol_em")

    # Prepare data
    self._standardize = bool_like(standardize, "standardize")
    self._demean = bool_like(demean, "demean")

    self._nobs, self._nvar = self.data.shape
    weights = array_like(weights, "weights", maxdim=1, optional=True)
    if weights is None:
        weights = np.ones(self._nvar)
    else:
        weights = np.array(weights).flatten()
        if weights.shape[0] != self._nvar:
            raise ValueError('weights should have nvar elements')
        weights = weights / np.sqrt((weights ** 2.0).mean())
    self.weights = weights

    # Check ncomp against maximum
    min_dim = min(self._nobs, self._nvar)
    self._ncomp = min_dim if ncomp is None else ncomp
    if self._ncomp > min_dim:
        import warnings

        warn = 'The requested number of components is more than can be ' \
               'computed from data. The maximum number of components is ' \
               'the minimum of the number of observations or variables'
        warnings.warn(warn, ValueWarning)
        self._ncomp = min_dim

    self._method = method
    # Workaround to avoid instance methods in __dict__
    if self._method not in ('eig', 'svd', 'nipals'):
        raise ValueError('method {0} is not known.'.format(method))

    self.rows = np.arange(self._nobs)
    self.cols = np.arange(self._nvar)
    # Handle missing
    self._missing = string_like(missing, "missing", optional=True)
    self._adjusted_data = self.data
    self._adjust_missing()

    # Update size
    self._nobs, self._nvar = self._adjusted_data.shape
    if self._ncomp == np.min(self.data.shape):
        self._ncomp = np.min(self._adjusted_data.shape)
    elif self._ncomp > np.min(self._adjusted_data.shape):
        raise ValueError('When adjusting for missing values, user '
                         'provided ncomp must be no larger than the '
                         'smallest dimension of the '
                         'missing-value-adjusted data size.')

    # Attributes and internal values
    self._tss = 0.0
    self._ess = None
    self.transformed_data = None
    self._mu = None
    self._sigma = None
    self._ess_indiv = None
    self._tss_indiv = None
    self.scores = self.factors = None
    self.loadings = None
    self.coeff = None
    self.eigenvals = None
    self.eigenvecs = None
    self.projection = None
    self.rsquare = None
    self.ic = None

    # Prepare data
    self.transformed_data = self._prepare_data()
    # Perform the PCA
    self._pca()
    if gls:
        self._compute_gls_weights()
        self.transformed_data = self._prepare_data()
        self._pca()

    # Final calculations
    self._compute_rsquare_and_ic()
    if self._index is not None:
        self._to_pandas()
def fit( self, method="inv", cov_type="nonrobust", cov_kwds=None, reset=None, use_t=False, params_only=False, ): """ Estimate model parameters. Parameters ---------- method : {'inv', 'lstsq', 'pinv'} Method to use when computing the the model parameters. * 'inv' - use moving windows inner-products and matrix inversion. This method is the fastest, but may be less accurate than the other methods. * 'lstsq' - Use numpy.linalg.lstsq * 'pinv' - Use numpy.linalg.pinv. This method matches the default estimator in non-moving regression estimators. cov_type : {'nonrobust', 'HCCM', 'HC0'} Covariance estimator: * nonrobust - The classic OLS covariance estimator * HCCM, HC0 - White heteroskedasticity robust covariance cov_kwds : dict Unused reset : int, optional Interval to recompute the moving window inner products used to estimate the model parameters. Smaller values improve accuracy, although in practice this setting is not required to be set. use_t : bool, optional Flag indicating to use the Student's t distribution when computing p-values. params_only : bool, optional Flag indicating that only parameters should be computed. Avoids calculating all other statistics or performing inference. Returns ------- RollingRegressionResults Estimation results where all pre-sample values are nan-filled. """ method = string_like(method, "method", options=("inv", "lstsq", "pinv")) reset = int_like(reset, "reset", optional=True) reset = self._y.shape[0] if reset is None else reset if reset < 1: raise ValueError("reset must be a positive integer") nobs, k = self._x.shape store = RollingStore( params=np.full((nobs, k), np.nan), ssr=np.full(nobs, np.nan), llf=np.full(nobs, np.nan), nobs=np.zeros(nobs, dtype=int), s2=np.full(nobs, np.nan), xpxi=np.full((nobs, k, k), np.nan), xeex=np.full((nobs, k, k), np.nan), centered_tss=np.full(nobs, np.nan), uncentered_tss=np.full(nobs, np.nan), ) w = self._window first = self._min_nobs if self._expanding else w xpx, xpy, nobs = self._reset(first) if not (self._has_nan[first - 1] and self._skip_missing): self._fit_single(first, xpx, xpy, nobs, store, params_only, method) wx, wy = self._wx, self._wy for i in range(first + 1, self._x.shape[0] + 1): if self._has_nan[i - 1] and self._skip_missing: continue if i % reset == 0: xpx, xpy, nobs = self._reset(i) else: if not self._is_nan[i - w - 1] and i > w: remove_x = wx[i - w - 1:i - w] xpx -= remove_x.T @ remove_x xpy -= remove_x.T @ wy[i - w - 1:i - w] nobs -= 1 if not self._is_nan[i - 1]: add_x = wx[i - 1:i] xpx += add_x.T @ add_x xpy += add_x.T @ wy[i - 1:i] nobs += 1 self._fit_single(i, xpx, xpy, nobs, store, params_only, method) return RollingRegressionResults(self, store, self.k_constant, use_t, cov_type)
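# Hedged usage sketch (illustrative only): a rolling OLS with a
# 60-observation window. RollingOLS lives in statsmodels.regression.rolling
# in recent statsmodels versions; treat the import path and the synthetic
# data as assumptions.
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

rng = np.random.default_rng(0)
x = rng.standard_normal((500, 2))
y = x @ np.array([1.0, -0.5]) + rng.standard_normal(500)
res = RollingOLS(y, sm.add_constant(x), window=60).fit(method="inv")
print(res.params.shape)  # one row of coefficient estimates per observation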
def test_not_int_like(not_integer):
    with pytest.raises(TypeError):
        int_like(not_integer, "integer")
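# Hedged illustration (not part of the test module): the validators exercised
# above are typically called at the top of a function to normalize user
# input. rolling_mean below is a hypothetical helper written only to show the
# pattern; the behavior of int_like/bool_like is inferred from the tests.
def rolling_mean(x, window, skip_first=False):
    # int_like raises TypeError for incompatible inputs and returns a plain
    # Python int otherwise; bool_like behaves analogously for booleans.
    window = int_like(window, "window")
    skip_first = bool_like(skip_first, "skip_first")
    start = 1 if skip_first else 0
    return [sum(x[i:i + window]) / window
            for i in range(start, len(x) - window + 1)]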
def lagmat(x, maxlag, trim='forward', original='ex', use_pandas=False): """ Create 2d array of lags Parameters ---------- x : array_like, 1d or 2d data; if 2d, observation in rows and variables in columns maxlag : int all lags from zero to maxlag are included trim : str {'forward', 'backward', 'both', 'none'} or None * 'forward' : trim invalid observations in front * 'backward' : trim invalid initial observations * 'both' : trim invalid observations on both sides * 'none', None : no trimming of observations original : str {'ex','sep','in'} * 'ex' : drops the original array returning only the lagged values. * 'in' : returns the original array and the lagged values as a single array. * 'sep' : returns a tuple (original array, lagged values). The original array is truncated to have the same number of rows as the returned lagmat. use_pandas : bool, optional If true, returns a DataFrame when the input is a pandas Series or DataFrame. If false, return numpy ndarrays. Returns ------- lagmat : 2d array array with lagged observations y : 2d array, optional Only returned if original == 'sep' Examples -------- >>> from statsmodels.tsa.tsatools import lagmat >>> import numpy as np >>> X = np.arange(1,7).reshape(-1,2) >>> lagmat(X, maxlag=2, trim="forward", original='in') array([[ 1., 2., 0., 0., 0., 0.], [ 3., 4., 1., 2., 0., 0.], [ 5., 6., 3., 4., 1., 2.]]) >>> lagmat(X, maxlag=2, trim="backward", original='in') array([[ 5., 6., 3., 4., 1., 2.], [ 0., 0., 5., 6., 3., 4.], [ 0., 0., 0., 0., 5., 6.]]) >>> lagmat(X, maxlag=2, trim="both", original='in') array([[ 5., 6., 3., 4., 1., 2.]]) >>> lagmat(X, maxlag=2, trim="none", original='in') array([[ 1., 2., 0., 0., 0., 0.], [ 3., 4., 1., 2., 0., 0.], [ 5., 6., 3., 4., 1., 2.], [ 0., 0., 5., 6., 3., 4.], [ 0., 0., 0., 0., 5., 6.]]) Notes ----- When using a pandas DataFrame or Series with use_pandas=True, trim can only be 'forward' or 'both' since it is not possible to consistently extend index values. """ maxlag = int_like(maxlag, 'maxlag') use_pandas = bool_like(use_pandas, 'use_pandas') trim = string_like(trim, 'trim', optional=True, options=('forward', 'backward', 'both', 'none')) original = string_like(original, 'original', options=('ex', 'sep', 'in')) # TODO: allow list of lags additional to maxlag orig = x x = array_like(x, 'x', ndim=2, dtype=None) is_pandas = _is_using_pandas(orig, None) and use_pandas trim = 'none' if trim is None else trim trim = trim.lower() if is_pandas and trim in ('none', 'backward'): raise ValueError("trim cannot be 'none' or 'forward' when used on " "Series or DataFrames") dropidx = 0 nobs, nvar = x.shape if original in ['ex', 'sep']: dropidx = nvar if maxlag >= nobs: raise ValueError("maxlag should be < nobs") lm = np.zeros((nobs + maxlag, nvar * (maxlag + 1))) for k in range(0, int(maxlag + 1)): lm[maxlag - k:nobs + maxlag - k, nvar * (maxlag - k):nvar * (maxlag - k + 1)] = x if trim in ('none', 'forward'): startobs = 0 elif trim in ('backward', 'both'): startobs = maxlag else: raise ValueError('trim option not valid') if trim in ('none', 'backward'): stopobs = len(lm) else: stopobs = nobs if is_pandas: x = orig x_columns = x.columns if isinstance(x, DataFrame) else [x.name] columns = [str(col) for col in x_columns] for lag in range(maxlag): lag_str = str(lag + 1) columns.extend([str(col) + '.L.' 
+ lag_str for col in x_columns]) lm = DataFrame(lm[:stopobs], index=x.index, columns=columns) lags = lm.iloc[startobs:] if original in ('sep', 'ex'): leads = lags[x_columns] lags = lags.drop(x_columns, 1) else: lags = lm[startobs:stopobs, dropidx:] if original == 'sep': leads = lm[startobs:stopobs, :dropidx] if original == 'sep': return lags, leads else: return lags
def __init__(
    self,
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    stats: Sequence[str] = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
):
    data_arr = data
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data_arr = array_like(data, "data", maxdim=2)
    if data_arr.ndim == 1:
        data = pd.Series(data)
    numeric = bool_like(numeric, "numeric")
    categorical = bool_like(categorical, "categorical")
    include = []
    col_types = ""
    if numeric:
        include.append(np.number)
        col_types = "numeric"
    if categorical:
        include.append("category")
        col_types += "and " if col_types != "" else ""
        col_types += "categorical"
    if not numeric and not categorical:
        raise ValueError(
            "At least one of numeric and categorical must be True"
        )
    self._data = pd.DataFrame(data).select_dtypes(include)
    if self._data.shape[1] == 0:
        raise ValueError(
            f"Selecting {col_types} results in an empty DataFrame"
        )
    self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
    self._is_cat_like = [
        is_categorical_dtype(dt) for dt in self._data.dtypes
    ]

    if stats is not None:
        undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
        if undef:
            raise ValueError(
                f"{', '.join(undef)} are not known statistics"
            )
    self._stats = (
        list(DEFAULT_STATISTICS) if stats is None else list(stats)
    )
    self._ntop = int_like(ntop, "ntop")
    self._compute_top = "top" in self._stats
    self._compute_freq = "freq" in self._stats
    if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
        raise ValueError("top must be a non-negative integer")

    self._compute_perc = "percentiles" in self._stats
    self._percentiles = array_like(
        percentiles, "percentiles", maxdim=1, dtype="d"
    )
    self._percentiles = np.sort(self._percentiles)
    if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
        raise ValueError("percentiles must be distinct")
    if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
        raise ValueError("percentiles must be strictly between 0 and 100")

    # Expand special stats
    replacements = {
        "mode": ["mode", "mode_freq"],
        "ci": ["upper_ci", "lower_ci"],
        "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
        "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
        "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
        "percentiles": [f"{i}%" for i in percentiles],
    }

    for key in replacements:
        if key in self._stats:
            idx = self._stats.index(key)
            self._stats = (
                self._stats[:idx]
                + replacements[key]
                + self._stats[idx + 1:]
            )

    self._alpha = float_like(alpha, "alpha")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    self._use_t = bool_like(use_t, "use_t")
def lagmat2ds(x, maxlag0, maxlagex=None, dropex=0, trim='forward', use_pandas=False): """ Generate lagmatrix for 2d array, columns arranged by variables Parameters ---------- x : array_like, 2d 2d data, observation in rows and variables in columns maxlag0 : int for first variable all lags from zero to maxlag are included maxlagex : None or int max lag for all other variables all lags from zero to maxlag are included dropex : int (default is 0) exclude first dropex lags from other variables for all variables, except the first, lags from dropex to maxlagex are included trim : str * 'forward' : trim invalid observations in front * 'backward' : trim invalid initial observations * 'both' : trim invalid observations on both sides * 'none' : no trimming of observations use_pandas : bool, optional If true, returns a DataFrame when the input is a pandas Series or DataFrame. If false, return numpy ndarrays. Returns ------- lagmat : 2d array array with lagged observations, columns ordered by variable Notes ----- Inefficient implementation for unequal lags, implemented for convenience """ maxlag0 = int_like(maxlag0, 'maxlag0') maxlagex = int_like(maxlagex, 'maxlagex', optional=True) trim = string_like(trim, 'trim', optional=True, options=('forward', 'backward', 'both', 'none')) if maxlagex is None: maxlagex = maxlag0 maxlag = max(maxlag0, maxlagex) is_pandas = _is_using_pandas(x, None) if x.ndim == 1: if is_pandas: x = pd.DataFrame(x) else: x = x[:, None] elif x.ndim == 0 or x.ndim > 2: raise ValueError('Only supports 1 and 2-dimensional data.') nobs, nvar = x.shape if is_pandas and use_pandas: lags = lagmat(x.iloc[:, 0], maxlag, trim=trim, original='in', use_pandas=True) lagsli = [lags.iloc[:, :maxlag0 + 1]] for k in range(1, nvar): lags = lagmat(x.iloc[:, k], maxlag, trim=trim, original='in', use_pandas=True) lagsli.append(lags.iloc[:, dropex:maxlagex + 1]) return pd.concat(lagsli, axis=1) elif is_pandas: x = np.asanyarray(x) lagsli = [ lagmat(x[:, 0], maxlag, trim=trim, original='in')[:, :maxlag0 + 1] ] for k in range(1, nvar): lagsli.append( lagmat(x[:, k], maxlag, trim=trim, original='in')[:, dropex:maxlagex + 1]) return np.column_stack(lagsli)