def test_dataframe(self):
    df = pd.DataFrame(self.arr_2d)
    appended = tools.add_trend(df)
    expected = df.copy()
    expected["const"] = self.c
    assert_frame_equal(expected, appended)

    prepended = tools.add_trend(df, prepend=True)
    expected = df.copy()
    expected.insert(0, "const", self.c)
    assert_frame_equal(expected, prepended)

    df = pd.DataFrame(self.arr_2d)
    appended = tools.add_trend(df, trend="t")
    expected = df.copy()
    expected["trend"] = self.t
    assert_frame_equal(expected, appended)

    df = pd.DataFrame(self.arr_2d)
    appended = tools.add_trend(df, trend="ctt")
    expected = df.copy()
    expected["const"] = self.c
    expected["trend"] = self.t
    expected["trend_squared"] = self.t ** 2
    assert_frame_equal(expected, appended)
def test_mixed_recarray(self):
    dt = np.dtype([('c0', np.float64), ('c1', np.int8), ('c2', 'S4')])
    ra = np.array([(1.0, 1, 'aaaa'), (1.1, 2, 'bbbb')],
                  dtype=dt).view(np.recarray)
    added = tools.add_trend(ra, trend='ct')
    dt = np.dtype([('c0', np.float64), ('c1', np.int8), ('c2', 'S4'),
                   ('const', np.float64), ('trend', np.float64)])
    expected = np.array([(1.0, 1, 'aaaa', 1.0, 1.0),
                         (1.1, 2, 'bbbb', 1.0, 2.0)],
                        dtype=dt).view(np.recarray)
    assert_equal(added, expected)
def test_series(self):
    s = pd.Series(self.arr_1d)
    appended = tools.add_trend(s)
    expected = pd.DataFrame(s)
    expected["const"] = self.c
    assert_frame_equal(expected, appended)

    prepended = tools.add_trend(s, prepend=True)
    expected = pd.DataFrame(s)
    expected.insert(0, "const", self.c)
    assert_frame_equal(expected, prepended)

    s = pd.Series(self.arr_1d)
    appended = tools.add_trend(s, trend="ct")
    expected = pd.DataFrame(s)
    expected["const"] = self.c
    expected["trend"] = self.t
    assert_frame_equal(expected, appended)
def test_array(self):
    base = np.vstack((self.arr_1d, self.c, self.t, self.t ** 2)).T
    assert_equal(tools.add_trend(self.arr_1d), base[:, :2])
    assert_equal(tools.add_trend(self.arr_1d, trend='t'), base[:, [0, 2]])
    assert_equal(tools.add_trend(self.arr_1d, trend='ct'), base[:, :3])
    assert_equal(tools.add_trend(self.arr_1d, trend='ctt'), base)

    base = np.hstack((self.c[:, None], self.t[:, None],
                      self.t[:, None] ** 2, self.arr_2d))
    assert_equal(tools.add_trend(self.arr_2d, prepend=True),
                 base[:, [0, 3, 4]])
    assert_equal(tools.add_trend(self.arr_2d, trend='t', prepend=True),
                 base[:, [1, 3, 4]])
    assert_equal(tools.add_trend(self.arr_2d, trend='ct', prepend=True),
                 base[:, [0, 1, 3, 4]])
    assert_equal(tools.add_trend(self.arr_2d, trend='ctt', prepend=True),
                 base)
def test_duplicate_const(self):
    assert_raises(ValueError, tools.add_trend, x=self.c, trend="c",
                  has_constant="raise")
    assert_raises(ValueError, tools.add_trend, x=self.c, trend="ct",
                  has_constant="raise")
    df = pd.DataFrame(self.c)
    assert_raises(ValueError, tools.add_trend, x=df, trend="c",
                  has_constant="raise")
    assert_raises(ValueError, tools.add_trend, x=df, trend="ct",
                  has_constant="raise")

    skipped = tools.add_trend(self.c, trend="c")
    assert_equal(skipped, self.c[:, None])

    skipped_const = tools.add_trend(self.c, trend="ct",
                                    has_constant="skip")
    expected = np.vstack((self.c, self.t)).T
    assert_equal(skipped_const, expected)

    added = tools.add_trend(self.c, trend="c", has_constant="add")
    expected = np.vstack((self.c, self.c)).T
    assert_equal(added, expected)

    added = tools.add_trend(self.c, trend="ct", has_constant="add")
    expected = np.vstack((self.c, self.c, self.t)).T
    assert_equal(added, expected)
def test_recarray(self):
    recarray = pd.DataFrame(self.arr_2d).to_records(
        index=False, convert_datetime64=False)
    appended = tools.add_trend(recarray)
    expected = pd.DataFrame(self.arr_2d)
    expected['const'] = self.c
    expected = expected.to_records(index=False, convert_datetime64=False)
    assert_equal(expected, appended)

    prepended = tools.add_trend(recarray, prepend=True)
    expected = pd.DataFrame(self.arr_2d)
    expected.insert(0, 'const', self.c)
    expected = expected.to_records(index=False, convert_datetime64=False)
    assert_equal(expected, prepended)

    appended = tools.add_trend(recarray, trend='ctt')
    expected = pd.DataFrame(self.arr_2d)
    expected['const'] = self.c
    expected['trend'] = self.t
    expected['trend_squared'] = self.t ** 2
    expected = expected.to_records(index=False, convert_datetime64=False)
    assert_equal(expected, appended)
def _stackX(self, k_ar, trend):
    """
    Private method to build the RHS matrix for estimation.

    Columns are trend terms then lags.
    """
    endog = self.endog
    X = lagmat(endog, maxlag=k_ar, trim='both')
    k_trend = util.get_trendorder(trend)
    if k_trend:
        X = add_trend(X, prepend=True, trend=trend)
    self.k_trend = k_trend
    return X
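# Illustrative sketch (not from the original source): how a RHS matrix like
# the one _stackX builds is assembled from statsmodels' lagmat and add_trend.
# The toy series and shapes below are assumptions for demonstration only.
import numpy as np
from statsmodels.tsa.tsatools import add_trend, lagmat

endog = np.arange(10.0)
X = lagmat(endog, maxlag=2, trim='both')   # two lag columns, rows trimmed
X = add_trend(X, prepend=True, trend='c')  # constant prepended before lags
print(X.shape)  # (8, 3): nobs - maxlag rows; const + 2 lag columns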
def test_recarray(self):
    recarray = pd.DataFrame(self.arr_2d).to_records(index=False)
    with pytest.warns(FutureWarning, match="recarray support"):
        appended = tools.add_trend(recarray)
    expected = pd.DataFrame(self.arr_2d)
    expected['const'] = self.c
    expected = expected.to_records(index=False)
    assert_equal(expected, appended)

    with pytest.warns(FutureWarning, match="recarray support"):
        prepended = tools.add_trend(recarray, prepend=True)
    expected = pd.DataFrame(self.arr_2d)
    expected.insert(0, 'const', self.c)
    expected = expected.to_records(index=False)
    assert_equal(expected, prepended)

    with pytest.warns(FutureWarning, match="recarray support"):
        appended = tools.add_trend(recarray, trend='ctt')
    expected = pd.DataFrame(self.arr_2d)
    expected['const'] = self.c
    expected['trend'] = self.t
    expected['trend_squared'] = self.t ** 2
    expected = expected.to_records(index=False)
    assert_equal(expected, appended)
def _make_arma_exog(endog, exog, trend):
    k_trend = 1  # overwritten if no constant
    if exog is None and trend == 'c':  # constant only
        exog = np.ones((len(endog), 1))
    elif exog is not None and trend == 'c':  # constant plus exogenous
        exog = add_trend(exog, trend='c', prepend=True)
    elif exog is not None and trend == 'nc':
        # make sure it's not holding constant from last run
        if exog.var() == 0:
            exog = None
        k_trend = 0
    if trend == 'nc':
        k_trend = 0
    return k_trend, exog
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Adds a column of ones to an array

    Parameters
    ----------
    data : array-like
        ``data`` is the column-ordered design matrix
    prepend : bool
        If true, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if a constant is present. Using 'add' will duplicate the
        constant, if one is present.

    Returns
    -------
    data : array, recarray or DataFrame
        The original values with a constant (column of ones) as the first or
        last column. Returned value depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
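# A minimal usage sketch for add_constant (illustrative array, not from the
# source): the constant lands first by default and last with prepend=False.
import numpy as np
import statsmodels.api as sm

x = np.arange(6.0).reshape(3, 2)
print(sm.add_constant(x))                 # ones in the first column
print(sm.add_constant(x, prepend=False))  # ones in the last column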
def test_mixed_recarray(self):
    dt = np.dtype([("c0", np.float64), ("c1", np.int8), ("c2", "S4")])
    ra = np.array([(1.0, 1, "aaaa"), (1.1, 2, "bbbb")],
                  dtype=dt).view(np.recarray)
    with pytest.warns(FutureWarning, match="recarray support"):
        added = tools.add_trend(ra, trend="ct")
    dt = np.dtype([
        ("c0", np.float64),
        ("c1", np.int8),
        ("c2", "S4"),
        ("const", np.float64),
        ("trend", np.float64),
    ])
    expected = np.array([(1.0, 1, "aaaa", 1.0, 1.0),
                         (1.1, 2, "bbbb", 1.0, 2.0)],
                        dtype=dt).view(np.recarray)
    assert_equal(added, expected)
def get_var_endog(y, lags, trend='c'):
    """
    Make predictor matrix for VAR(p) process

    Z := (Z_0, ..., Z_T).T (T x Kp)
    Z_t = [1 y_t y_{t-1} ... y_{t - p + 1}] (Kp x 1)

    Ref: Lutkepohl p.70 (transposed)
    """
    nobs = len(y)
    # Ravel C order, need to put in descending order
    # (range replaces the Python 2-only xrange)
    Z = np.array([y[t - lags:t][::-1].ravel() for t in range(lags, nobs)])

    # Add constant, trend, etc.
    if trend != 'nc':
        Z = tsa.add_trend(Z, prepend=True, trend=trend)

    return Z
def get_var_endog(y, lags, trend="c", has_constant="skip"): """ Make predictor matrix for VAR(p) process Z := (Z_0, ..., Z_T).T (T x Kp) Z_t = [1 y_t y_{t-1} ... y_{t - p + 1}] (Kp x 1) Ref: Lutkepohl p.70 (transposed) has_constant can be 'raise', 'add', or 'skip'. See add_constant. """ nobs = len(y) # Ravel C order, need to put in descending order Z = np.array([y[t - lags : t][::-1].ravel() for t in range(lags, nobs)]) # Add constant, trend, etc. if trend != "nc": Z = tsa.add_trend(Z, prepend=True, trend=trend, has_constant=has_constant) return Z
def simulate_kpss(nobs, B, trend='c', rng=None):
    """
    Simulates the KPSS test statistic for nobs observations,
    performing B replications.
    """
    if rng is None:
        rng = RandomState()
        rng.seed(0)

    standard_normal = rng.standard_normal

    e = standard_normal((nobs, B))
    z = np.ones((nobs, 1))
    if trend == 'ct':
        z = add_trend(z, trend='t')
    zinv = np.linalg.pinv(z)
    trend_coef = zinv.dot(e)
    resid = e - z.dot(trend_coef)
    s = np.cumsum(resid, axis=0)
    lam = np.mean(resid ** 2.0, axis=0)
    kpss = 1 / (nobs ** 2.0) * np.sum(s ** 2.0, axis=0) / lam
    return kpss
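# Hedged usage sketch for simulate_kpss above: tabulating the simulated null
# distribution and reading off an approximate upper-tail critical value.
# The seed and sample sizes are arbitrary choices, and np/add_trend are
# assumed to be in scope as in the surrounding snippet.
import numpy as np
from numpy.random import RandomState

stats = simulate_kpss(nobs=250, B=1000, trend='c', rng=RandomState(12345))
crit_95 = np.percentile(stats, 95)  # approximate 5% critical value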
def get_var_endog(y, lags, trend='c', has_constant='skip'):
    """
    Make predictor matrix for VAR(p) process

    Z := (Z_0, ..., Z_T).T (T x Kp)
    Z_t = [1 y_t y_{t-1} ... y_{t - p + 1}] (Kp x 1)

    Ref: Lütkepohl p.70 (transposed)

    has_constant can be 'raise', 'add', or 'skip'. See add_constant.
    """
    nobs = len(y)
    # Ravel C order, need to put in descending order
    Z = np.array([y[t - lags:t][::-1].ravel() for t in range(lags, nobs)])

    # Add constant, trend, etc.
    trend = tsa.rename_trend(trend)
    if trend != 'n':
        Z = tsa.add_trend(Z, prepend=True, trend=trend,
                          has_constant=has_constant)

    return Z
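# Sketch of the Z matrix get_var_endog returns, assuming a K=2 system with
# p=2 lags on illustrative random data (and the module-level tsa alias the
# snippet above relies on).
import numpy as np

y = np.random.standard_normal((100, 2))  # T x K observations
Z = get_var_endog(y, lags=2, trend='c')
print(Z.shape)  # (98, 5): one constant column plus K*p lagged values per row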
def my_adfuller(y, maxlag=None, regression='c'):
    """
    Augmented Dickey-Fuller test (it reduces to the non-augmented version if
    maxlag=0: dY_t = phi*Y_{t-1} + eps_t)
    e.g. maxlag=1 model: dY_t = phi*Y_{t-1} + phi_1*dY_{t-1} + eps_t
    NOTE: this implementation does not allow adding a time-dependence term

    :param y: time series which wants to be checked for stationarity
    :param maxlag: maximum lag to include
    :param regression: str {'c','nc'} Constant to include in regression
        * 'c' : constant only (default)
        * 'nc' : no constant, no trend
    :return: dictionary with OLS results
    """
    y = np.asarray(y)  # ensure it is in array form
    ydiff = np.diff(y)  # get the differences (dY_t term)
    # lagged differences (dY_{t-k} terms)
    ydall = lagmat(ydiff[:, None], maxlag, trim='both', original='in')
    nobs = ydall.shape[0]  # number of observations
    # replace 0 ydiff with level of y (Y_{t-1} term)
    ydall[:, 0] = y[-nobs - 1:-1]
    ydshort = ydiff[-nobs:]  # level up the dimensions of ydiff to match nobs

    Y = ydshort  # endogenous var
    if regression != 'nc':
        X = add_trend(ydall[:, :maxlag + 1], regression)  # exogenous var
    else:
        X = ydall[:, :maxlag + 1]  # exogenous var

    # do the usual regression using OLS to estimate parameters
    result = my_OLS(Y, X)

    # Add a few other info to the results dictionary
    # define adfstat as tvalue of phi coefficient
    result['adfstat'] = result['tvalue'][0]
    result['maxlag'] = maxlag
    # Akaike information criterion using the statsmodels definition for
    # adfuller - note this is different to the definition in the AR(p) model
    result['aic'] = -2 * result['llf'] + 2 * result['df_model']
    # result['aic'] = np.log(result['sigma']) + 2.0 * (1.0 + result['df_model']) / result['nobs']  # AR(p) def
    return result
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.

    Notes
    -----
    When the input is recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        if _is_recarray(data):
            # deprecated: remove recarray support after 0.12
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    ndim = x.ndim
    if ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            if ndim == 1:
                raise ValueError("data is constant.")
            else:
                columns = np.arange(x.shape[1])
                cols = ",".join([str(c) for c in columns[is_nonzero_const]])
                raise ValueError(f"Column(s) {cols} are constant.")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
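# Sketch (illustrative frame, not from the source): with pandas input,
# add_constant delegates to add_trend and the new column comes back named
# 'const', prepended by default.
import pandas as pd

df = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [4.0, 5.0, 6.0]})
out = add_constant(df)
print(out.columns.tolist())  # ['const', 'x1', 'x2']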
def adfuller(x, maxlag=None, regression="c", autolag='AIC',
             store=False, regresults=False):
    """
    Augmented Dickey-Fuller unit root test

    The Augmented Dickey-Fuller test can be used to test for a unit root in
    a univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        data series
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}
    regression : {'c','ct','ctt','nc'}
        Constant and trend order to include in regression

        * 'c' : constant only (default)
        * 'ct' : constant and trend
        * 'ctt' : constant, and linear and quadratic trend
        * 'nc' : no constant, no trend
    autolag : {'AIC', 'BIC', 't-stat', None}
        * if None, then maxlag lags are used
        * if 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False
    regresults : bool, optional
        If True, the full regression results are returned. Default is False

    Returns
    -------
    adf : float
        Test statistic
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010)
    usedlag : int
        Number of lags used
    nobs : int
        Number of observations used for the ADF regression and calculation
        of the critical values
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010)
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a
    unit root, with the alternative that there is no unit root. If the
    pvalue is above a critical size, then we cannot reject that there is a
    unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is
    close to significant, then the critical values should be used to judge
    whether to reject the null.

    The autolag option and maxlag for it are described in Greene.

    Examples
    --------
    See example notebook

    References
    ----------
    .. [*] W. Greene.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [*] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [*] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution
        functions for unit-root and cointegration tests.  `Journal of
        Business and Economic Statistics` 12, 167-76.

    .. [*] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html
    """
    if regresults:
        store = True

    trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("regression option %s not understood" % regression)
    x = np.asarray(x)
    nobs = x.shape[0]

    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in')
    nobs = xdall.shape[0]  # pylint: disable=E1103

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        resstore = ResultsStore()
    if autolag:
        if regression != 'nc':
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1  # 1 for level
        # pylint: disable=E1103
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better
        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
                                       maxlag, autolag)
        else:
            icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS,
                                              startlag, maxlag, autolag,
                                              regresults=regresults)
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in')
        nobs = xdall.shape[0]  # pylint: disable=E1103
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != 'nc':
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    # adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are
    # integrated for orders higher than I(1), ie., not ADF but cointegration
    # tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {"1%": critvalues[0], "5%": critvalues[1],
                  "10%": critvalues[2]}
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = 'Augmented Dickey-Fuller Test Results'
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
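# Hedged usage sketch for adfuller above: a simulated random walk, where the
# unit-root null should typically not be rejected. With autolag=None the
# five-element tuple documented above is returned.
import numpy as np

rw = np.cumsum(np.random.standard_normal(500))
adf, pvalue, usedlag, nobs, crit = adfuller(rw, regression='c', maxlag=4,
                                            autolag=None)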
def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic',
          return_results=None):
    """Test for no-cointegration of a univariate equation

    The null hypothesis is no cointegration. Variables in y0 and y1 are
    assumed to be integrated of order 1, I(1).

    This uses the augmented Engle-Granger two-step cointegration test.
    Constant or trend is included in 1st stage regression, i.e. in
    cointegrating equation.

    Parameters
    ----------
    y0 : array_like, 1d
        first element in cointegrating vector
    y1 : array_like
        remaining elements in cointegrating vector
    trend : str {'c', 'ct'}
        trend term included in regression for cointegrating equation

        * 'c' : constant
        * 'ct' : constant and linear trend
        * also available quadratic trend 'ctt', and no constant 'nc'
    method : string
        currently only 'aeg' for augmented Engle-Granger test is available.
        default might change.
    maxlag : None or int
        keyword for `adfuller`, largest or given number of lags
    autolag : string
        keyword for `adfuller`, lag selection criterion.
    return_results : bool
        for future compatibility, currently only tuple available.
        If True, then a results instance is returned. Otherwise, a tuple
        with the test outcome is returned. Set `return_results=False` to
        avoid future changes in return.

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate, asymptotic p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels based on regression curve. This depends on the number of
        observations.

    Notes
    -----
    The Null hypothesis is that there is no cointegration, the alternative
    hypothesis is that there is cointegrating relationship. If the pvalue is
    small, below a critical size, then we can reject the hypothesis that
    there is no cointegrating relationship.

    P-values and critical values are obtained through regression surface
    approximation from MacKinnon 1994 and 2010.

    TODO: We could handle gaps in data by dropping rows with nans in the
    auxiliary regressions. Not implemented yet, currently assumes no nans
    and no gaps in time series.

    References
    ----------
    MacKinnon, J.G. 1994  "Approximate Asymptotic Distribution Functions for
        Unit-Root and Cointegration Tests." Journal of Business & Economics
        Statistics, 12.2, 167-76.
    MacKinnon, J.G. 2010.  "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics Working Papers 1227.
        http://ideas.repec.org/p/qed/wpaper/1227.html
    """
    trend = trend.lower()
    if trend not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("trend option %s not understood" % trend)
    y0 = np.asarray(y0)
    y1 = np.asarray(y1)
    if y1.ndim < 2:
        y1 = y1[:, None]
    nobs, k_vars = y1.shape
    k_vars += 1  # add 1 for y0

    if trend == 'nc':
        xx = y1
    else:
        xx = add_trend(y1, trend=trend, prepend=False)

    res_co = OLS(y0, xx).fit()

    if res_co.rsquared < 1 - np.sqrt(np.finfo(np.double).eps):
        res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=None,
                           regression='nc')
    else:
        import warnings
        warnings.warn("y0 and y1 are perfectly colinear.  Cointegration "
                      "test is not reliable in this case.")
        # Edge case where series are too similar
        res_adf = (0,)

    # no constant or trend, see egranger in Stata and MacKinnon
    if trend == 'nc':
        crit = [np.nan] * 3  # 2010 critical values not available
    else:
        crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1)
        # nobs - 1, the -1 is to match egranger in Stata, I don't know why.
        # TODO: check nobs or df = nobs - k

    pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars)
    return res_adf[0], pval_asy, crit
# In[ ]:

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-396-0c2468c93995> in <module>
      4 X3 = daily_returns[['ETH-USD','XRP-USD','LTC-USD']]
      5
----> 6 X1 = sm.add_constant(X1)
      7 X2 = sm.add_constant(X2)
      8 X3 = sm.add_constant(X3)

~/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tools/tools.py in add_constant(data, prepend, has_constant)
    294     if _is_using_pandas(data, None) or _is_recarray(data):
    295         from statsmodels.tsa.tsatools import add_trend
--> 296         return add_trend(data, trend='c', prepend=prepend, has_constant=has_constant)
    297
    298     # Special case for NumPy

~/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tsa/tsatools.py in add_trend(x, trend, prepend, has_constant)
     95         except:
     96             return False
---> 97         col_const = x.apply(safe_is_const, 0)
     98     else:
     99         ptp0 = np.ptp(np.asanyarray(x), axis=0)

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
   6898             If an array is passed, it must be the same length as the data.  The
   6899             list can contain any of the other types (except list).
-> 6900             Keys to group by on the pivot table column.  If an array is passed,
   6901             it is being used as the same manner as column values.
def trade_allocation(context, data, resOLS):
    olsPara = resOLS.params  # OLS parameters for [x, dx, 1]
    resid = resOLS.resid
    ols_var = np.dot(resid, np.transpose(resid)) / (resOLS.df_resid)

    # Replicate the OLS in the adf test to predict the movement of the
    # stationary portfolio
    # RecentData = TrainPort[-context.lookback/50:]  # only recent data matters
    RecentData = context.TrainPort
    xdiff = np.diff(RecentData)
    nobs = xdiff.shape[0]
    xdall = np.column_stack((RecentData[-nobs:], xdiff[:, None]))
    x = add_trend(xdall[:, :context.lag + 1], 'c')
    y_pred = np.dot(x, olsPara)
    y_actual = xdiff[-nobs:]
    profit_potential = y_pred[-1] / context.cost

    # y_mean = np.mean(TrainPort)
    # y_std = np.std(TrainPort)
    # Calculate the score of deviation from current value
    # dev_score = y_pred[-1]  # /np.sqrt(ols_var)
    # dev_score = -(y_pred[-1] - y_mean)/y_std
    # dev_score = -(TrainPort[-1] - np.mean(TrainPort))/np.std(TrainPort)

    curr_price = context.base_prices[-1, :]
    dollar_allocation = np.multiply(curr_price,
                                    context.weights) * profit_potential
    pct_allocation = (context.leverage_limit * dollar_allocation /
                      np.sum(np.abs(dollar_allocation)))
    pct_allocation = np.asarray(pct_allocation)
    pct_allocation = pct_allocation[0, :]

    # print '-'*20
    # print 'ADF test statistics: %f' %adfstat
    # print 'ADF test critvalues: %s' %str(critvalues)
    # print 'ADF pvalue: %f' %pvalue
    # print ' '
    # print dir(resOLS)
    # print 'Predic value: %s' %str(y_pred[-5:])
    # print 'Fitted value: %s' %str(resOLS.fittedvalues[-5:])
    # print 'Actual value: %s' %str(y_actual[-5:])
    # print ' '
    # print 'Latest value of TrainPort: %f' %TrainPort[-1]
    # print 'Mean value of TrainPort: %f' %np.mean(TrainPort)
    # print 'Std Value of TrainPort: %f' %np.std(TrainPort)
    # print ' '
    # print 'avg of actual change: %f' %np.mean(abs(y_actual))
    # print 'std of actual change: %f' %np.std(y_actual)
    # print ' '
    # print 'avg of pred change: %f' %np.mean(abs(y_pred))
    # print 'std of pred change: %f' %np.std(y_pred)
    # print ' '
    # print 'The predicted bar: %f' %y_pred[-1]
    # print 'The current bar: %f' %y_actual[-1]
    # print ' '
    # print 'The ols std: %f' %np.sqrt(ols_var)
    # print 'mse_total %f' %resOLS.mse_total
    # print 'OLS rsquared: %f' %resOLS.rsquared
    # print 'profit_potential: %f' %profit_potential
    # print 'dev_score: %f' %dev_score
    # print 'Dollar Allocation: %s' %str(dollar_allocation)

    # if abs(y_pred[-1]) > 2*context.cost/abs(TrainPort[-1]):
    #     return pct_allocation
    # elif abs(y_pred[-1]) < context.cost/abs(TrainPort[-1]):
    #     return pct_allocation*0
    # else:
    #     return None

    # Trading window is determined by the abs(dev_score)
    if abs(profit_potential) < context.dev_lower_threshold:
        # Liquidate all positions if fall below lower threshold
        return pct_allocation * 0.
    elif abs(profit_potential) < context.dev_upper_threshold:
        # Do nothing
        return None
    else:
        # Rebalance if the dev is above the upper threshold.
        return pct_allocation
def test_dataframe_duplicate(self):
    df = pd.DataFrame(self.arr_2d, columns=["const", "trend"])
    tools.add_trend(df, trend="ct")
    tools.add_trend(df, trend="ct", prepend=True)
def adfuller(
    x,
    maxlag=None,
    regression="c",
    autolag="AIC",
    store=False,
    regresults=False,
):
    """
    Augmented Dickey-Fuller unit root test.

    The Augmented Dickey-Fuller test can be used to test for a unit root in
    a univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        The data series to test.
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}.
    regression : {"c","ct","ctt","nc"}
        Constant and trend order to include in regression.

        * "c" : constant only (default).
        * "ct" : constant and trend.
        * "ctt" : constant, and linear and quadratic trend.
        * "nc" : no constant, no trend.
    autolag : {"AIC", "BIC", "t-stat", None}
        Method to use when automatically determining the lag length among
        the values 0, 1, ..., maxlag.

        * If "AIC" (default) or "BIC", then the number of lags is chosen
          to minimize the corresponding information criterion.
        * "t-stat" based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
        * If None, then the number of included lags is set to maxlag.
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False.
    regresults : bool, optional
        If True, the full regression results are returned. Default is False.

    Returns
    -------
    adf : float
        The test statistic.
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010).
    usedlag : int
        The number of lags used.
    nobs : int
        The number of observations used for the ADF regression and
        calculation of the critical values.
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010).
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes.

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a
    unit root, with the alternative that there is no unit root. If the
    pvalue is above a critical size, then we cannot reject that there is a
    unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is
    close to significant, then the critical values should be used to judge
    whether to reject the null.

    The autolag option and maxlag for it are described in Greene.

    References
    ----------
    .. [1] W. Greene.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [2] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [3] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution
        functions for unit-root and cointegration tests.  `Journal of
        Business and Economic Statistics` 12, 167-76.

    .. [4] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html

    Examples
    --------
    See example notebook
    """
    x = array_like(x, "x")
    maxlag = int_like(maxlag, "maxlag", optional=True)
    regression = string_like(regression, "regression",
                             options=("c", "ct", "ctt", "nc"))
    autolag = string_like(autolag, "autolag", optional=True,
                          options=("aic", "bic", "t-stat"))
    store = bool_like(store, "store")
    regresults = bool_like(regresults, "regresults")

    if regresults:
        store = True

    trenddict = {None: "nc", 0: "c", 1: "ct", 2: "ctt"}
    if regression is None or isinstance(regression, int):
        regression = trenddict[regression]
    regression = regression.lower()
    nobs = x.shape[0]

    ntrend = len(regression) if regression != "nc" else 0
    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
        # -1 for the diff
        maxlag = min(nobs // 2 - ntrend - 1, maxlag)
        if maxlag < 0:
            raise ValueError("sample size is too short to use selected "
                             "regression component")
    elif maxlag > nobs // 2 - ntrend - 1:
        raise ValueError("maxlag must be less than (nobs/2 - 1 - ntrend) "
                         "where n trend is the number of included "
                         "deterministic regressors")
    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim="both", original="in")
    nobs = xdall.shape[0]

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        from statsmodels.stats.diagnostic import ResultsStore

        resstore = ResultsStore()
    if autolag:
        if regression != "nc":
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1  # 1 for level
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better
        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
                                       maxlag, autolag)
        else:
            icbest, bestlag, alres = _autolag(
                OLS,
                xdshort,
                fullRHS,
                startlag,
                maxlag,
                autolag,
                regresults=regresults,
            )
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim="both", original="in")
        nobs = xdall.shape[0]
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != "nc":
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    # adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are
    # integrated for orders higher than I(1), ie., not ADF but cointegration
    # tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {
        "1%": critvalues[0],
        "5%": critvalues[1],
        "10%": critvalues[2],
    }
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = "Augmented Dickey-Fuller Test Results"
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
def engle_granger_coef(self, y0, y1, trend='c', method='aeg', maxlag=None,
                       autolag='aic', normalize=True, debug=True):
    """
    Engle-Granger Cointegration Coefficient Calculations.

    This equation takes a linear combination of two L(1) time series to
    create a L(0) or stationary time series.

    This is useful if the two series have a similar stochastic long-term
    trend, as it eliminates them and allows you to work with the stationary
    combination.

    Parameters
    ----------
    y0 : array_like
        The first element in cointegrated system. Must be 1-d.
    y1 : array_like
        The remaining elements in cointegrated system.
    trend : str {'c', 'ct'}
        The trend term included in regression for cointegrating equation.

        * 'c' : constant.
        * 'ct' : constant and linear trend.
        * also available quadratic trend 'ctt', and no constant 'nc'.
    method : {'aeg'}
        Only 'aeg' (augmented Engle-Granger) is available.
    maxlag : None or int
        Argument for `adfuller`, largest or given number of lags.
    autolag : str
        Argument for `adfuller`, lag selection criterion.

        * If None, then maxlag lags are used without lag search.
        * If 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion.
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test.
    normalize : bool, optional
        As there are infinite scalar combinations that will produce the
        factor, this normalizes the first entry to be 1.
    debug : bool, optional
        Checks if the series has a possible cointegration factor using the
        Engle-Granger Cointegration Test.

    Returns
    -------
    coefs : array
        A vector that will create a L(0) time series if a combination
        exists.

    Notes
    -----
    The series should be checked independently for their integration order.
    The series must be L(1) to get consistent results. You can check this
    by using the int_order function.

    References
    ----------
    .. [1] MacKinnon, J.G. 1994  "Approximate Asymptotic Distribution
        Functions for Unit-Root and Cointegration Tests." Journal of
        Business & Economics Statistics, 12.2, 167-76.
    .. [2] MacKinnon, J.G. 2010.  "Critical Values for Cointegration
        Tests." Queen's University, Dept of Economics Working Papers 1227.
        http://ideas.repec.org/p/qed/wpaper/1227.html
    .. [3] Hamilton, J. D. (1994). Time series analysis (Vol. 2, pp.
        690-696). Princeton, NJ: Princeton university press.
    """
    if debug:
        coint_t, pvalue, crit_value = coint(y0, y1, trend, method, maxlag,
                                            autolag)
        if pvalue >= .10:
            print('The null hypothesis cannot be rejected')

    trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt'))
    nobs, k_vars = y1.shape

    y1 = add_trend(y1, trend=trend, prepend=False)

    eg_model = OLS(y0, y1).fit()
    coefs = eg_model.params[0:k_vars]

    if normalize:
        coefs = coefs / coefs[0]

    return coefs
def fit(self, use_mle: bool = False, disp: bool = False) -> "ThetaModelResults":
    r"""
    Estimate model parameters.

    Parameters
    ----------
    use_mle : bool, default False
        Estimate the parameters using MLE by fitting an ARIMA(0,1,1) with
        a drift. If False (the default), estimates parameters using OLS
        of a constant and a time-trend and by fitting a SES to the model
        data.
    disp : bool, default False
        Display iterative output from fitting the model.

    Notes
    -----
    When using MLE, the parameters are estimated from the ARIMA(0,1,1)

    .. math::

       X_t = X_{t-1} + b_0 + (\alpha-1)\epsilon_{t-1} + \epsilon_t

    When estimating the model using 2-step estimation, the model
    parameters are estimated using the OLS regression

    .. math::

       X_t = a_0 + b_0 (t-1) + \eta_t

    and the SES

    .. math::

       \tilde{X}_{t+1} = \alpha X_{t} + (1-\alpha)\tilde{X}_{t}

    Returns
    -------
    ThetaModelResult
        Model results and forecasting
    """
    if self._deseasonalize and self._use_test:
        self._test_seasonality()
    y, seasonal = self._deseasonalize_data()
    if use_mle:
        mod = SARIMAX(y, order=(0, 1, 1), trend="c")
        res = mod.fit(disp=disp)
        params = np.asarray(res.params)
        alpha = params[1] + 1
        if alpha > 1:
            alpha = 0.9998
            res = mod.fit_constrained({"ma.L1": alpha - 1})
            params = np.asarray(res.params)
        b0 = params[0]
        sigma2 = params[-1]
        one_step = res.forecast(1) - b0
    else:
        # 2-step estimation: OLS for the drift, SES for alpha
        ct = add_trend(y, "ct", prepend=True)[:, :2]
        ct[:, 1] -= 1
        _, b0 = np.linalg.lstsq(ct, y, rcond=None)[0]
        res = ExponentialSmoothing(
            y, initial_level=y[0], initialization_method="known"
        ).fit(disp=disp)
        alpha = res.params[0]
        sigma2 = None
        one_step = res.forecast(1)
    return ThetaModelResults(
        b0, alpha, sigma2, one_step, seasonal, use_mle, self
    )
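# Hedged usage sketch for the fit method above, assuming ThetaModel from
# statsmodels.tsa.forecasting.theta and illustrative monthly data.
import numpy as np
import pandas as pd
from statsmodels.tsa.forecasting.theta import ThetaModel

idx = pd.date_range("2000-01-31", periods=120, freq="M")
y = pd.Series(10 + 0.05 * np.arange(120) + np.random.standard_normal(120),
              index=idx)
res = ThetaModel(y, period=12).fit(use_mle=False)
print(res.forecast(steps=12))  # twelve forecasts from the fitted model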
def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic', return_results=None): """Test for no-cointegration of a univariate equation The null hypothesis is no cointegration. Variables in y0 and y1 are assumed to be integrated of order 1, I(1). This uses the augmented Engle-Granger two-step cointegration test. Constant or trend is included in 1st stage regression, i.e. in cointegrating equation. **Warning:** The autolag default has changed compared to statsmodels 0.8. In 0.8 autolag was always None, no the keyword is used and defaults to 'aic'. Use `autolag=None` to avoid the lag search. Parameters ---------- y1 : array_like, 1d first element in cointegrating vector y2 : array_like remaining elements in cointegrating vector trend : str {'c', 'ct'} trend term included in regression for cointegrating equation * 'c' : constant * 'ct' : constant and linear trend * also available quadratic trend 'ctt', and no constant 'nc' method : string currently only 'aeg' for augmented Engle-Granger test is available. default might change. maxlag : None or int keyword for `adfuller`, largest or given number of lags autolag : string keyword for `adfuller`, lag selection criterion. * if None, then maxlag lags are used without lag search * if 'AIC' (default) or 'BIC', then the number of lags is chosen to minimize the corresponding information criterion * 't-stat' based choice of maxlag. Starts with maxlag and drops a lag until the t-statistic on the last lag length is significant using a 5%-sized test return_results : bool for future compatibility, currently only tuple available. If True, then a results instance is returned. Otherwise, a tuple with the test outcome is returned. Set `return_results=False` to avoid future changes in return. Returns ------- coint_t : float t-statistic of unit-root test on residuals pvalue : float MacKinnon's approximate, asymptotic p-value based on MacKinnon (1994) crit_value : dict Critical values for the test statistic at the 1 %, 5 %, and 10 % levels based on regression curve. This depends on the number of observations. Notes ----- The Null hypothesis is that there is no cointegration, the alternative hypothesis is that there is cointegrating relationship. If the pvalue is small, below a critical size, then we can reject the hypothesis that there is no cointegrating relationship. P-values and critical values are obtained through regression surface approximation from MacKinnon 1994 and 2010. If the two series are almost perfectly collinear, then computing the test is numerically unstable. However, the two series will be cointegrated under the maintained assumption that they are integrated. In this case the t-statistic will be set to -inf and the pvalue to zero. TODO: We could handle gaps in data by dropping rows with nans in the auxiliary regressions. Not implemented yet, currently assumes no nans and no gaps in time series. References ---------- MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for Unit-Root and Cointegration Tests." Journal of Business & Economics Statistics, 12.2, 167-76. MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's University, Dept of Economics Working Papers 1227. 
    http://ideas.repec.org/p/qed/wpaper/1227.html
    """
    trend = trend.lower()
    if trend not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("trend option %s not understood" % trend)
    y0 = np.asarray(y0)
    y1 = np.asarray(y1)
    if y1.ndim < 2:
        y1 = y1[:, None]
    nobs, k_vars = y1.shape
    k_vars += 1  # add 1 for y0

    if trend == 'nc':
        xx = y1
    else:
        xx = add_trend(y1, trend=trend, prepend=False)

    res_co = OLS(y0, xx).fit()
    OLS_params = res_co.params
    if res_co.rsquared < 1 - 100 * SQRTEPS:
        res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=autolag,
                           regression='nc')
    else:
        # Edge case where the series are (almost) perfectly collinear
        res_adf = (-np.inf,)

    pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars)
    return res_adf[0], pval_asy, OLS_params
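# A minimal usage sketch for the coint variant above, assuming the
# module-level names it relies on (OLS, adfuller, mackinnonp, add_trend,
# SQRTEPS) are imported; the simulated pair is illustrative only.
import numpy as np

rng = np.random.default_rng(12345)
common = np.cumsum(rng.normal(size=500))   # shared I(1) component
y0 = 2.0 * common + rng.normal(size=500)   # both series track the trend
y1 = common + rng.normal(size=500)

t_stat, pvalue, params = coint(y0, y1, trend='c')
print(t_stat, pvalue)  # a small p-value rejects "no cointegration"
print(params)          # first-stage cointegrating regression estimates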
import pandas as pd
from matplotlib import pyplot
from statsmodels.tsa.tsatools import add_trend

# Series.from_csv was removed from pandas; read the date-indexed series
# with read_csv and squeeze the single data column instead.
dat = pd.read_csv(
    'data/Fuzzy_data_resource_JobId_6336594489_5minutes_todeseasonal.csv',
    header=0, index_col=0, parse_dates=True,
).squeeze("columns")

date_range = pd.date_range(dat.index.min(), dat.index.max(), freq="10min")

# Append a constant column to the observations ('c' is the default trend).
result = add_trend(dat.values)

pyplot.plot(result)
pyplot.show()
def test_trend_n(self):
    assert_equal(tools.add_trend(self.arr_1d, 'n'), self.arr_1d)
    assert tools.add_trend(self.arr_1d, 'n') is not self.arr_1d
    assert_equal(tools.add_trend(self.arr_2d, 'n'), self.arr_2d)
    assert tools.add_trend(self.arr_2d, 'n') is not self.arr_2d
def dynamic_coefs(self, y0, y1, n_lags=None, trend='c', normalize=True,
                  reverse=False):
    """
    Dynamic cointegration coefficient calculation.

    This takes a linear combination of multiple I(1) time series to
    create an I(0), i.e. stationary, time series. This is useful if the
    series share a similar stochastic long-term trend, as the combination
    eliminates that common trend. Unlike Engle-Granger, this method uses
    dynamic regression - an equal number of lags and leads of the
    differences of the series - to create a more accurate parameter
    vector. It builds the lag-lead matrices for the given lag values, or
    searches for the best number of lags using BIC. Once the optimal
    value is found, the estimation is run and the coefficients are
    returned. The selected lag is stored in the ``max_val`` attribute and
    the fitted regression in ``model``.

    Parameters
    ----------
    y0 : array_like
        The first element in the cointegrated system. Must be 1-d.
    y1 : array_like
        The remaining elements in the cointegrated system.
    n_lags : int, array, None
        Determines which lag lengths are searched for the best vector.

        * int: the calculation is done for only that lag.
        * array: an array of two integers; the first value is where the
          search begins and the second is where it ends.
        * None: the search runs from 2 to the ceiling of the cube root of
          the number of observations divided by two, plus two, to ensure
          at least one value is searched, i.e.
          last_lag = ceil(n_obs**(1/3) / 2) + 2.
    trend : str {'c', 'ct'}
        The trend term included in the regression for the cointegrating
        equation.

        * 'c' : constant.
        * 'ct' : constant and linear trend.
        * also available quadratic trend 'ctt', and no constant 'nc'.
    normalize : bool
        If True, the first entry in the parameter vector is normalized to
        one and everything else is divided by the first entry. This is
        because any cointegrating vector multiplied by a scalar is still
        a cointegrating vector.
    reverse : bool
        The series must be ordered from the latest observations to the
        earliest in order to calculate the differences. Set this to
        reverse the ordering of your data points.

    Returns
    -------
    coefs : array
        A vector that will create an I(0) time series if such a
        combination exists.

    Notes
    -----
    The data must go from the latest observations to the earliest. If
    not, the coefficient vector will have the opposite sign.

    The series should be checked independently for their integration
    order. The series must be I(1) to get consistent results. You can
    check this by using the int_order function.

    References
    ----------
    .. [1] Stock, J. H., & Watson, M. W. (1993). A simple estimator of
       cointegrating vectors in higher order integrated systems.
       Econometrica: Journal of the Econometric Society, 783-820.
    .. [2] Hamilton, J. D. (1994). Time series analysis (Vol. 2,
       pp. 690-696). Princeton, NJ: Princeton University Press.
""" self.bics = [] self.max_val = [] self.model = '' self.coefs = [] trend = string_like(trend, 'trend', options=('c', 'nc', 'ct', 'ctt')) y1 = add_trend(y1, trend=trend, prepend=True) y1 = y1.reset_index(drop=True) if reverse: y0, y1 = y0[::-1], y1[::-1] if _is_using_pandas(y0, y1): columns = list(y1.columns) else: # Need to check if NumPy, because I can only support those two n_obs, k = y1.shape columns = [f'Var_{x}' for x in range(k)] y0, y1 = pd.DataFrame(y0), pd.DataFrame(y1) if n_lags is None: n_obs, k = y1.shape dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0)) for lag in range(2, int(np.ceil(n_obs**(1 / 3) / 2) + 2)): df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward')) cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns)) df1 = df1.rename(columns=cols) df2 = pd.DataFrame(lagmat(dta, lag, trim='forward')) lags_leads = pd.concat([df1, df2], axis=1, join='outer') lags_leads = lags_leads.drop(list(range(0, lag))) lags_leads = lags_leads.reset_index(drop=True) lags_leads = lags_leads.drop( list(range(len(lags_leads) - lag, len(lags_leads)))) lags_leads = lags_leads.reset_index(drop=True) data_y = y0.drop(list(range(0, lag))).reset_index(drop=True) data_y = data_y.drop( list(range(len(data_y) - lag - 1, len(data_y)))) data_y = data_y.reset_index(drop=True) self.bics.append([OLS(data_y, lags_leads).fit().bic, lag]) self.max_val = max(self.bics, key=lambda item: item[0]) self.max_val = self.max_val[1] elif len(n_lags) == 2: start, end = int(n_lags[0]), int(n_lags[1]) n_obs, k = y1.shape dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0)) for lag in range(start, end + 1): df1 = pd.DataFrame(lagmat(dta, lag + 1, trim='backward')) cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns)) df1 = df1.rename(columns=cols) df2 = pd.DataFrame(lagmat(dta, lag, trim='forward')) lags_leads = pd.concat([df1, df2], axis=1, join='outer') lags_leads = lags_leads.drop(list(range(0, lag))) lags_leads = lags_leads.reset_index(drop=True) lags_leads = lags_leads.drop( list(range(len(lags_leads) - lag, len(lags_leads)))) lags_leads = lags_leads.reset_index(drop=True) data_y = y0.drop(list(range(0, lag))).reset_index(drop=True) data_y = data_y.drop( list(range(len(data_y) - lag - 1, len(data_y)))) data_y = data_y.reset_index(drop=True) self.bics.append([OLS(data_y, lags_leads).fit().bic, lag]) self.max_val = max(self.bics, key=lambda item: item[0]) self.max_val = self.max_val[1] elif len(n_lags) == 1: self.max_val = int(n_lags) else: raise ('Make sure your lags are in one of the required forms.') dta = pd.DataFrame(np.diff(a=y1, n=1, axis=0)) # Create a matrix of the lags, this also retains the original matrix, # which is why max_val + 1 df1 = pd.DataFrame(lagmat(dta, self.max_val + 1, trim='backward')) # Rename the columns, as we need to keep track of them. We know the # original will be the final values cols = dict(zip(list(df1.columns)[::-1][0:k][::-1], columns)) df1 = df1.rename(columns=cols) # Do the same, but these are leads, this does not keep the # original matrix, thus max_val df2 = pd.DataFrame(lagmat(dta, self.max_val, trim='forward')) # There are missing data due to the lags and leads, we concat # the frames and drop the values of which are missing. 
lags_leads = pd.concat([df1, df2], axis=1, join='outer') lags_leads = lags_leads.drop(list(range(0, self.max_val))) lags_leads = lags_leads.reset_index(drop=True) lags_leads = lags_leads.drop( list(range(len(lags_leads) - self.max_val, len(lags_leads)))) lags_leads.reset_index(drop=True) # We also need to do this for the endog values, we need to # drop 1 extra due to a loss from first differencing. # This will be at the end of the matrix. data_y = y0.drop(list(range(0, self.max_val))).reset_index(drop=True) data_y = data_y.drop( list(range(len(data_y) - self.max_val - 1, len(data_y)))) data_y = data_y.reset_index(drop=True) self.model = OLS(data_y, lags_leads).fit() self.coefs = self.model.params[list(y1.columns)] if normalize: self.coefs = self.coefs / self.coefs[0] return (self.coefs)
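# A minimal usage sketch for dynamic_coefs, assuming it is attached to
# some estimator class and that the module-level names it relies on
# (OLS, lagmat, add_trend, string_like, _is_using_pandas) are imported.
# The DOLS holder class and the simulated pair are hypothetical, purely
# for illustration.
import numpy as np
import pandas as pd


class DOLS:
    # Reuse the function defined above as a bound method.
    dynamic_coefs = dynamic_coefs


rng = np.random.default_rng(0)
common = np.cumsum(rng.normal(size=300))  # shared I(1) trend
y0 = pd.Series(2.0 * common + rng.normal(size=300))
y1 = pd.DataFrame({'x': common + rng.normal(size=300)})

# An integer n_lags skips the BIC search and uses that many lags/leads.
coefs = DOLS().dynamic_coefs(y0, y1, n_lags=4, normalize=False)
print(coefs)  # estimated coefficients on the constant and on 'x'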