def test_tstat(self):
    """
    Check that ``_autolag_ols`` and ``_autolag`` pick the same lag.

    For each of ``self.inflation`` and ``self.y`` a 12-lag matrix is built
    with ``lagmat`` and the 't-stat' rule is run through both routines.
    The values returned by ``_autolag_ols`` must be scalars and the selected
    lag must equal the one selected by ``_autolag``.
    """
    # Attributes are looked up one at a time so the second series is only
    # touched after the first one has been fully checked.
    for attr in ('inflation', 'y'):
        series = getattr(self, attr)
        exog, endog = lagmat(series, 12, original='sep', trim='both')
        reference = _autolag(OLS, endog, exog, 1, 11, 't-stat')
        candidate = _autolag_ols(endog, exog, 0, 12, 't-stat')
        assert np.isscalar(candidate[0])
        assert np.isscalar(candidate[1])
        assert reference[1] == candidate[1]
def test_tstat(self):
    """
    Compare lag selection by ``_autolag_ols`` against ``_autolag``.

    Runs the "t-stat" rule on 12 lags of ``self.inflation`` and then of
    ``self.y``; both routines have to agree on the selected lag, and the
    pair returned by ``_autolag_ols`` has to consist of scalars.
    """

    def check(series):
        # One comparison of the two implementations on a single series.
        exog, endog = lagmat(series, 12, original="sep", trim="both")
        expected_lag = _autolag(OLS, endog, exog, 1, 11, "t-stat")[1]
        stat, lag = _autolag_ols(endog, exog, 0, 12, "t-stat")
        assert np.isscalar(stat)
        assert np.isscalar(lag)
        assert expected_lag == lag

    check(self.inflation)
    check(self.y)
def _df_select_lags(y, trend, max_lags, method):
    """
    Pick the lag length for a DF-like regression.

    Parameters
    ----------
    y : ndarray, (nobs,)
        The data used for lag selection
    trend : str, {'nc','c','ct','ctt'}
        The trend order. Anything other than 'nc' is forwarded to
        ``add_trend`` and the resulting columns are prepended.
    max_lags : int or None
        The largest number of lags to consider. The estimation sample is
        trimmed by max_lags for every candidate model. If None,
        ceil(12 * (nobs/100)**(1/4)) is used.
    method : str, {'AIC','BIC','t-stat'}
        The selection rule, passed through to ``_autolag``

    Returns
    -------
    best_ic : float
        The first value returned by ``_autolag``
    best_lag : int
        The lag reported by ``_autolag`` less the index of the first lag
        column
    all_res : list
        The per-model results returned by ``_autolag`` with
        ``regresults=True``

    Notes
    -----
    The regressand is the differenced series. The regressors are the lagged
    level of y followed by lagged differences, with trend terms in front
    when requested.
    """
    total_obs = y.shape[0]
    dy = diff(y)
    if max_lags is None:
        max_lags = int(ceil(12. * power(total_obs / 100., 1 / 4.)))

    lag_block = lagmat(dy[:, None], max_lags, trim='both', original='in')
    sample = lag_block.shape[0]
    # Column 0 holds the contemporaneous difference; overwrite it with the
    # lagged level of y.
    lag_block[:, 0] = y[-sample - 1:-1]
    regressand = dy[-sample:]

    if trend == 'nc':
        design = lag_block
    else:
        design = add_trend(lag_block, trend, prepend=True)

    # Index of the first column that is a genuine lag: the trend columns
    # and the level column come before it.
    first_lag = design.shape[1] - lag_block.shape[1] + 1
    ic_best, raw_lag, all_res = _autolag(OLS, regressand, design, first_lag,
                                         max_lags, method, regresults=True)
    # Columns 0..first_lag-1 are not lags, so remove them from the count.
    return ic_best, raw_lag - first_lag, all_res
def _autolag_ols(endog, exog, startlag, maxlag, method, modargs=(), regresults=False): """ Returns the results for the lag length that maximizes the info criterion. Parameters ---------- endog : {ndarray, Series} nobs array containing endogenous variable exog : {ndarray, DataFrame} nobs by (startlag + maxlag) array containing lags and possibly other variables startlag : int The first zero-indexed column to hold a lag. See Notes. maxlag : int The highest lag order for lag length selection. method : {'aic', 'bic', 't-stat'} aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag regresults : bool, optional Flag indicating to return optional return results Returns ------- bestlag : int The lag length that maximizes the information criterion. Notes ----- Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are assumed to be in contiguous columns from low to high lag length with the highest lag in the last column. 
""" method = method.lower() if regresults: return _autolag(OLS, endog, exog, startlag, maxlag, method, regresults=regresults) resid = squeeze(endog.copy()) x = exog[:, startlag:].copy() sigma2 = empty(maxlag + 1) tstat = empty(maxlag + 1) if len(exog) > 0 and startlag > 0: _x = exog[:, :startlag] resid -= _x.dot(pinv(_x).dot(resid)) x -= _x.dot(pinv(_x).dot(x)) sigma2[0] = (resid**2).mean() tstat[0] = inf for i in range(maxlag): _x = x[:, i:i + 1] xpx = _x.T.dot(_x) beta = squeeze(_x.T.dot(resid) / xpx) resid -= squeeze(beta * _x) x[:, i + 1:] -= _x.dot(_x.T.dot(x[:, i + 1:]) / xpx) sigma2[i + 1] = (resid**2).mean() tstat[i + 1] = beta / sqrt(sigma2[i + 1] / xpx) nobs = float(resid.shape[0]) llf = -nobs / 2.0 * (log(2 * pi) + log(sigma2) + 1) if method == 'aic': crit = -2 * llf + 2 * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 'bic': crit = -2 * llf + log(nobs) * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 't-stat': stop = 1.6448536269514722 large_tstat = abs(tstat) >= stop lag = int(squeeze(max(argwhere(large_tstat)))) icbest = float(tstat[lag]) else: raise ValueError('Unknown method') return icbest, lag
def _autolag_ols(endog, exog, startlag, maxlag, method, modargs=(), regresults=False): """ Returns the results for the lag length that maximizes the info criterion. Parameters ---------- endog : array-like nobs array containing endogenous variable exog : array-like nobs by (startlag + maxlag) array containing lags and possibly other variables startlag : int The first zero-indexed column to hold a lag. See Notes. maxlag : int The highest lag order for lag length selection. method : {'aic', 'bic', 't-stat'} aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag regresults : bool, optional Flag indicating to return optional return results Returns ------- bestlag : int The lag length that maximizes the information criterion. Notes ----- Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are assumed to be in contiguous columns from low to high lag length with the highest lag in the last column. 
""" method = method.lower() if regresults: return _autolag(OLS, endog, exog, startlag, maxlag, method, regresults=regresults) resid = squeeze(endog.copy()) x = exog[:, startlag:].copy() sigma2 = empty(maxlag + 1) tstat = empty(maxlag + 1) if len(exog) > 0 and startlag > 0: _x = exog[:, :startlag] resid -= _x.dot(pinv(_x).dot(resid)) x -= _x.dot(pinv(_x).dot(x)) sigma2[0] = (resid ** 2).mean() tstat[0] = inf for i in range(maxlag): _x = x[:, i:i + 1] xpx = _x.T.dot(_x) beta = squeeze(_x.T.dot(resid) / xpx) resid -= squeeze(beta * _x) x[:, i + 1:] -= _x.dot(_x.T.dot(x[:, i + 1:]) / xpx) sigma2[i + 1] = (resid ** 2).mean() tstat[i + 1] = beta / sqrt(sigma2[i + 1] / xpx) nobs = float(resid.shape[0]) llf = -nobs / 2.0 * (log(2 * pi) + log(sigma2) + 1) if method == 'aic': crit = -2 * llf + 2 * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 'bic': crit = -2 * llf + log(nobs) * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 't-stat': stop = 1.6448536269514722 large_tstat = abs(tstat) >= stop lag = int(squeeze(max(argwhere(large_tstat)))) icbest = float(tstat[lag]) else: raise ValueError('Unknown method') return icbest, lag
def _autolag_ols(endog, exog, startlag, maxlag, method, modargs=(), regresults=False): """ Returns the results for the lag length that maximizes the info criterion. Parameters ---------- endog : {ndarray, Series} nobs array containing endogenous variable exog : {ndarray, DataFrame} nobs by (startlag + maxlag) array containing lags and possibly other variables startlag : int The first zero-indexed column to hold a lag. See Notes. maxlag : int The highest lag order for lag length selection. method : {'aic', 'bic', 't-stat'} aic - Akaike Information Criterion bic - Bayes Information Criterion t-stat - Based on last lag regresults : bool, optional Flag indicating to return optional return results Returns ------- bestlag : int The lag length that maximizes the information criterion. Notes ----- Does estimation like mod(endog, exog[:,:i], *modargs).fit(*fitargs) where i goes from lagstart to lagstart+maxlag+1. Therefore, lags are assumed to be in contiguous columns from low to high lag length with the highest lag in the last column. 
""" method = method.lower() if regresults: return _autolag(OLS, endog, exog, startlag, maxlag, method, regresults=regresults) q, r = qr(exog) qpy = q.T.dot(endog) ypy = endog.T.dot(endog) xpx = exog.T.dot(exog) sigma2 = empty(maxlag + 1) tstat = empty(maxlag + 1) nobs = float(endog.shape[0]) tstat[0] = inf for i in range(startlag, startlag + maxlag + 1): b = solve(r[:i, :i], qpy[:i]) sigma2[i - startlag] = (ypy - b.T.dot(xpx[:i, :i]).dot(b)) / nobs if method == 't-stat' and i > startlag: xpxi = inv(xpx[:i, :i]) stderr = sqrt(sigma2[i - startlag] * xpxi[-1, -1]) tstat[i - startlag] = b[-1] / stderr llf = -nobs / 2.0 * (log(2 * pi) + log(sigma2) + 1) if method == 'aic': crit = -2 * llf + 2 * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 'bic': crit = -2 * llf + log(nobs) * arange(float(maxlag + 1)) icbest, lag = min(zip(crit, arange(maxlag + 1))) elif method == 't-stat': stop = 1.6448536269514722 large_tstat = abs(tstat) >= stop lag = int(squeeze(max(argwhere(large_tstat)))) icbest = float(tstat[lag]) else: raise ValueError('Unknown method') return icbest, lag