def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) cls.endog = data.endog cls.exog = data.exog cls.ols_model = OLS(data.endog, data.exog)
from __future__ import print_function
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

d2 = macrodata.load().data
g_gdp = 400 * np.diff(np.log(d2['realgdp']))
g_inv = 400 * np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)

res_olsg = OLS(g_inv, exogg).fit()
print(res_olsg.summary())

# heteroscedasticity-robust (HC1) covariance
res_hc1 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc1.summary())
print('\n\n')

# HAC (Newey-West) covariance with 4 lags and small-sample correction
res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4,
                                          use_correction=True)
print(res_hac4.summary())
print('\n\n')

tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
# Variance inflation factors from the diagonal of the inverse correlation
# matrix C (df, dfy, C, sets, and subsets are defined elsewhere in the script).
VIF = np.diag(C).round(2)
print('VIF:', VIF)  # 38.5 254.42 46.87 282.51

# standardize the regressors before comparing subsets
df_scaled = (df - df.mean()) / df.std()
A_scaled = np.array(df_scaled)
# print(A_scaled)  # ndarray, not DataFrame
x1x2 = A_scaled[:, [1, 2]]
x3x4 = A_scaled[:, [3, 4]]

# eigenvalue ratio of X'X as a multicollinearity diagnostic
A = np.array(df)
X = A[:, 1:]
# print(X)
B = np.dot(X.T, X)
ev, evct = np.linalg.eig(B)
kk = ev.max() / ev.min()  # ratio of largest to smallest eigenvalue
print('lambda1/lambda2:', kk)  # 423.7

# compare two candidate regressor pairs by AIC
lr1 = OLS(dfy, add_constant(x1x2)).fit()
lr2 = OLS(dfy, add_constant(x3x4)).fit()
print('AIC:', lr1.aic, lr2.aic)  # x1x2=62.31 x3x4=76.74 x2x4=97.51

# exhaustive search over regressor subsets for the minimum-AIC model
xmin = A_scaled[:, :]
nmin = sets
lrmin = OLS(dfy, add_constant(xmin)).fit()
for n in subsets(sets)[1:-1]:
    xx = A_scaled[:, n]
    lr = OLS(dfy, add_constant(xx)).fit()
    # print(lr.aic)
    if lr.aic < lrmin.aic:
        # keep the best results object itself, not just its AIC value
        lrmin = lr
        nmin = n
print('AICmin:', lrmin.aic, 'Combination:', nmin)  # x1,x2,x3,x4
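# The selection loop above relies on `sets` and `subsets`, which are defined
# elsewhere in the original script. A minimal sketch of what they plausibly
# look like (hypothetical helpers, not the author's code): `sets` holds the
# candidate column indices of A_scaled, and `subsets` enumerates all index
# combinations ordered by size, so that [1:-1] drops the empty combination
# and the full set.
from itertools import combinations

sets = [1, 2, 3, 4]

def subsets(indices):
    out = []
    for k in range(len(indices) + 1):
        out.extend(list(c) for c in combinations(indices, k))
    return out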
def plot_ccpr(results, exog_idx, ax=None): """Plot CCPR against one regressor. Generates a CCPR (component and component-plus-residual) plot. Parameters ---------- results : result instance A regression results instance. exog_idx : int or string Exogenous, explanatory variable. If string is given, it should be the variable name that you want to use, and you can use arbitrary translations as with a formula. ax : Matplotlib AxesSubplot instance, optional If given, it is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid. Notes ----- The CCPR plot provides a way to judge the effect of one regressor on the response variable by taking into account the effects of the other independent variables. The partial residuals plot is defined as Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus X_i to show where the fitted line would lie. Care should be taken if X_i is highly correlated with any of the other independent variables. If this is the case, the variance evident in the plot will be an underestimate of the true variance. References ---------- http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm """ fig, ax = utils.create_mpl_ax(ax) exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model) results = maybe_unwrap_results(results) x1 = results.model.exog[:, exog_idx] #namestr = ' for %s' % self.name if self.name else '' x1beta = x1 * results.params[exog_idx] ax.plot(x1, x1beta + results.resid, 'o') from statsmodels.tools.tools import add_constant mod = OLS(x1beta, add_constant(x1)).fit() params = mod.params fig = abline_plot(*params, **dict(ax=ax)) #ax.plot(x1, x1beta, '-') ax.set_title('Component and component plus residual plot') ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx)) ax.set_xlabel("%s" % exog_name) return fig
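# A minimal usage sketch for plot_ccpr via the statsmodels.api entry point;
# the simulated data and the choice exog_idx=1 are illustrative only.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(12345)
x = rng.normal(size=(200, 2))
y = 1.0 + 2.0 * x[:, 0] - 1.0 * x[:, 1] + rng.normal(size=200)

res = sm.OLS(y, sm.add_constant(x)).fit()
# component-plus-residual plot for the first non-constant regressor
# (column 0 is the constant because add_constant prepends by default)
fig = sm.graphics.plot_ccpr(res, exog_idx=1)
fig.savefig('ccpr_example.png')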
def grangercausalitytests_mod(x, maxlag, addconst=True, verbose=True):
    import numpy as np
    from scipy import stats
    from statsmodels.tsa.tsatools import lagmat2ds
    from statsmodels.tools.tools import add_constant
    from statsmodels.regression.linear_model import OLS
    from warnings import warn

    x = np.asarray(x)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        warn("Insufficient observations. Maximum allowable lag is {0}. "
             "The maximum lag will be set to "
             "this number".format(int((x.shape[0] - int(addconst)) / 3) - 1))
        maxlag = int((x.shape[0] - int(addconst)) / 3) - 1
        # print(x.shape[0])
        # print(int((x.shape[0] - int(addconst)) / 3) - 1)
        # print(maxlag)

    resli = {}

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mlg)
        mxlg = mlg

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both')
        dta = np.delete(dta, -1, axis=1)  # removal of the not lagged xs

        # add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')
            #dtaown = dta[:, 1:mxlg]
            #dtajoint = dta[:, 1:]

        # Run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see:
        #http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (fgc1,
                                  stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                                  res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1,
                               stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, '
                  'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg))
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        # likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, stats.chi2.sf(lr, mxlg), mxlg))
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)),
                                   np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (ftres.fvalue, ftres.pvalue,
                                  ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
def dispersion_poisson(results):
    """Score/LM type tests for Poisson variance assumptions

    Null Hypothesis is

    H0: var(y) = E(y) and assuming E(y) is correctly specified
    H1: var(y) ~= E(y)

    The tests are based on the constrained model, i.e. the Poisson model.
    The tests differ in their assumed alternatives, and in their maintained
    assumptions.

    Parameters
    ----------
    results : Poisson results instance
        This can be a results instance for either a discrete Poisson or a GLM
        with family Poisson.

    Returns
    -------
    res : ndarray, shape (7, 2)
        each row contains the test statistic and p-value for one of the 7
        tests computed here.
    description : 2-D list of strings
        Each test has two strings a descriptive name and a string for the
        alternative hypothesis.
    """
    if hasattr(results, '_results'):
        results = results._results

    endog = results.model.endog
    nobs = endog.shape[0]  #TODO: use attribute, may need to be added
    fitted = results.predict()
    #fitted = results.fittedvalues  # discrete has linear prediction
    #this assumes Poisson
    resid2 = results.resid_response**2
    var_resid_endog = (resid2 - endog)
    var_resid_fitted = (resid2 - fitted)
    std1 = np.sqrt(2 * (fitted**2).sum())

    var_resid_endog_sum = var_resid_endog.sum()
    dean_a = var_resid_fitted.sum() / std1
    dean_b = var_resid_endog_sum / std1
    dean_c = (var_resid_endog / fitted).sum() / np.sqrt(2 * nobs)

    pval_dean_a = stats.norm.sf(np.abs(dean_a))
    pval_dean_b = stats.norm.sf(np.abs(dean_b))
    pval_dean_c = stats.norm.sf(np.abs(dean_c))

    results_all = [[dean_a, pval_dean_a],
                   [dean_b, pval_dean_b],
                   [dean_c, pval_dean_c]]
    description = [['Dean A', 'mu (1 + a mu)'],
                   ['Dean B', 'mu (1 + a mu)'],
                   ['Dean C', 'mu (1 + a)']]

    # Cameron and Trivedi auxiliary regression, page 78, count book 1989
    endog_v = var_resid_endog / fitted
    res_ols_nb2 = OLS(endog_v, fitted).fit(use_t=False)
    stat_ols_nb2 = res_ols_nb2.tvalues[0]
    pval_ols_nb2 = res_ols_nb2.pvalues[0]
    results_all.append([stat_ols_nb2, pval_ols_nb2])
    description.append(['CT nb2', 'mu (1 + a mu)'])

    # the nb1 variant regresses on a constant, not on the fitted mean,
    # matching its HC1 counterpart below
    res_ols_nb1 = OLS(endog_v, np.ones(len(endog_v))).fit(use_t=False)
    stat_ols_nb1 = res_ols_nb1.tvalues[0]
    pval_ols_nb1 = res_ols_nb1.pvalues[0]
    results_all.append([stat_ols_nb1, pval_ols_nb1])
    description.append(['CT nb1', 'mu (1 + a)'])

    # same auxiliary regressions with heteroscedasticity-robust (HC1) cov
    res_ols_nb2 = OLS(endog_v, fitted).fit(cov_type='HC1', use_t=False)
    stat_ols_hc1_nb2 = res_ols_nb2.tvalues[0]
    pval_ols_hc1_nb2 = res_ols_nb2.pvalues[0]
    results_all.append([stat_ols_hc1_nb2, pval_ols_hc1_nb2])
    description.append(['CT nb2 HC1', 'mu (1 + a mu)'])

    res_ols_nb1 = OLS(endog_v, np.ones(len(endog_v))).fit(cov_type='HC1',
                                                          use_t=False)
    stat_ols_hc1_nb1 = res_ols_nb1.tvalues[0]
    pval_ols_hc1_nb1 = res_ols_nb1.pvalues[0]
    results_all.append([stat_ols_hc1_nb1, pval_ols_hc1_nb1])
    description.append(['CT nb1 HC1', 'mu (1 + a)'])

    return np.array(results_all), description
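# A minimal usage sketch for dispersion_poisson on simulated overdispersed
# count data; the GLM-Poisson fit and the gamma-mixing construction are
# illustrative only (the function assumes np, stats, and OLS are in scope
# as above).
import numpy as np
from scipy import stats
import statsmodels.api as sm

rng = np.random.RandomState(0)
x = sm.add_constant(rng.normal(size=(500, 2)))
mu = np.exp(x.dot([0.5, 0.2, -0.1]))
# gamma mixing with mean 1 preserves E(y) but makes var(y) > E(y)
y = rng.poisson(mu * rng.gamma(2.0, 0.5, size=500))

res_poisson = sm.GLM(y, x, family=sm.families.Poisson()).fit()
stats_all, labels = dispersion_poisson(res_poisson)
for (stat, pval), (name, alt) in zip(stats_all, labels):
    print('%-12s %-14s stat=%8.3f  p=%6.4f' % (name, alt, stat, pval))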
def _fit_arma_iter(outputs, inputs, p, q, r, l2_reg=0.0): """Iterative regression for estimating AR params in ARMAX(p, q, r) model. The iterative AR regression process provides consistent estimates for the AR parameters of an ARMAX(p, q, r) model after q iterative steps. It first fits an ARMAX(p, 0, r) model with least squares regression, then ARMAX(p, 1, r), and so on, ..., til ARMAX(p, q, r). At the i-th step, it fits an ARMAX(p, i, r) model, according to estimated error terms from the previous step. For description of the iterative regression method, see Section 2 of `Consistent Estimates of Autoregressive Parameters and Extended Sample Autocorrelation Function for Stationary and Nonstationary ARMA Models` at https://www.jstor.org/stable/2288340. The implementation here is a generalization of the method mentioned in the paper. We adapt the method for multidimensional outputs, exogenous inputs, nan handling, and also add regularization on the MA parameters. Args: outputs: Array with the output values from the LDS, nans allowed. inputs: Array with exogenous inputs values, nans allowed. Could be None. p: AR order, i.e. max lag of the autoregressive part. q: MA order, i.e. max lag of the error terms. r: Max lag of the exogenous inputs. l2_reg: L2 regularization coefficient, to be applied on MA coefficients. Returns: Fitted AR coefficients. """ if outputs.shape[1] > 1: # If there are multiple output dimensions, fit autoregressive params on # each dimension separately and average. params_list = [ _fit_arma_iter(outputs[:, j:j+1], inputs, p, q, r, l2_reg=l2_reg) \ for j in xrange(outputs.shape[1])] return np.mean(np.concatenate([a.reshape(1, -1) for a in params_list]), axis=0) # We include a constant term in regression. k_const = 1 # Input dim. If inputs is None, then in_dim = 0. in_dim = 0 if inputs is not None: in_dim = inputs.shape[1] # Lag the inputs to obtain [?, r], column j means series x_{t-j}. # Use trim to drop rows with unknown values both at beginning and end. lagged_in = np.concatenate([ lagmat(inputs[:, i], maxlag=r, trim='both') for i in xrange(in_dim) ], axis=1) # Since we trim in beginning, the offset is r. lagged_in_offset = r # Lag the series itself to p-th order. lagged_out = lagmat(outputs, maxlag=p, trim='both') lagged_out_offset = p y = outputs y_offset = 0 # Estimated residuals, initialized to 0. res = np.zeros_like(outputs) for i in xrange(q + 1): # Lag the residuals to i-th order in i-th iteration. lagged_res = lagmat(res, maxlag=i, trim='both') lagged_res_offset = y_offset + i # Compute offset in regression, since lagged_in, lagged_out, and lagged_res # have different offsets. Align them. if inputs is None: y_offset = max(lagged_out_offset, lagged_res_offset) else: y_offset = max(lagged_out_offset, lagged_res_offset, lagged_in_offset) y = outputs[y_offset:, :] # Concatenate all variables in regression. x = np.concatenate([ lagged_out[y_offset - lagged_out_offset:, :], lagged_res[y_offset - lagged_res_offset:, :] ], axis=1) if inputs is not None: x = np.concatenate([lagged_in[y_offset - lagged_in_offset:, :], x], axis=1) # Add constant term as the first variable. x = add_constant(x, prepend=True) if x.shape[1] < k_const + in_dim * r + p + i: raise ValueError('Insufficient sequence length for model fitting.') # Drop rows with nans. arr = np.concatenate([y, x], axis=1) arr = arr[~np.isnan(arr).any(axis=1)] y_dropped_na = arr[:, 0:1] x_dropped_na = arr[:, 1:] # Only regularize the MA part. 
alpha = np.concatenate( [np.zeros(k_const + in_dim * r + p), l2_reg * np.ones(i)], axis=0) # When L1_wt = 0, it's ridge regression. olsfit = OLS(y_dropped_na, x_dropped_na).fit_regularized(alpha=alpha, L1_wt=0.0) # Update estimated residuals. res = y - np.matmul(x, olsfit.params.reshape(-1, 1)) if len(olsfit.params) != k_const + in_dim * r + p + q: raise ValueError('Expected param len %d, got %d.' % (k_const + in_dim * r + p + q, len(olsfit.params))) if q == 0: return olsfit.params[-p:] return olsfit.params[-(p + q):-q]
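# A hypothetical usage sketch for _fit_arma_iter on simulated AR(2) data.
# It assumes numpy is imported as np and that lagmat, add_constant, OLS, and
# a py2-style xrange are available in the surrounding module, as the function
# itself expects.
import numpy as np

try:
    xrange
except NameError:  # Python 3: provide the alias the module assumes
    xrange = range

rng = np.random.RandomState(42)
n = 500
y = np.zeros(n)
for t in range(2, n):
    y[t] = 0.6 * y[t - 1] - 0.2 * y[t - 2] + rng.normal()

# outputs must be 2-D (one column per output dimension); inputs may be None
ar_params = _fit_arma_iter(y.reshape(-1, 1), inputs=None, p=2, q=1, r=0)
print('estimated AR params:', ar_params)  # should be near [0.6, -0.2]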
def setupClass(cls): super(TestNxNxOne, cls).setupClass() cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit()
def test_no_penalization(self): res_ols = OLS(self.res1.model.endog, self.res1.model.exog).fit() res_theil = self.res1.model.fit(pen_weight=0, cov_type='data-prior') assert_allclose(res_theil.params, res_ols.params, rtol=1e-10) assert_allclose(res_theil.bse, res_ols.bse, rtol=1e-10)
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) cls.res1 = OLS(data.endog, data.exog).fit() R = np.identity(7)[:-1, :] cls.Ftest = cls.res1.f_test(R)
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) cls.res1 = GLS(data.endog, data.exog).fit() cls.res2 = OLS(data.endog, data.exog).fit()
def _engine_factory(self, fy, X, check_integrity=True): if self.use_weighted_fit: return WLS(fy, X, weights=self._get_weights()) else: return OLS(fy, X)
def test_beta(self, b0_vals, param_nums, ftol=10**-5, maxiter=30,
              print_weights=1):
    """
    Returns the profile log likelihood for regression parameters
    'param_nums' at 'b0_vals.'

    Parameters
    ----------
    b0_vals : list
        The value of parameters to be tested
    param_nums : list
        Which parameters to be tested
    maxiter : int, optional
        How many iterations to use in the EM algorithm.  Default is 30
    ftol : float, optional
        The function tolerance for the EM optimization.
        Default is 1e-5
    print_weights : bool
        If true, returns the weights that maximize the profile
        log likelihood. Default is 1 (treated as True)

    Returns
    -------
    test_results : tuple
        The log-likelihood and p-value of the test.

    Notes
    -----
    The function will warn if the EM reaches the maxiter. However, when
    optimizing over nuisance parameters, it is possible to reach a
    maximum number of inner iterations for a specific value for the
    nuisance parameters while the results of the function are still valid.
    This usually occurs when the optimization over the nuisance parameters
    selects parameter values that yield a log-likelihood ratio close to
    infinity.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import numpy as np

    # Test parameter is .05 in one regressor no intercept model
    >>> data = sm.datasets.heart.load(as_pandas=False)
    >>> y = np.log10(data.endog)
    >>> x = data.exog
    >>> cens = data.censors
    >>> model = sm.emplike.emplikeAFT(y, x, cens)
    >>> res = model.test_beta([0], [0])
    >>> res
    (1.4657739632606308, 0.22601365256959183)

    # Test slope is 0 in model with intercept
    >>> data = sm.datasets.heart.load(as_pandas=False)
    >>> y = np.log10(data.endog)
    >>> x = data.exog
    >>> cens = data.censors
    >>> model = sm.emplike.emplikeAFT(y, sm.add_constant(x), cens)
    >>> res = model.test_beta([0], [1])
    >>> res
    (4.623487775078047, 0.031537049752572731)
    """
    censors = self.model.censors
    endog = self.model.endog
    exog = self.model.exog
    uncensored = (censors == 1).flatten()
    censored = (censors == 0).flatten()
    uncens_endog = endog[uncensored]
    uncens_exog = exog[uncensored, :]
    reg_model = OLS(uncens_endog, uncens_exog).fit()
    llr, pval, new_weights = reg_model.el_test(
        b0_vals, param_nums, return_weights=True)  # Needs to be changed
    km = self.model._make_km(endog, censors).flatten()  # when merged
    uncens_nobs = self.model.uncens_nobs
    F = np.asarray(new_weights).reshape(uncens_nobs)
    # Step 0 ^
    params = self.params()
    survidx = np.where(censors == 0)
    survidx = survidx[0] - np.arange(len(survidx[0]))
    numcensbelow = np.int_(np.cumsum(1 - censors))
    if len(param_nums) == len(params):
        llr = self._EM_test([], F=F, params=params, param_nums=param_nums,
                            b0_vals=b0_vals, survidx=survidx,
                            uncens_nobs=uncens_nobs,
                            numcensbelow=numcensbelow, km=km,
                            uncensored=uncensored, censored=censored,
                            ftol=ftol, maxiter=25)
        return llr, chi2.sf(llr, self.model.nvar)
    else:
        x0 = np.delete(params, param_nums)
        try:
            res = optimize.fmin(self._EM_test, x0,
                                (params, param_nums, b0_vals, F, survidx,
                                 uncens_nobs, numcensbelow, km, uncensored,
                                 censored, maxiter, ftol),
                                full_output=1, disp=0)
            llr = res[1]
            return llr, chi2.sf(llr, len(param_nums))
        except np.linalg.linalg.LinAlgError:
            return np.inf, 0
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) cls.res1 = OLS(data.endog, data.exog).fit() cls.res2 = WLS(data.endog, data.exog).fit()
def lm_test_glm(result, exog_extra, mean_deriv=None): '''score/lagrange multiplier test for GLM Wooldridge procedure for test of mean function in GLM Parameters ---------- results : GLMResults instance results instance with the constrained model exog_extra : ndarray or None additional exogenous variables for variable addition test This can be set to None if mean_deriv is provided. mean_deriv : None or ndarray Extra moment condition that correspond to the partial derivative of a mean function with respect to some parameters. Returns ------- test_results : Results instance The results instance has the following attributes which are score statistic and p-value for 3 versions of the score test. c1, pval1 : nonrobust score_test results c2, pval2 : score test results robust to over or under dispersion c3, pval3 : score test results fully robust to any heteroscedasticity The test results instance also has a simple summary method. Notes ----- TODO: add `df` to results and make df detection more robust This implements the auxiliary regression procedure of Wooldridge, implemented based on the presentation in chapter 8 in Handbook of Applied Econometrics 2. References ---------- Wooldridge, Jeffrey M. 1997. “Quasi-Likelihood Methods for Count Data.” Handbook of Applied Econometrics 2: 352–406. and other articles and text book by Wooldridge ''' if hasattr(result, '_result'): res = result._result else: res = result mod = result.model nobs = mod.endog.shape[0] #mean_func = mod.family.link.inverse dlinkinv = mod.family.link.inverse_deriv # derivative of mean function w.r.t. beta (linear params) dm = lambda x, linpred: dlinkinv(linpred)[:, None] * x var_func = mod.family.variance x = result.model.exog x2 = exog_extra # test omitted lin_pred = res.predict(linear=True) dm_incl = dm(x, lin_pred) if x2 is not None: dm_excl = dm(x2, lin_pred) if mean_deriv is not None: # allow both and stack dm_excl = np.column_stack((dm_excl, mean_deriv)) elif mean_deriv is not None: dm_excl = mean_deriv else: raise ValueError('either exog_extra or mean_deriv have to be provided') # TODO check for rank or redundant, note OLS calculates the rank k_constraint = dm_excl.shape[1] fittedvalues = res.predict() # discrete has linpred instead of mean v = var_func(fittedvalues) std = np.sqrt(v) res_ols1 = OLS(res.resid_response / std, np.column_stack((dm_incl, dm_excl)) / std[:, None]).fit() # case: nonrobust assumes variance implied by distribution is correct c1 = res_ols1.ess pval1 = stats.chi2.sf(c1, k_constraint) #print c1, stats.chi2.sf(c1, 2) # case: robust to dispersion c2 = nobs * res_ols1.rsquared pval2 = stats.chi2.sf(c2, k_constraint) #print c2, stats.chi2.sf(c2, 2) # case: robust to heteroscedasticity from statsmodels.stats.multivariate_tools import partial_project pp = partial_project(dm_excl / std[:, None], dm_incl / std[:, None]) resid_p = res.resid_response / std res_ols3 = OLS(np.ones(nobs), pp.resid * resid_p[:, None]).fit() #c3 = nobs * res_ols3.rsquared # this is Wooldridge c3b = res_ols3.ess # simpler if endog is ones pval3 = stats.chi2.sf(c3b, k_constraint) tres = TestResults(c1=c1, pval1=pval1, c2=c2, pval2=pval2, c3=c3b, pval3=pval3) return tres
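# A minimal usage sketch for lm_test_glm: fit the constrained GLM with one
# regressor omitted and score-test whether adding it back is warranted. The
# simulated data are illustrative, and the example assumes the statsmodels
# version this helper was written against (it calls res.predict(linear=True)
# internally).
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(123)
x_full = sm.add_constant(rng.normal(size=(300, 2)))
mu = np.exp(x_full.dot([0.5, 0.3, 0.4]))
y = rng.poisson(mu)

# constrained model: omit the second regressor
res_glm = sm.GLM(y, x_full[:, :2], family=sm.families.Poisson()).fit()
tres = lm_test_glm(res_glm, exog_extra=x_full[:, 2:])
print(tres.c1, tres.pval1)  # nonrobust score test; should reject here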
def setup_class(cls): y, x = cls.get_sample() mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.]) cls.res1 = mod1.fit(0) cls.res2 = OLS(y, x).fit()
def cm_test_robust(resid, resid_deriv, instruments, weights=1):
    '''score/lagrange multiplier test of Wooldridge

    generic version of Wooldridge procedure for test of conditional moments

    Limitation: This version allows only for one unconditional moment
    restriction, i.e. resid is scalar for each observation.
    Another limitation is that it assumes independent observations, no
    correlation in residuals and weights cannot be replaced by
    cross-observation whitening.

    Parameters
    ----------
    resid : ndarray, (nobs, )
        conditional moment restriction, E(r | x, params) = 0
    resid_deriv : ndarray, (nobs, k_params)
        derivative of conditional moment restriction with respect to
        parameters
    instruments : ndarray, (nobs, k_instruments)
        indicator variables of Wooldridge, multiplies the conditional
        moment restriction
    weights : ndarray
        This is a weights function as used in WLS. The moment
        restrictions are multiplied by weights. This corresponds to the
        inverse of the variance in a heteroskedastic model.

    Returns
    -------
    test_results : Results instance
        ???  TODO

    Notes
    -----
    This implements the auxiliary regression procedure of Wooldridge,
    implemented based on procedure 2.1 in Wooldridge 1990.

    Wooldridge allows for multivariate conditional moments (`resid`)
    TODO: check dimensions for multivariate case for extension

    References
    ----------
    Wooldridge, Jeffrey M. 1990. "A Unified Approach to Robust,
    Regression-Based Specification Tests." Econometric Theory 6 (1): 17-43.
    '''
    # notation: Wooldridge uses too many Greek letters
    # instruments is capital lambda
    # resid is small phi
    # resid_deriv is capital phi
    # weights is C
    nobs = resid.shape[0]

    from statsmodels.stats.multivariate_tools import partial_project

    w_sqrt = np.sqrt(weights)
    if np.size(weights) > 1:
        w_sqrt = w_sqrt[:, None]
    pp = partial_project(instruments * w_sqrt, resid_deriv * w_sqrt)
    mom_resid = pp.resid

    moms_test = mom_resid * resid[:, None] * w_sqrt

    # we get this here in case we extend resid to be more than 1-D
    k_constraint = moms_test.shape[1]

    # use OPG variance as in Wooldridge 1990. This might generalize
    cov = moms_test.T.dot(moms_test)
    diff = moms_test.sum(0)

    # see Wooldridge last page in appendix
    stat = diff.dot(np.linalg.solve(cov, diff))

    # for checking, this corresponds to nobs * rsquared of auxiliary
    # regression
    stat2 = OLS(np.ones(nobs), moms_test).fit().ess
    pval = stats.chi2.sf(stat, k_constraint)

    return stat, pval, stat2
def test_ols_noncentrality(self): k = self.k_groups res_ols = OLS(self.y, self.ex).fit() nobs_t = res_ols.model.nobs # constraint c_equal = -np.eye(k)[1:] c_equal[:, 0] = 1 v = np.zeros(c_equal.shape[0]) # noncentrality at estimated parameters wt = res_ols.wald_test(c_equal, scalar=True) df_num, df_denom = wt.df_num, wt.df_denom cov_p = res_ols.cov_params() nc_wt = wald_test_noncent_generic(res_ols.params, c_equal, v, cov_p, diff=None, joint=True) assert_allclose(nc_wt, wt.statistic * wt.df_num, rtol=1e-13) nc_wt2 = wald_test_noncent(res_ols.params, c_equal, v, res_ols, diff=None, joint=True) assert_allclose(nc_wt2, nc_wt, rtol=1e-13) es_ols = nc_wt / nobs_t es_oneway = smo.effectsize_oneway(res_ols.params, res_ols.scale, self.nobs, use_var="equal") assert_allclose(es_ols, es_oneway, rtol=1e-13) alpha = 0.05 pow_ols = smpwr.ftest_power(np.sqrt(es_ols), df_denom, df_num, alpha, ncc=1) pow_oneway = smpwr.ftest_anova_power(np.sqrt(es_oneway), nobs_t, alpha, k_groups=k, df=None) assert_allclose(pow_ols, pow_oneway, rtol=1e-13) # noncentrality at other params params_alt = res_ols.params * 0.75 # compute constraint value so we can get noncentrality from wald_test v_off = _offset_constraint(c_equal, res_ols.params, params_alt) wt_off = res_ols.wald_test((c_equal, v + v_off), scalar=True) nc_wt_off = wald_test_noncent_generic(params_alt, c_equal, v, cov_p, diff=None, joint=True) assert_allclose(nc_wt_off, wt_off.statistic * wt_off.df_num, rtol=1e-13) # check vectorized version, joint=False nc_wt_vec = wald_test_noncent_generic(params_alt, c_equal, v, cov_p, diff=None, joint=False) for i in range(c_equal.shape[0]): nc_wt_i = wald_test_noncent_generic( params_alt, c_equal[i:i + 1], # noqa v[i:i + 1], cov_p, diff=None, # noqa joint=False) assert_allclose(nc_wt_vec[i], nc_wt_i, rtol=1e-13)
def dispersion_poisson_generic(results, exog_new_test, exog_new_control=None,
                               include_score=False, use_endog=True,
                               cov_type='HC1', cov_kwds=None, use_t=False):
    """A variable addition test for the variance function

    This uses an artificial regression to calculate a variant of an LM or
    generalized score test for the specification of the variance assumption
    in a Poisson model. The performed test is a Wald test on the coefficients
    of the `exog_new_test`.

    Warning: insufficiently tested, especially for options
    """
    if hasattr(results, '_results'):
        results = results._results

    endog = results.model.endog
    nobs = endog.shape[0]  #TODO: use attribute, may need to be added
    # fitted = results.fittedvalues  # generic has linpred as fittedvalues
    fitted = results.predict()

    resid2 = results.resid_response**2
    #the following assumes Poisson
    if use_endog:
        var_resid = (resid2 - endog)
    else:
        var_resid = (resid2 - fitted)

    endog_v = var_resid / fitted

    k_constraints = exog_new_test.shape[1]
    ex_list = [exog_new_test]
    if include_score:
        score_obs = results.model.score_obs(results.params)
        ex_list.append(score_obs)

    if exog_new_control is not None:
        # control variables enter the artificial regression but are not part
        # of the tested constraints
        ex_list.append(exog_new_control)

    if len(ex_list) > 1:
        ex = np.column_stack(ex_list)
        use_wald = True
    else:
        ex = ex_list[0]  # no control variables in exog
        use_wald = False

    res_ols = OLS(endog_v, ex).fit(cov_type=cov_type, cov_kwds=cov_kwds,
                                   use_t=use_t)

    if use_wald:
        # we have controls and need to test coefficients
        k_vars = ex.shape[1]
        constraints = np.eye(k_constraints, k_vars)
        ht = res_ols.wald_test(constraints)
        stat_ols = ht.statistic
        pval_ols = ht.pvalue
    else:
        # we do not have controls and can use overall fit
        nobs = endog_v.shape[0]
        rsquared_noncentered = 1 - res_ols.ssr / res_ols.uncentered_tss
        stat_ols = nobs * rsquared_noncentered
        pval_ols = stats.chi2.sf(stat_ols, k_constraints)

    return stat_ols, pval_ols
def kpss(x, regression='c', lags=None, store=False):
    """
    Kwiatkowski-Phillips-Schmidt-Shin test for stationarity.

    Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
    hypothesis that x is level or trend stationary.

    Parameters
    ----------
    x : array_like, 1d
        Data series
    regression : str{'c', 'ct'}
        Indicates the null hypothesis for the KPSS test
        * 'c' : The data is stationary around a constant (default)
        * 'ct' : The data is stationary around a trend
    lags : int
        Indicates the number of lags to be used. If None (default),
        lags is set to int(12 * (n / 100)**(1 / 4)), as outlined in
        Schwert (1989).
    store : bool
        If True, then a result instance is returned additionally to
        the KPSS statistic (default is False).

    Returns
    -------
    kpss_stat : float
        The KPSS test statistic
    p_value : float
        The p-value of the test. The p-value is interpolated from
        Table 1 in Kwiatkowski et al. (1992), and a boundary point
        is returned if the test statistic is outside the table of
        critical values, that is, if the p-value is outside the
        interval (0.01, 0.1).
    lags : int
        The truncation lag parameter
    crit : dict
        The critical values at 10%, 5%, 2.5% and 1%. Based on
        Kwiatkowski et al. (1992).
    resstore : (optional) instance of ResultStore
        An instance of a dummy class with results attached as attributes

    Notes
    -----
    To estimate sigma^2 the Newey-West estimator is used. If lags is None,
    the truncation lag parameter is set to int(12 * (n / 100) ** (1 / 4)),
    as outlined in Schwert (1989). The p-values are interpolated from
    Table 1 of Kwiatkowski et al. (1992). If the computed statistic is
    outside the table of critical values, then a warning message is
    generated.

    Missing values are not handled.

    References
    ----------
    D. Kwiatkowski, P. C. B. Phillips, P. Schmidt, and Y. Shin (1992): Testing
    the Null Hypothesis of Stationarity against the Alternative of a Unit
    Root. `Journal of Econometrics` 54, 159-178.
    """
    from warnings import warn

    nobs = len(x)
    x = np.asarray(x)
    hypo = regression.lower()

    # if m is not one, n != m * n
    if nobs != x.size:
        raise ValueError("x of shape {0} not understood".format(x.shape))

    if hypo == 'ct':
        # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t,
        # where beta is the trend, r_t a random walk and e_t a stationary
        # error term.
        resids = OLS(x, add_constant(np.arange(1, nobs + 1))).fit().resid
        crit = [0.119, 0.146, 0.176, 0.216]
    elif hypo == 'c':
        # special case of the model above, where beta = 0 (so the null
        # hypothesis is that the data is stationary around r_0).
        resids = x - x.mean()
        crit = [0.347, 0.463, 0.574, 0.739]
    else:
        raise ValueError("hypothesis '{0}' not understood".format(hypo))

    if lags is None:
        # from Kwiatkowski et al. referencing Schwert (1989)
        lags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    pvals = [0.10, 0.05, 0.025, 0.01]

    eta = sum(resids.cumsum()**2) / (nobs**2)  # eq. 11, p. 165
    s_hat = _sigma_est_kpss(resids, nobs, lags)

    kpss_stat = eta / s_hat
    p_value = np.interp(kpss_stat, crit, pvals)

    if p_value == pvals[-1]:
        warn("p-value is smaller than the indicated p-value",
             InterpolationWarning)
    elif p_value == pvals[0]:
        warn("p-value is greater than the indicated p-value",
             InterpolationWarning)

    crit_dict = {'10%': crit[0], '5%': crit[1], '2.5%': crit[2],
                 '1%': crit[3]}

    if store:
        rstore = ResultsStore()
        rstore.lags = lags
        rstore.nobs = nobs

        stationary_type = "level" if hypo == 'c' else "trend"
        rstore.H0 = "The series is {0} stationary".format(stationary_type)
        rstore.HA = "The series is not {0} stationary".format(stationary_type)

        return kpss_stat, p_value, crit_dict, rstore
    else:
        return kpss_stat, p_value, lags, crit_dict
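# A minimal usage sketch for kpss on simulated data: a random walk should
# reject level stationarity, white noise should not.
import numpy as np

rng = np.random.RandomState(7)
noise = rng.normal(size=500)
walk = noise.cumsum()

stat, pval, nlags, crit = kpss(noise, regression='c')
print('white noise: stat=%.3f p=%.3f' % (stat, pval))  # large p-value
stat, pval, nlags, crit = kpss(walk, regression='c')
print('random walk: stat=%.3f p=%.3f' % (stat, pval))  # p at lower bound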
def plot_partregress(endog, exog_i, exog_others, data=None, title_kwargs={},
                     obs_labels=True, label_kwargs={}, ax=None,
                     ret_coords=False, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : ndarray or string
        endogenous or response variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_i : ndarray or string
        exogenous, explanatory variable. If string is given, you can use
        arbitrary translations as with a formula.
    exog_others : ndarray or list of strings
        other exogenous, explanatory variables. If a list of strings is
        given, each item is a term in formula. You can use arbitrary
        translations as with a formula. The effect of these variables will
        be removed by OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array-like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is the coefficient of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.
""" #NOTE: there is no interaction between possible missing data and #obs_labels yet, so this will need to be tweaked a bit for this case fig, ax = utils.create_mpl_ax(ax) # strings, use patsy to transform to data if isinstance(endog, string_types): endog = dmatrix(endog + "-1", data) if isinstance(exog_others, string_types): RHS = dmatrix(exog_others, data) elif isinstance(exog_others, list): RHS = "+".join(exog_others) RHS = dmatrix(RHS, data) else: RHS = exog_others RHS_isemtpy = False if isinstance(RHS, np.ndarray) and RHS.size == 0: RHS_isemtpy = True elif isinstance(RHS, pd.DataFrame) and RHS.empty: RHS_isemtpy = True if isinstance(exog_i, string_types): exog_i = dmatrix(exog_i + "-1", data) # all arrays or pandas-like if RHS_isemtpy: ax.plot(endog, exog_i, 'o', **kwargs) fitted_line = OLS(endog, exog_i).fit() x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name y_axis_endog_name = 'y' if isinstance( endog, np.ndarray) else endog.design_info.column_names[0] else: res_yaxis = OLS(endog, RHS).fit() res_xaxis = OLS(exog_i, RHS).fit() xaxis_resid = res_xaxis.resid yaxis_resid = res_yaxis.resid x_axis_endog_name = res_xaxis.model.endog_names y_axis_endog_name = res_yaxis.model.endog_names ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs) fitted_line = OLS(yaxis_resid, xaxis_resid).fit() fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax) if x_axis_endog_name == 'y': # for no names regression will just get a y x_axis_endog_name = 'x' # this is misleading, so use x ax.set_xlabel("e(%s | X)" % x_axis_endog_name) ax.set_ylabel("e(%s | X)" % y_axis_endog_name) ax.set_title('Partial Regression Plot', **title_kwargs) #NOTE: if we want to get super fancy, we could annotate if a point is #clicked using this widget #http://stackoverflow.com/questions/4652439/ #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/ #4674445#4674445 if obs_labels is True: if data is not None: obs_labels = data.index elif hasattr(exog_i, "index"): obs_labels = exog_i.index else: obs_labels = res_xaxis.model.data.row_labels #NOTE: row_labels can be None. #Maybe we should fix this to never be the case. if obs_labels is None: obs_labels = lrange(len(exog_i)) if obs_labels is not False: # could be array-like if len(obs_labels) != len(exog_i): raise ValueError("obs_labels does not match length of exog_i") label_kwargs.update(dict(ha="center", va="bottom")) ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels, lzip(res_xaxis.resid, res_yaxis.resid), [(0, 5)] * len(obs_labels), "x-large", ax=ax, **label_kwargs) if ret_coords: return fig, (res_xaxis.resid, res_yaxis.resid) else: return fig
def adfuller(x, maxlag=None, regression="c", autolag='AIC',
             store=False, regresults=False):
    """
    Augmented Dickey-Fuller unit root test

    The Augmented Dickey-Fuller test can be used to test for a unit root in a
    univariate process in the presence of serial correlation.

    Parameters
    ----------
    x : array_like, 1d
        data series
    maxlag : int
        Maximum lag which is included in test, default 12*(nobs/100)^{1/4}
    regression : {'c','ct','ctt','nc'}
        Constant and trend order to include in regression

        * 'c' : constant only (default)
        * 'ct' : constant and trend
        * 'ctt' : constant, and linear and quadratic trend
        * 'nc' : no constant, no trend
    autolag : {'AIC', 'BIC', 't-stat', None}
        * if None, then maxlag lags are used
        * if 'AIC' (default) or 'BIC', then the number of lags is chosen
          to minimize the corresponding information criterion
        * 't-stat' based choice of maxlag.  Starts with maxlag and drops a
          lag until the t-statistic on the last lag length is significant
          using a 5%-sized test
    store : bool
        If True, then a result instance is returned additionally to
        the adf statistic. Default is False
    regresults : bool, optional
        If True, the full regression results are returned. Default is False

    Returns
    -------
    adf : float
        Test statistic
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994, 2010)
    usedlag : int
        Number of lags used
    nobs : int
        Number of observations used for the ADF regression and calculation
        of the critical values
    critical values : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels. Based on MacKinnon (2010)
    icbest : float
        The maximized information criterion if autolag is not None.
    resstore : ResultStore, optional
        A dummy class with results attached as attributes

    Notes
    -----
    The null hypothesis of the Augmented Dickey-Fuller is that there is a
    unit root, with the alternative that there is no unit root. If the
    pvalue is above a critical size, then we cannot reject that there
    is a unit root.

    The p-values are obtained through regression surface approximation from
    MacKinnon 1994, but using the updated 2010 tables. If the p-value is
    close to significant, then the critical values should be used to judge
    whether to reject the null.

    The autolag option and maxlag for it are described in Greene.

    Examples
    --------
    See example notebook

    References
    ----------
    .. [*] W. Green.  "Econometric Analysis," 5th ed., Pearson, 2003.

    .. [*] Hamilton, J.D.  "Time Series Analysis".  Princeton, 1994.

    .. [*] MacKinnon, J.G. 1994.  "Approximate asymptotic distribution
        functions for unit-root and cointegration tests.  `Journal of
        Business and Economic Statistics` 12, 167-76.

    .. [*] MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests."
        Queen's University, Dept of Economics, Working Papers.  Available at
        http://ideas.repec.org/p/qed/wpaper/1227.html
    """
    if regresults:
        store = True

    trenddict = {None: 'nc', 0: 'c', 1: 'ct', 2: 'ctt'}
    if regression is None or isinstance(regression, (int, long)):
        regression = trenddict[regression]
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        raise ValueError("regression option %s not understood" % regression)
    x = np.asarray(x)
    nobs = x.shape[0]

    if maxlag is None:
        # from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    xdiff = np.diff(x)
    xdall = lagmat(xdiff[:, None], maxlag, trim='both', original='in')
    nobs = xdall.shape[0]  # pylint: disable=E1103

    xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
    xdshort = xdiff[-nobs:]

    if store:
        resstore = ResultsStore()
    if autolag:
        if regression != 'nc':
            fullRHS = add_trend(xdall, regression, prepend=True)
        else:
            fullRHS = xdall
        startlag = fullRHS.shape[1] - xdall.shape[1] + 1  # 1 for level
        # pylint: disable=E1103
        # search for lag length with smallest information criteria
        # Note: use the same number of observations to have comparable IC
        # aic and bic: smaller is better
        if not regresults:
            icbest, bestlag = _autolag(OLS, xdshort, fullRHS, startlag,
                                       maxlag, autolag)
        else:
            icbest, bestlag, alres = _autolag(OLS, xdshort, fullRHS, startlag,
                                              maxlag, autolag,
                                              regresults=regresults)
            resstore.autolag_results = alres

        bestlag -= startlag  # convert to lag not column index

        # rerun ols with best autolag
        xdall = lagmat(xdiff[:, None], bestlag, trim='both', original='in')
        nobs = xdall.shape[0]  # pylint: disable=E1103
        xdall[:, 0] = x[-nobs - 1:-1]  # replace 0 xdiff with level of x
        xdshort = xdiff[-nobs:]
        usedlag = bestlag
    else:
        usedlag = maxlag
        icbest = None
    if regression != 'nc':
        resols = OLS(xdshort, add_trend(xdall[:, :usedlag + 1],
                                        regression)).fit()
    else:
        resols = OLS(xdshort, xdall[:, :usedlag + 1]).fit()

    adfstat = resols.tvalues[0]
    # adfstat = (resols.params[0]-1.0)/resols.bse[0]
    # the "asymptotically correct" z statistic is obtained as
    # nobs/(1-np.sum(resols.params[1:-(trendorder+1)])) (resols.params[0] - 1)
    # I think this is the statistic that is used for series that are
    # integrated for orders higher than I(1), ie., not ADF but cointegration
    # tests.

    # Get approx p-value and critical values
    pvalue = mackinnonp(adfstat, regression=regression, N=1)
    critvalues = mackinnoncrit(N=1, regression=regression, nobs=nobs)
    critvalues = {"1%": critvalues[0], "5%": critvalues[1],
                  "10%": critvalues[2]}
    if store:
        resstore.resols = resols
        resstore.maxlag = maxlag
        resstore.usedlag = usedlag
        resstore.adfstat = adfstat
        resstore.critvalues = critvalues
        resstore.nobs = nobs
        resstore.H0 = ("The coefficient on the lagged level equals 1 - "
                       "unit root")
        resstore.HA = "The coefficient on the lagged level < 1 - stationary"
        resstore.icbest = icbest
        resstore._str = 'Augmented Dickey-Fuller Test Results'
        return adfstat, pvalue, critvalues, resstore
    else:
        if not autolag:
            return adfstat, pvalue, usedlag, nobs, critvalues
        else:
            return adfstat, pvalue, usedlag, nobs, critvalues, icbest
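# A minimal usage sketch for adfuller on simulated data: an AR(1) series
# with coefficient well below one should reject the unit-root null.
import numpy as np

rng = np.random.RandomState(0)
e = rng.normal(size=500)
y = np.zeros(500)
for t in range(1, 500):
    y[t] = 0.5 * y[t - 1] + e[t]

adf_stat, pvalue, usedlag, nobs, crit, icbest = adfuller(y, regression='c')
print('ADF stat=%.3f p=%.4f (small p rejects unit root)' %
      (adf_stat, pvalue))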
def setup_class(cls): data = stackloss.load() data.exog = add_constant(data.exog) cls.res1 = OLS(data.endog, data.exog).fit() cls.res2 = RegressionResults()
def grangercausalitytests(x, maxlag, addconst=True, verbose=True,
                          saveto='../Results/grangerResults'):
    """four tests for granger non causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on F test which is
    identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    verbose : bool
        print results if true

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        teststatistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted
        model and the restriction (contrast) matrix for the parameter f_test.

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the
    first column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    The null hypothesis for all four test is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    http://en.wikipedia.org/wiki/Granger_causality
    Greene: Econometric Analysis

    """
    from scipy import stats

    x = np.asarray(x)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        raise ValueError("Insufficient observations. Maximum allowable "
                         "lag is {0}".format(
                             int((x.shape[0] - int(addconst)) / 3) - 1))

    resli = {}
    savetoFile = open(saveto, 'w')

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print('\nGranger Causality', file=savetoFile)
            print('number of lags (no zero)', mlg, file=savetoFile)
        mxlg = mlg

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        # add constant
        if addconst:
            dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')
            #dtaown = dta[:, 1:mxlg]
            #dtajoint = dta[:, 1:]

        # Run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see:
        #http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (fgc1,
                                  stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                                  res2djoint.df_resid, mxlg),
                  file=savetoFile)
        result['ssr_ftest'] = (fgc1,
                               stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (chi2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, '
                  'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg),
                  file=savetoFile)
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        # likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, stats.chi2.sf(lr, mxlg), mxlg), file=savetoFile)
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)

        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)),
                                   np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' % (ftres.fvalue, ftres.pvalue,
                                  ftres.df_denom, ftres.df_num),
                  file=savetoFile)
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    savetoFile.close()  # flush the written results before reading them back
    if verbose:
        with open(saveto, 'r') as savetoFile:
            print(savetoFile.read())

    return resli
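# A minimal usage sketch for this file-writing grangercausalitytests variant;
# the output path is illustrative and must be writable.
import numpy as np

rng = np.random.RandomState(5)
n = 300
x2 = rng.normal(size=n)
x1 = np.zeros(n)
for t in range(1, n):
    # x1 depends on lagged x2, so x2 should Granger-cause x1
    x1[t] = 0.4 * x1[t - 1] + 0.8 * x2[t - 1] + rng.normal()

res = grangercausalitytests(np.column_stack([x1, x2]), maxlag=2,
                            saveto='grangerResults.txt')
print(res[1][0]['ssr_ftest'])  # (F, p, df_denom, df_num) at lag 1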
# assumes: import numpy as np; import pandas as pd;
# from numpy.linalg import inv; and boston = sklearn.datasets.load_boston()

# obtain the feature matrix as a numpy array
X = boston.data
# obtain the target variable as a numpy array
y = boston.target

# create vector of ones (named to avoid shadowing the builtin `int`)...
ones = np.ones(shape=y.shape)[..., None]
#...and add to feature matrix
X = np.concatenate((ones, X), 1)

# calculate coefficients using the closed-form normal-equations solution
coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y)

# extract the feature names of the boston data set and prepend the intercept
feature_names = np.insert(boston.feature_names, 0, 'INT')
# collect results into a DataFrame for pretty printing
results = pd.DataFrame({'coeffs': coeffs}, index=feature_names)

# create a linear model and extract the parameters
coeffs_lm = OLS(y, X).fit().params
# add the coefficients to the results DataFrame
results['coeffs_lm'] = coeffs_lm

print(results.round(2))
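# Explicitly inverting X'X is numerically fragile when regressors are nearly
# collinear. A sketch of the more stable route via a least-squares solver,
# which yields the same coefficients without forming an explicit inverse:
coeffs_lstsq, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
results['coeffs_lstsq'] = coeffs_lstsq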
def coint(y0, y1, trend='c', method='aeg', maxlag=None, autolag='aic', return_results=None): """Test for no-cointegration of a univariate equation The null hypothesis is no cointegration. Variables in y0 and y1 are assumed to be integrated of order 1, I(1). This uses the augmented Engle-Granger two-step cointegration test. Constant or trend is included in 1st stage regression, i.e. in cointegrating equation. **Warning:** The autolag default has changed compared to statsmodels 0.8. In 0.8 autolag was always None, no the keyword is used and defaults to 'aic'. Use `autolag=None` to avoid the lag search. Parameters ---------- y1 : array_like, 1d first element in cointegrating vector y2 : array_like remaining elements in cointegrating vector trend : str {'c', 'ct'} trend term included in regression for cointegrating equation * 'c' : constant * 'ct' : constant and linear trend * also available quadratic trend 'ctt', and no constant 'nc' method : string currently only 'aeg' for augmented Engle-Granger test is available. default might change. maxlag : None or int keyword for `adfuller`, largest or given number of lags autolag : string keyword for `adfuller`, lag selection criterion. * if None, then maxlag lags are used without lag search * if 'AIC' (default) or 'BIC', then the number of lags is chosen to minimize the corresponding information criterion * 't-stat' based choice of maxlag. Starts with maxlag and drops a lag until the t-statistic on the last lag length is significant using a 5%-sized test return_results : bool for future compatibility, currently only tuple available. If True, then a results instance is returned. Otherwise, a tuple with the test outcome is returned. Set `return_results=False` to avoid future changes in return. Returns ------- coint_t : float t-statistic of unit-root test on residuals pvalue : float MacKinnon's approximate, asymptotic p-value based on MacKinnon (1994) crit_value : dict Critical values for the test statistic at the 1 %, 5 %, and 10 % levels based on regression curve. This depends on the number of observations. Notes ----- The Null hypothesis is that there is no cointegration, the alternative hypothesis is that there is cointegrating relationship. If the pvalue is small, below a critical size, then we can reject the hypothesis that there is no cointegrating relationship. P-values and critical values are obtained through regression surface approximation from MacKinnon 1994 and 2010. If the two series are almost perfectly collinear, then computing the test is numerically unstable. However, the two series will be cointegrated under the maintained assumption that they are integrated. In this case the t-statistic will be set to -inf and the pvalue to zero. TODO: We could handle gaps in data by dropping rows with nans in the auxiliary regressions. Not implemented yet, currently assumes no nans and no gaps in time series. References ---------- MacKinnon, J.G. 1994 "Approximate Asymptotic Distribution Functions for Unit-Root and Cointegration Tests." Journal of Business & Economics Statistics, 12.2, 167-76. MacKinnon, J.G. 2010. "Critical Values for Cointegration Tests." Queen's University, Dept of Economics Working Papers 1227. 
http://ideas.repec.org/p/qed/wpaper/1227.html """ trend = trend.lower() if trend not in ['c', 'nc', 'ct', 'ctt']: raise ValueError("trend option %s not understood" % trend) y0 = np.asarray(y0) y1 = np.asarray(y1) if y1.ndim < 2: y1 = y1[:, None] nobs, k_vars = y1.shape k_vars += 1 # add 1 for y0 if trend == 'nc': xx = y1 else: xx = add_trend(y1, trend=trend, prepend=False) res_co = OLS(y0, xx).fit() if res_co.rsquared < 1 - 100 * SQRTEPS: res_adf = adfuller(res_co.resid, maxlag=maxlag, autolag=autolag, regression='nc') else: import warnings warnings.warn("y0 and y1 are (almost) perfectly colinear." "Cointegration test is not reliable in this case.") # Edge case where series are too similar res_adf = (-np.inf, ) # no constant or trend, see egranger in Stata and MacKinnon if trend == 'nc': crit = [np.nan] * 3 # 2010 critical values not available else: crit = mackinnoncrit(N=k_vars, regression=trend, nobs=nobs - 1) # nobs - 1, the -1 is to match egranger in Stata, I don't know why. # TODO: check nobs or df = nobs - k pval_asy = mackinnonp(res_adf[0], regression=trend, N=k_vars) return res_adf[0], pval_asy, crit
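# A minimal usage sketch for coint: two series sharing a common stochastic
# trend should reject the no-cointegration null. Simulated data only.
import numpy as np

rng = np.random.RandomState(1)
trend = rng.normal(size=400).cumsum()
y0 = trend + rng.normal(size=400)
y1 = 0.5 * trend + rng.normal(size=400)

coint_t, pvalue, crit = coint(y0, y1, trend='c')
print('t=%.3f p=%.4f (small p suggests cointegration)' % (coint_t, pvalue))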
def _computeLinearModel(inputSample, outputSample, detection, noiseThres, saturationThres, boxCox, censored): """ Run filerCensoredData and build the linear regression model. It is defined as a simple function because it is also needed in a loop for the bootstrap based POD. """ #################### Filter censored data ############################## if censored: # Filter censored data defects, defectsNoise, defectsSat, signals = \ DataHandling.filterCensoredData(inputSample, outputSample, noiseThres, saturationThres) else: defects, signals = inputSample, outputSample defectsSize = defects.getSize() ###################### Box Cox transformation ########################## # Compute Box Cox if enabled if boxCox: # optimization required, get optimal lambda without graph lambdaBoxCox, graphBoxCox = computeBoxCox(defects, signals) # Transformation of data boxCoxTransform = ot.BoxCoxTransform([lambdaBoxCox]) signals = boxCoxTransform(signals) if censored: if noiseThres is not None: noiseThres = boxCoxTransform([noiseThres])[0] if saturationThres is not None: saturationThres = boxCoxTransform([saturationThres])[0] detectionBoxCox = boxCoxTransform([detection])[0] else: detectionBoxCox = detection lambdaBoxCox = None graphBoxCox = None ######################### Linear Regression model ###################### # Linear regression with statsmodels module # Create the X matrix : [1, inputSample] X = ot.NumericalSample(defectsSize, [1, 0]) X[:, 1] = defects algoLinear = OLS(np.array(signals), np.array(X)).fit() intercept = algoLinear.params[0] slope = algoLinear.params[1] # get standard error estimates (residuals standard deviation) stderr = np.sqrt(algoLinear.scale) # get residuals from algoLinear residuals = ot.NumericalSample(np.vstack(algoLinear.resid)) if censored: # define initial starting point for MLE optimization initialStartMLE = [intercept, slope, stderr] # MLE optimization res = computeLinearParametersCensored(initialStartMLE, defects, defectsNoise, defectsSat, signals, noiseThres, saturationThres) intercept = res[0] slope = res[1] stderr = res[2] residuals = signals - (intercept + slope * defects) return {'defects':defects, 'signals':signals, 'intercept':intercept, 'slope':slope, 'stderr':stderr, 'residuals':residuals, 'detection':detectionBoxCox, 'lambdaBoxCox':lambdaBoxCox, 'graphBoxCox':graphBoxCox}
def gls(endog, exog=None, order=(0, 0, 0), seasonal_order=(0, 0, 0, 0), include_constant=None, n_iter=None, max_iter=50, tolerance=1e-8, arma_estimator='innovations_mle', arma_estimator_kwargs=None): """ Estimate ARMAX parameters by GLS. Parameters ---------- endog : array_like Input time series array. exog : array_like, optional Array of exogenous regressors. If not included, then `include_constant` must be True, and then `exog` will only include the constant column. order : tuple, optional The (p,d,q) order of the ARIMA model. Default is (0, 0, 0). seasonal_order : tuple, optional The (P,D,Q,s) order of the seasonal ARIMA model. Default is (0, 0, 0, 0). include_constant : bool, optional Whether to add a constant term in `exog` if it's not already there. The estimate of the constant will then appear as one of the `exog` parameters. If `exog` is None, then the constant will represent the mean of the process. Default is True if the specified model does not include integration and False otherwise. n_iter : int, optional Optionally iterate feasible GSL a specific number of times. Default is to iterate to convergence. If set, this argument overrides the `max_iter` and `tolerance` arguments. max_iter : int, optional Maximum number of feasible GLS iterations. Default is 50. If `n_iter` is set, it overrides this argument. tolerance : float, optional Tolerance for determining convergence of feasible GSL iterations. If `iter` is set, this argument has no effect. Default is 1e-8. arma_estimator : str, optional The estimator used for estimating the ARMA model. This option should not generally be used, unless the default method is failing or is otherwise unsuitable. Not all values will be valid, depending on the specified model orders (`order` and `seasonal_order`). Possible values are: * 'innovations_mle' - can be used with any specification * 'statespace' - can be used with any specification * 'hannan_rissanen' - can be used with any ARMA non-seasonal model * 'yule_walker' - only non-seasonal consecutive autoregressive (AR) models * 'burg' - only non-seasonal, consecutive autoregressive (AR) models * 'innovations' - only non-seasonal, consecutive moving average (MA) models. The default is 'innovations_mle'. arma_estimator_kwargs : dict, optional Arguments to pass to the ARMA estimator. Returns ------- parameters : SARIMAXParams object Contains the parameter estimates from the final iteration. other_results : Bunch Includes eight components: `spec`, `params`, `converged`, `differences`, `iterations`, `arma_estimator`, 'arma_estimator_kwargs', and `arma_results`. Notes ----- The primary reference is [1]_, section 6.6. In particular, the implementation follows the iterative procedure described in section 6.6.2. Construction of the transformed variables used to compute the GLS estimator described in section 6.6.1 is done via an application of the innovations algorithm (rather than explicit construction of the transformation matrix). Note that if the specified model includes integration, both the `endog` and `exog` series will be differenced prior to estimation and a warning will be issued to alert the user. References ---------- .. [1] Brockwell, Peter J., and Richard A. Davis. 2016. Introduction to Time Series and Forecasting. Springer. 
""" # Handle n_iter if n_iter is not None: max_iter = n_iter tolerance = np.inf # Default for include_constant is True if there is no integration and # False otherwise integrated = order[1] > 0 or seasonal_order[1] > 0 if include_constant is None: include_constant = not integrated elif include_constant and integrated: raise ValueError('Cannot include a constant in an integrated model.') # Handle including the constant (need to do it now so that the constant # parameter can be included in the specification as part of `exog`.) if include_constant: exog = np.ones_like(endog) if exog is None else add_constant(exog) # Create the SARIMAX specification spec = SARIMAXSpecification(endog, exog=exog, order=order, seasonal_order=seasonal_order) endog = spec.endog exog = spec.exog # Handle integration if spec.is_integrated: # TODO: this is the approach suggested by BD (see Remark 1 in # section 6.6.2 and Example 6.6.3), but maybe there are some cases # where we don't want to force this behavior on the user? warnings.warn('Provided `endog` and `exog` series have been' ' differenced to eliminate integration prior to GLS' ' parameter estimation.') endog = diff(endog, k_diff=spec.diff, k_seasonal_diff=spec.seasonal_diff, seasonal_periods=spec.seasonal_periods) exog = diff(exog, k_diff=spec.diff, k_seasonal_diff=spec.seasonal_diff, seasonal_periods=spec.seasonal_periods) augmented = np.c_[endog, exog] # Validate arma_estimator spec.validate_estimator(arma_estimator) if arma_estimator_kwargs is None: arma_estimator_kwargs = {} # Step 1: OLS mod_ols = OLS(endog, exog) res_ols = mod_ols.fit() exog_params = res_ols.params resid = res_ols.resid # 0th iteration parameters p = SARIMAXParams(spec=spec) p.exog_params = exog_params if spec.max_ar_order > 0: p.ar_params = np.zeros(spec.k_ar_params) if spec.max_seasonal_ar_order > 0: p.seasonal_ar_params = np.zeros(spec.k_seasonal_ar_params) if spec.max_ma_order > 0: p.ma_params = np.zeros(spec.k_ma_params) if spec.max_seasonal_ma_order > 0: p.seasonal_ma_params = np.zeros(spec.k_seasonal_ma_params) p.sigma2 = res_ols.scale ar_params = p.ar_params seasonal_ar_params = p.seasonal_ar_params ma_params = p.ma_params seasonal_ma_params = p.seasonal_ma_params sigma2 = p.sigma2 # Step 2 - 4: iterate feasible GLS to convergence arma_results = [None] differences = [None] parameters = [p] converged = False if n_iter is None else None i = 0 for i in range(1, max_iter + 1): prev = exog_params # Step 2: ARMA # TODO: allow estimator-specific kwargs? 
if arma_estimator == 'yule_walker': p_arma, res_arma = yule_walker(resid, ar_order=spec.ar_order, demean=False, **arma_estimator_kwargs) elif arma_estimator == 'burg': p_arma, res_arma = burg(resid, ar_order=spec.ar_order, demean=False, **arma_estimator_kwargs) elif arma_estimator == 'innovations': out, res_arma = innovations(resid, ma_order=spec.ma_order, demean=False, **arma_estimator_kwargs) p_arma = out[-1] elif arma_estimator == 'hannan_rissanen': p_arma, res_arma = hannan_rissanen(resid, ar_order=spec.ar_order, ma_order=spec.ma_order, demean=False, **arma_estimator_kwargs) else: # For later iterations, use a "warm start" for parameter estimates # (speeds up estimation and convergence) start_params = (None if i == 1 else np.r_[ar_params, ma_params, seasonal_ar_params, seasonal_ma_params, sigma2]) # Note: in each case, we do not pass in the order of integration # since we have already differenced the series tmp_order = (spec.order[0], 0, spec.order[2]) tmp_seasonal_order = (spec.seasonal_order[0], 0, spec.seasonal_order[2], spec.seasonal_order[3]) if arma_estimator == 'innovations_mle': p_arma, res_arma = innovations_mle( resid, order=tmp_order, seasonal_order=tmp_seasonal_order, demean=False, start_params=start_params, **arma_estimator_kwargs) else: p_arma, res_arma = statespace( resid, order=tmp_order, seasonal_order=tmp_seasonal_order, include_constant=False, start_params=start_params, **arma_estimator_kwargs) ar_params = p_arma.ar_params seasonal_ar_params = p_arma.seasonal_ar_params ma_params = p_arma.ma_params seasonal_ma_params = p_arma.seasonal_ma_params sigma2 = p_arma.sigma2 arma_results.append(res_arma) # Step 3: GLS # Compute transformed variables that satisfy OLS assumptions # Note: In section 6.1.1 of Brockwell and Davis (2016), these # transformations are developed as computed by left multiplcation # by a matrix T. However, explicitly constructing T and then # performing the left-multiplications does not scale well when nobs is # large. Instead, we can retrieve the transformed variables as the # residuals of the innovations algorithm (the `normalize=True` # argument applies a Prais-Winsten-type normalization to the first few # observations to ensure homoskedasticity). Brockwell and Davis # mention that they also take this approach in practice. # GH-6540: AR must be stationary if not p_arma.is_stationary: raise ValueError( "Roots of the autoregressive parameters indicate that data is" "non-stationary. GLS cannot be used with non-stationary " "parameters. 
You should consider differencing the model data" "or applying a nonlinear transformation (e.g., natural log).") tmp, _ = arma_innovations.arma_innovations(augmented, ar_params=ar_params, ma_params=ma_params, normalize=True) u = tmp[:, 0] x = tmp[:, 1:] # OLS on transformed variables mod_gls = OLS(u, x) res_gls = mod_gls.fit() exog_params = res_gls.params resid = endog - np.dot(exog, exog_params) # Construct the parameter vector for the iteration p = SARIMAXParams(spec=spec) p.exog_params = exog_params if spec.max_ar_order > 0: p.ar_params = ar_params if spec.max_seasonal_ar_order > 0: p.seasonal_ar_params = seasonal_ar_params if spec.max_ma_order > 0: p.ma_params = ma_params if spec.max_seasonal_ma_order > 0: p.seasonal_ma_params = seasonal_ma_params p.sigma2 = sigma2 parameters.append(p) # Check for convergence difference = np.abs(exog_params - prev) differences.append(difference) if n_iter is None and np.all(difference < tolerance): converged = True break else: if n_iter is None: warnings.warn('Feasible GLS failed to converge in %d iterations.' ' Consider increasing the maximum number of' ' iterations using the `max_iter` argument or' ' reducing the required tolerance using the' ' `tolerance` argument.' % max_iter) # Construct final results p = parameters[-1] other_results = Bunch({ 'spec': spec, 'params': parameters, 'converged': converged, 'differences': differences, 'iterations': i, 'arma_estimator': arma_estimator, 'arma_estimator_kwargs': arma_estimator_kwargs, 'arma_results': arma_results, }) return p, other_results
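# A minimal usage sketch for gls on a regression with ARMA(1, 1)-style
# errors; the data are simulated and illustrative only. With no integration,
# include_constant defaults to True, so exog_params holds the constant and
# the slope.
import numpy as np

rng = np.random.RandomState(9)
n = 300
exog = rng.normal(size=(n, 1))
err = np.zeros(n)
e = rng.normal(size=n)
for t in range(1, n):
    err[t] = 0.5 * err[t - 1] + e[t] + 0.3 * e[t - 1]
endog = 1.0 + 2.0 * exog[:, 0] + err

p, other = gls(endog, exog=exog, order=(1, 0, 1))
print(p.exog_params, p.ar_params, p.ma_params)
print('converged:', other.converged, 'iterations:', other.iterations)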
def test_bool_regressor(reset_randomstate): exog = np.random.randint(0, 2, size=(100, 2)).astype(bool) endog = np.random.standard_normal(100) bool_res = OLS(endog, exog).fit() res = OLS(endog, exog.astype(np.double)).fit() assert_allclose(bool_res.params, res.params)
def test_norm_resid_zero_variance(self): with warnings.catch_warnings(record=True): y = self.res1.model.endog res = OLS(y, y).fit() assert_allclose(res.scale, 0, atol=1e-20) assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)