def _partial_regression(endog, exog_i, exog_others): """Partial regression. regress endog on exog_i conditional on exog_others uses OLS Parameters ---------- endog : array_like exog : array_like exog_others : array_like Returns ------- res1c : OLS results instance (res1a, res1b) : tuple of OLS results instances results from regression of endog on exog_others and of exog_i on exog_others """ #FIXME: This function doesn't appear to be used. res1a = OLS(endog, exog_others).fit() res1b = OLS(exog_i, exog_others).fit() res1c = OLS(res1a.resid, res1b.resid).fit() return res1c, (res1a, res1b)
def reset_ramsey(res, degree=5): '''Ramsey's RESET specification test for linear models This is a general specification test, for additional non-linear effects in a model. Notes ----- The test fits an auxiliary OLS regression where the design matrix, exog, is augmented by powers 2 to degree of the fitted values. Then it performs an F-test whether these additional terms are significant. If the p-value of the f-test is below a threshold, e.g. 0.1, then this indicates that there might be additional non-linear effects in the model and that the linear model is mis-specified. References ---------- http://en.wikipedia.org/wiki/Ramsey_RESET_test ''' order = degree + 1 k_vars = res.model.exog.shape[1] #vander without constant and x: y_fitted_vander = np.vander(res.fittedvalues, order)[:, :-2] #drop constant exog = np.column_stack((res.model.exog, y_fitted_vander)) res_aux = OLS(res.model.endog, exog).fit() #r_matrix = np.eye(degree, exog.shape[1], k_vars) r_matrix = np.eye(degree - 1, exog.shape[1], k_vars) #df1 = degree - 1 #df2 = exog.shape[0] - degree - res.df_model (without constant) return res_aux.f_test(r_matrix) #, r_matrix, res_aux
def reset_ramsey(res, degree=5): '''Ramsey's RESET specification test for linear models This is a general specification test, for additional non-linear effects in a model. Notes ----- The test fits an auxiliary OLS regression where the design matrix, exog, is augmented by powers 2 to degree of the fitted values. Then it performs an F-test whether these additional terms are significant. If the p-value of the f-test is below a threshold, e.g. 0.1, then this indicates that there might be additional non-linear effects in the model and that the linear model is mis-specified. References ---------- http://en.wikipedia.org/wiki/Ramsey_RESET_test ''' order = degree + 1 k_vars = res.model.exog.shape[1] #vander without constant and x: y_fitted_vander = np.vander(res.fittedvalues, order)[:, :-2] #drop constant exog = np.column_stack((res.model.exog, y_fitted_vander)) res_aux = OLS(res.model.endog, exog).fit() #r_matrix = np.eye(degree, exog.shape[1], k_vars) r_matrix = np.eye(degree-1, exog.shape[1], k_vars) #df1 = degree - 1 #df2 = exog.shape[0] - degree - res.df_model (without constant) return res_aux.f_test(r_matrix) #, r_matrix, res_aux
def notyet_atst(): d = macrodata.load().data realinv = d['realinv'] realgdp = d['realgdp'] realint = d['realint'] endog = realinv exog = add_constant(np.c_[realgdp, realint],prepend=True) res_ols1 = OLS(endog, exog).fit() #growth rates gs_l_realinv = 400 * np.diff(np.log(d['realinv'])) gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'])) lint = d['realint'][:-1] tbilrate = d['tbilrate'][:-1] endogg = gs_l_realinv exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True) exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True) res_ols = OLS(endogg, exogg).fit() res_ols2 = OLS(endogg, exogg2).fit() #the following were done accidentally with res_ols1 in R, #with original Greene data params = np.array([-272.3986041341653, 0.1779455206941112, 0.2149432424658157]) cov_hac_4 = np.array([1321.569466333051, -0.2318836566017612, 37.01280466875694, -0.2318836566017614, 4.602339488102263e-05, -0.0104687835998635, 37.012804668757, -0.0104687835998635, 21.16037144168061]).reshape(3,3, order='F') cov_hac_10 = np.array([2027.356101193361, -0.3507514463299015, 54.81079621448568, -0.350751446329901, 6.953380432635583e-05, -0.01268990195095196, 54.81079621448564, -0.01268990195095195, 22.92512402151113]).reshape(3,3, order='F') #goldfeld-quandt het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.246141976112324e-30, distr='f') het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.) het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.246141976112324e-30, distr='f') #goldfeld-quandt, fraction = 0.5 het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47, pvalue=2.154956842194898e-33, distr='f') gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5) compare_t_est(gq, het_gq_greater, decimal=(13, 14)) assert_equal(gq[-1], 'increasing') harvey_collier = dict(stat=2.28042114041313, df=199, pvalue=0.02364236161988260, distr='t') #hc = harvtest(fm, order.by=ggdp , data = list()) harvey_collier_2 = dict(stat=0.7516918462158783, df=199, pvalue=0.4531244858006127, distr='t')
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() R = np.array([[0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0]]) q = np.array([0, 0, 0, 1, 0]) cls.Ftest1 = res1.f_test(R, q)
def coint(y1, y2, regression="c"): """ This is a simple cointegration test. Uses unit-root test on residuals to test for cointegrated relationship See Hamilton (1994) 19.2 Parameters ---------- y1 : array_like, 1d first element in cointegrating vector y2 : array_like remaining elements in cointegrating vector c : str {'c'} Included in regression * 'c' : Constant Returns ------- coint_t : float t-statistic of unit-root test on residuals pvalue : float MacKinnon's approximate p-value based on MacKinnon (1994) crit_value : dict Critical values for the test statistic at the 1 %, 5 %, and 10 % levels. Notes ----- The Null hypothesis is that there is no cointegration, the alternative hypothesis is that there is cointegrating relationship. If the pvalue is small, below a critical size, then we can reject the hypothesis that there is no cointegrating relationship. P-values are obtained through regression surface approximation from MacKinnon 1994. References ---------- MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for unit-root and cointegration tests. `Journal of Business and Economic Statistics` 12, 167-76. """ regression = regression.lower() if regression not in ['c', 'nc', 'ct', 'ctt']: raise ValueError("regression option %s not understood") % regression y1 = np.asarray(y1) y2 = np.asarray(y2) if regression == 'c': y2 = add_constant(y2) st1_resid = OLS(y1, y2).fit().resid #stage one residuals lgresid_cons = add_constant(st1_resid[0:-1]) uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit() coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0] pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None) crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1)) return coint_t, pvalue, crit_value
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() R = np.array([[0,1,1,0,0,0,0], [0,1,0,1,0,0,0], [0,1,0,0,0,0,0], [0,0,0,0,1,0,0], [0,0,0,0,0,1,0]]) q = np.array([0,0,0,1,0]) cls.Ftest1 = res1.f_test(R,q)
def setupClass(cls): from results.results_regression import Longley data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() res2 = Longley() res2.wresid = res1.wresid # workaround hack cls.res1 = res1 cls.res2 = res2 res_qr = OLS(data.endog, data.exog).fit(method="qr") cls.res_qr = res_qr
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'): """ Plot a reference line for a qqplot. Parameters ---------- ax : matplotlib axes instance The axes on which to plot the line line : str {'45','r','s','q'} Options for the reference line to which the data is compared.: - '45' - 45-degree line - 's' - standardized line, the expected order statistics are scaled by the standard deviation of the given sample and have the mean added to them - 'r' - A regression line is fit - 'q' - A line is fit through the quartiles. - None - By default no reference line is added to the plot. x : array X data for plot. Not needed if line is '45'. y : array Y data for plot. Not needed if line is '45'. dist : scipy.stats.distribution A scipy.stats distribution, needed if line is 'q'. Notes ----- There is no return value. The line is plotted on the given `ax`. """ if line == '45': end_pts = zip(ax.get_xlim(), ax.get_ylim()) end_pts[0] = max(end_pts[0]) end_pts[1] = min(end_pts[1]) ax.plot(end_pts, end_pts, fmt) return # does this have any side effects? if x is None and y is None: raise ValueError("If line is not 45, x and y cannot be None.") elif line == 'r': # could use ax.lines[0].get_xdata(), get_ydata(), # but don't know axes are 'clean' y = OLS(y, add_constant(x)).fit().fittedvalues ax.plot(x, y, fmt) elif line == 's': m, b = y.std(), y.mean() ref_line = x * m + b ax.plot(x, ref_line, fmt) elif line == 'q': q25 = stats.scoreatpercentile(y, 25) q75 = stats.scoreatpercentile(y, 75) theoretical_quartiles = dist.ppf([.25, .75]) m = (q75 - q25) / np.diff(theoretical_quartiles) b = q25 - m * theoretical_quartiles[0] ax.plot(x, m * x + b, fmt)
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'): """ Plot a reference line for a qqplot. Parameters ---------- ax : matplotlib axes instance The axes on which to plot the line line : str {'45','r','s','q'} Options for the reference line to which the data is compared.: - '45' - 45-degree line - 's' - standardized line, the expected order statistics are scaled by the standard deviation of the given sample and have the mean added to them - 'r' - A regression line is fit - 'q' - A line is fit through the quartiles. - None - By default no reference line is added to the plot. x : array X data for plot. Not needed if line is '45'. y : array Y data for plot. Not needed if line is '45'. dist : scipy.stats.distribution A scipy.stats distribution, needed if line is 'q'. Notes ----- There is no return value. The line is plotted on the given `ax`. """ if line == '45': end_pts = zip(ax.get_xlim(), ax.get_ylim()) end_pts[0] = max(end_pts[0]) end_pts[1] = min(end_pts[1]) ax.plot(end_pts, end_pts, fmt) return # does this have any side effects? if x is None and y is None: raise ValueError("If line is not 45, x and y cannot be None.") elif line == 'r': # could use ax.lines[0].get_xdata(), get_ydata(), # but don't know axes are 'clean' y = OLS(y, add_constant(x)).fit().fittedvalues ax.plot(x,y,fmt) elif line == 's': m,b = y.std(), y.mean() ref_line = x*m + b ax.plot(x, ref_line, fmt) elif line == 'q': q25 = stats.scoreatpercentile(y, 25) q75 = stats.scoreatpercentile(y, 75) theoretical_quartiles = dist.ppf([.25,.75]) m = (q75 - q25) / np.diff(theoretical_quartiles) b = q25 - m*theoretical_quartiles[0] ax.plot(x, m*x + b, fmt)
def setupClass(cls): # if skipR: # raise SkipTest, "Rpy not installed" # try: # r.library('car') # except RPyRException: # raise SkipTest, "car library not installed for R" R = np.zeros(7) R[4:6] = [1, -1] # self.R = R data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() cls.Ttest1 = res1.t_test(R)
def setupClass(cls): # if skipR: # raise SkipTest, "Rpy not installed" # try: # r.library('car') # except RPyRException: # raise SkipTest, "car library not installed for R" R = np.zeros(7) R[4:6] = [1,-1] # self.R = R data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() cls.Ttest1 = res1.t_test(R)
def __init__(self): super(self.__class__, self).__init__() #initialize DGP nobs = self.nobs y_true, x, exog = self.y_true, self.x, self.exog np.random.seed(8765993) sigma_noise = 0.1 y = y_true + sigma_noise * np.random.randn(nobs) m = AdditiveModel(x) m.fit(y) res_gam = m.results #TODO: currently attached to class res_ols = OLS(y, exog).fit() #Note: there still are some naming inconsistencies self.res1 = res1 = Dummy() #for gam model #res2 = Dummy() #for benchmark self.res2 = res2 = res_ols #reuse existing ols results, will add additional res1.y_pred = res_gam.predict(x) res2.y_pred = res_ols.model.predict(res_ols.params, exog) res1.y_predshort = res_gam.predict(x[:10]) slopes = [i for ss in m.smoothers for i in ss.params[1:]] const = res_gam.alpha + sum([ss.params[1] for ss in m.smoothers]) #print const, slopes res1.params = np.array([const] + slopes)
def fitbygroups(self): '''Fit OLS regression for each group separately. Returns ------- results are attached olsbygroup : dictionary of result instance the returned regression results for each group sigmabygroup : array (ngroups,) (this should be called sigma2group ??? check) mse_resid for each group weights : array (nobs,) standard deviation of group extended to the original observations. This can be used as weights in WLS for group-wise heteroscedasticity. ''' olsbygroup = {} sigmabygroup = [] for gi, group in enumerate( self.unique): #np.arange(len(self.unique))): groupmask = self.groupsint == gi #group index res = OLS(self.endog[groupmask], self.exog[groupmask]).fit() olsbygroup[group] = res sigmabygroup.append(res.mse_resid) self.olsbygroup = olsbygroup self.sigmabygroup = np.array(sigmabygroup) self.weights = np.sqrt( self.sigmabygroup[self.groupsint]) #TODO:chk sqrt
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) ols_res = OLS(data.endog, data.exog).fit() gls_res = GLS(data.endog, data.exog).fit() cls.res1 = gls_res cls.res2 = ols_res
def pacf_ols(x, nlags=40): '''Calculate partial autocorrelations Parameters ---------- x : 1d array observations of time series for which pacf is calculated nlags : int Number of lags for which pacf is returned. Lag 0 is not returned. Returns ------- pacf : 1d array partial autocorrelations, maxlag+1 elements Notes ----- This solves a separate OLS estimation for each desired lag. ''' #TODO: add warnings for Yule-Walker #NOTE: demeaning and not using a constant gave incorrect answers? #JP: demeaning should have a better estimate of the constant #maybe we can compare small sample properties with a MonteCarlo xlags, x0 = lagmat(x, nlags, original='sep') #xlags = sm.add_constant(lagmat(x, nlags), prepend=True) xlags = add_constant(xlags, prepend=True) pacf = [1.] for k in range(1, nlags + 1): res = OLS(x0[k:], xlags[k:, :k + 1]).fit() #np.take(xlags[k:], range(1,k+1)+[-1], pacf.append(res.params[-1]) return np.array(pacf)
def setupClass(cls): np.random.seed(54321) cls.endog_n_ = np.random.uniform(0, 20, size=30) cls.endog_n_one = cls.endog_n_[:, None] cls.exog_n_ = np.random.uniform(0, 20, size=30) cls.exog_n_one = cls.exog_n_[:, None] cls.degen_exog = cls.exog_n_one[:-1] cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one) cls.mod1.df_model += 1 #cls.mod1.df_resid -= 1 cls.res1 = cls.mod1.fit() # Note that these are created for every subclass.. # A little extra overhead probably cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit()
def spec_hausman(self, dof=None): '''Hausman's specification test See Also -------- spec_hausman : generic function for Hausman's specification test ''' #use normalized cov_params for OLS resols = OLS(endog, exog).fit() normalized_cov_params_ols = resols.model.normalized_cov_params se2 = resols.mse_resid params_diff = self._results.params - resols.params cov_diff = np.linalg.pinv(self.xhatprod) - normalized_cov_params_ols #TODO: the following is very inefficient, solves problem (svd) twice #use linalg.lstsq or svd directly #cov_diff will very often be in-definite (singular) if not dof: dof = tools.rank(cov_diff) cov_diffpinv = np.linalg.pinv(cov_diff) H = np.dot(params_diff, np.dot(cov_diffpinv, params_diff))/se2 pval = stats.chi2.sf(H, dof) return H, pval, dof
def test_cov_cluster_2groups(): #comparing cluster robust standard errors to Peterson #requires Petersen's test_data #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt import os cur_dir = os.path.abspath(os.path.dirname(__file__)) fpath = os.path.join(cur_dir, "test_data.txt") pet = np.genfromtxt(fpath) endog = pet[:, -1] group = pet[:, 0].astype(int) time = pet[:, 1].astype(int) exog = add_constant(pet[:, 2], prepend=True) res = OLS(endog, exog).fit() cov01, covg, covt = sw.cov_cluster_2groups(res, group, group2=time) #Reference number from Petersen #http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.htm bse_petw = [0.0284, 0.0284] bse_pet0 = [0.0670, 0.0506] bse_pet1 = [0.0234, 0.0334] #year bse_pet01 = [0.0651, 0.0536] #firm and year bse_0 = sw.se_cov(covg) bse_1 = sw.se_cov(covt) bse_01 = sw.se_cov(cov01) #print res.HC0_se, bse_petw - res.HC0_se #print bse_0, bse_0 - bse_pet0 #print bse_1, bse_1 - bse_pet1 #print bse_01, bse_01 - bse_pet01 assert_almost_equal(bse_petw, res.HC0_se, decimal=4) assert_almost_equal(bse_0, bse_pet0, decimal=4) assert_almost_equal(bse_1, bse_pet1, decimal=4) assert_almost_equal(bse_01, bse_pet01, decimal=4)
def test_hac_simple(): from gwstatsmodels.datasets import macrodata d2 = macrodata.load().data g_gdp = 400 * np.diff(np.log(d2['realgdp'])) g_inv = 400 * np.diff(np.log(d2['realinv'])) exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=True) res_olsg = OLS(g_inv, exogg).fit() #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=TRUE) #Lag truncation parameter chosen: 4 # (Intercept) ggdp lint cov1_r = [ [1.40643899878678802, -0.3180328707083329709, -0.060621111216488610], [-0.31803287070833292, 0.1097308348999818661, 0.000395311760301478], [-0.06062111121648865, 0.0003953117603014895, 0.087511528912470993] ] #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=FALSE) #Lag truncation parameter chosen: 4 # (Intercept) ggdp lint cov2_r = [ [1.3855512908840137, -0.313309610252268500, -0.059720797683570477], [-0.3133096102522685, 0.108101169035130618, 0.000389440793564339], [-0.0597207976835705, 0.000389440793564336, 0.086211852740503622] ] cov1, se1 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=True) cov2, se2 = sw.cov_hac_simple(res_olsg, nlags=4, use_correction=False) assert_almost_equal(cov1, cov1_r, decimal=14) assert_almost_equal(cov2, cov2_r, decimal=14)
def iterative_fit(self, maxiter=3): """ Perform an iterative two-step procedure to estimate a WLS model. The model is assumed to have heteroscedastic errors. The variance is estimated by OLS regression of the link transformed squared residuals on Z, i.e.:: link(sigma_i) = x_i*gamma. Parameters ---------- maxiter : integer, optional the number of iterations Notes ----- maxiter=1: returns the estimated based on given weights maxiter=2: performs a second estimation with the updated weights, this is 2-step estimation maxiter>2: iteratively estimate and update the weights TODO: possible extension stop iteration if change in parameter estimates is smaller than x_tol Repeated calls to fit_iterative, will do one redundant pinv_wexog calculation. Calling fit_iterative(maxiter) ones does not do any redundant recalculations (whitening or calculating pinv_wexog). """ import collections self.history = collections.defaultdict(list) #not really necessary res_resid = None #if maxiter < 2 no updating for i in range(maxiter): #pinv_wexog is cached if hasattr(self, 'pinv_wexog'): del self.pinv_wexog #self.initialize() #print 'wls self', results = self.fit() self.history['self_params'].append(results.params) if not i == maxiter - 1: #skip for last iteration, could break instead #print 'ols', self.results_old = results #for debugging #estimate heteroscedasticity res_resid = OLS(self.link(results.resid**2), self.exog_var).fit() self.history['ols_params'].append(res_resid.params) #update weights self.weights = 1. / self.linkinv(res_resid.fittedvalues) self.weights /= self.weights.max() #not required self.weights[self.weights < 1e-14] = 1e-14 #clip #print 'in iter', i, self.weights.var() #debug, do weights change self.initialize() #note results is the wrapper, results._results is the results instance results._results.results_residual_regression = res_resid return results
def fit(self): res_pooled = self.fit_ols() #get starting estimate sigma_i = self.get_within_cov(res_pooled.resid) self.cholsigmainv_i = np.linalg.cholesky(np.linalg.pinv(sigma_i)).T wendog = self.whiten_groups(self.endog, self.cholsigmainv_i) wexog = self.whiten_groups(self.exog, self.cholsigmainv_i) print wendog.shape, wexog.shape self.res1 = OLS(wendog, wexog).fit() return self.res1
def test_recursive_residuals(self): reccumres_standardize = np.array([-2.151, -3.748, -3.114, -3.096, -1.865, -2.230, -1.194, -3.500, -3.638, -4.447, -4.602, -4.631, -3.999, -4.830, -5.429, -5.435, -6.554, -8.093, -8.567, -7.532, -7.079, -8.468, -9.320, -12.256, -11.932, -11.454, -11.690, -11.318, -12.665, -12.842, -11.693, -10.803, -12.113, -12.109, -13.002, -11.897, -10.787, -10.159, -9.038, -9.007, -8.634, -7.552, -7.153, -6.447, -5.183, -3.794, -3.511, -3.979, -3.236, -3.793, -3.699, -5.056, -5.724, -4.888, -4.309, -3.688, -3.918, -3.735, -3.452, -2.086, -6.520, -7.959, -6.760, -6.855, -6.032, -4.405, -4.123, -4.075, -3.235, -3.115, -3.131, -2.986, -1.813, -4.824, -4.424, -4.796, -4.000, -3.390, -4.485, -4.669, -4.560, -3.834, -5.507, -3.792, -2.427, -1.756, -0.354, 1.150, 0.586, 0.643, 1.773, -0.830, -0.388, 0.517, 0.819, 2.240, 3.791, 3.187, 3.409, 2.431, 0.668, 0.957, -0.928, 0.327, -0.285, -0.625, -2.316, -1.986, -0.744, -1.396, -1.728, -0.646, -2.602, -2.741, -2.289, -2.897, -1.934, -2.532, -3.175, -2.806, -3.099, -2.658, -2.487, -2.515, -2.224, -2.416, -1.141, 0.650, -0.947, 0.725, 0.439, 0.885, 2.419, 2.642, 2.745, 3.506, 4.491, 5.377, 4.624, 5.523, 6.488, 6.097, 5.390, 6.299, 6.656, 6.735, 8.151, 7.260, 7.846, 8.771, 8.400, 8.717, 9.916, 9.008, 8.910, 8.294, 8.982, 8.540, 8.395, 7.782, 7.794, 8.142, 8.362, 8.400, 7.850, 7.643, 8.228, 6.408, 7.218, 7.699, 7.895, 8.725, 8.938, 8.781, 8.350, 9.136, 9.056, 10.365, 10.495, 10.704, 10.784, 10.275, 10.389, 11.586, 11.033, 11.335, 11.661, 10.522, 10.392, 10.521, 10.126, 9.428, 9.734, 8.954, 9.949, 10.595, 8.016, 6.636, 6.975]) rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95) assert_equal(np.round(rr[5][1:], 3), reccumres_standardize) #extra zero in front #assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize)) assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize),3) assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4) #regression number, visually checked with graph from gretl ub0 = np.array([ 13.37318571, 13.50758959, 13.64199346, 13.77639734, 13.91080121]) ub1 = np.array([ 39.44753774, 39.58194162, 39.7163455 , 39.85074937, 39.98515325]) lb, ub = rr[6] assert_almost_equal(ub[:5], ub0, decimal=7) assert_almost_equal(lb[:5], -ub0, decimal=7) assert_almost_equal(ub[-5:], ub1, decimal=7) assert_almost_equal(lb[-5:], -ub1, decimal=7) #test a few values with explicit OLS endog = self.res.model.endog exog = self.res.model.exog params = [] ypred = [] for i in range(3,10): resi = OLS(endog[:i], exog[:i]).fit() ypred.append(resi.model.predict(resi.params, exog[i])) params.append(resi.params) assert_almost_equal(rr[2][3:10], ypred, decimal=12) assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12) assert_almost_equal(rr[1][2:9], params, decimal=12)
def fitpooled(self): '''fit the pooled model, which assumes there are no differences across groups ''' if self.het: if not hasattr(self, 'weights'): self.fitbygroups() weights = self.weights res = WLS(self.endog, self.exog, weights=weights).fit() else: res = OLS(self.endog, self.exog).fit() self.lspooled = res
def __init__(self): super(self.__class__, self).__init__() #initialize DGP y, x, exog = self.y, self.x, self.exog #use order = 2 in regression pmod = smoothers.PolySmoother(2, x) pmod.fit(y) #no return self.res_ps = pmod self.res2 = OLS(y, exog[:, :2 + 1]).fit()
def plot_partregress_ax(endog, exog_i, exog_others, varname='', title_fontsize=None, ax=None): """Plot partial regression for a single regressor. Parameters ---------- endog : ndarray endogenous or response variable exog_i : ndarray exogenous, explanatory variable exog_others : ndarray other exogenous, explanatory variables, the effect of these variables will be removed by OLS regression varname : str name of the variable used in the title ax : Matplotlib AxesSubplot instance, optional If given, this subplot is used to plot in instead of a new figure being created. Returns ------- fig : Matplotlib figure instance If `ax` is None, the created figure. Otherwise the figure to which `ax` is connected. See Also -------- plot_partregress : Plot partial regression for a set of regressors. """ fig, ax = utils.create_mpl_ax(ax) res1a = OLS(endog, exog_others).fit() res1b = OLS(exog_i, exog_others).fit() ax.plot(res1b.resid, res1a.resid, 'o') res1c = OLS(res1a.resid, res1b.resid).fit() ax.plot(res1b.resid, res1c.fittedvalues, '-', color='k') ax.set_title('Partial Regression plot %s' % varname, fontsize=title_fontsize)# + namestr) return fig
def fit(self): alpha0 = 0.1 #startvalue func = self.fit_conditional fitres = optimize.fmin(func, alpha0) # fit_conditional only returns ssr for now alpha = fitres[0] y = self.ar1filter(self.endog, alpha) x = self.ar1filter(self.exog, alpha) reso = OLS(y, x).fit() return fitres, reso
def __init__(self): super(self.__class__, self).__init__() #initialize DGP y, x, exog = self.y, self.x, self.exog #use order = 3 in regression pmod = smoothers.PolySmoother(3, x) #pmod.fit(y) #no return pmod.smooth(y) #no return, use alias for fit self.res_ps = pmod self.res2 = OLS(y, exog[:, :3 + 1]).fit()
def setupClass(cls): from results.results_regression import LongleyGls data = longley.load() exog = add_constant(np.column_stack(\ (data.exog[:,1],data.exog[:,4]))) tmp_results = OLS(data.endog, exog).fit() rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][1] # by assumption order = toeplitz(np.arange(16)) sigma = rho**order GLS_results = GLS(data.endog, exog, sigma=sigma).fit() cls.res1 = GLS_results cls.res2 = LongleyGls()
def _ols_xnoti(self, drop_idx, endog_idx='endog', store=True): '''regression results from LOVO auxiliary regression with cache The result instances are stored, which could use a large amount of memory if the datasets are large. There are too many combinations to store them all, except for small problems. Parameters ---------- drop_idx : int index of exog that is dropped from the regression endog_idx : 'endog' or int If 'endog', then the endogenous variable of the result instance is regressed on the exogenous variables, excluding the one at drop_idx. If endog_idx is an integer, then the exog with that index is regressed with OLS on all other exogenous variables. (The latter is the auxiliary regression for the variance inflation factor.) this needs more thought, memory versus speed not yet used in any other parts, not sufficiently tested ''' #reverse the structure, access store, if fail calculate ? #this creates keys in store even if store = false ! bug if endog_idx == 'endog': stored = self.aux_regression_endog if hasattr(stored, drop_idx): return stored[drop_idx] x_i = self.results.model.endog else: #nested dictionary try: self.aux_regression_exog[endog_idx][drop_idx] except KeyError: pass stored = self.aux_regression_exog[endog_idx] stored = {} x_i = self.exog[:, endog_idx] mask = np.arange(k_vars) != drop_idx x_noti = self.exog[:, mask] res = OLS(x_i, x_noti).fit() if store: stored[drop_idx] = res return res
def __init__(self): d = macrodata.load().data #growth rates gs_l_realinv = 400 * np.diff(np.log(d['realinv'])) gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'])) lint = d['realint'][:-1] tbilrate = d['tbilrate'][:-1] endogg = gs_l_realinv exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True) exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True) exogg3 = add_constant(np.c_[gs_l_realgdp], prepend=True) res_ols = OLS(endogg, exogg).fit() res_ols2 = OLS(endogg, exogg2).fit() res_ols3 = OLS(endogg, exogg3).fit() self.res = res_ols self.res2 = res_ols2 self.res3 = res_ols3 self.endog = self.res.model.endog self.exog = self.res.model.exog
def fit(self, lambd=1.): #maybe iterate #preliminary estimate res_gls = GLS(self.endog, self.exog, sigma=self.sigma).fit() res_resid = OLS(res_gls.resid**2, self.exog_var).fit() #or log-link #res_resid = OLS(np.log(res_gls.resid**2), self.exog_var).fit() #here I could use whiten and current instance instead of delegating #but this is easier #see pattern of GLSAR, calls self.initialize and self.fit res_wls = WLS(self.endog, self.exog, weights=1. / res_resid.fittedvalues).fit() res_wls._results.results_residual_regression = res_resid return res_wls
def variance_inflation_factor(exog, exog_idx): '''variance inflation factor, VIF, for one exogenous variable The variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression. It is a measure for multicollinearity of the design matrix, exog. One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this. Parameters ---------- exog : ndarray, (nobs, k_vars) design matrix with all explanatory variables, as for example used in regression exog_idx : int index of the exogenous variable in the columns of exog Returns ------- vif : float variance inflation factor Notes ----- This function does not save the auxiliary regression. See Also -------- xxx : class for regression diagnostics TODO: doesn't exist yet References ---------- http://en.wikipedia.org/wiki/Variance_inflation_factor ''' k_vars = exog.shape[1] x_i = exog[:, exog_idx] mask = np.arange(k_vars) != exog_idx x_noti = exog[:, mask] r_squared_i = OLS(x_i, x_noti).fit().rsquared vif = 1. / (1. - r_squared_i) return vif
#using GMM and IV2SLS classes #---------------------------- mod = IVGMM(endog, exog, instrument, nmoms=instrument.shape[1]) res = mod.fit() modgmmols = IVGMM(endog, exog, exog, nmoms=exog.shape[1]) resgmmols = modgmmols.fit() #the next is the same as IV2SLS, (Z'Z)^{-1} as weighting matrix modgmmiv = IVGMM(endog, exog, instrument, nmoms=instrument.shape[1]) #same as mod resgmmiv = modgmmiv.fitgmm(np.ones(exog.shape[1], float), weights=np.linalg.inv(np.dot(instrument.T, instrument))) modls = IV2SLS(endog, exog, instrument) resls = modls.fit() modols = OLS(endog, exog) resols = modols.fit() print '\nIV case' print 'params' print 'IV2SLS', resls.params print 'GMMIV ', resgmmiv # .params print 'GMM ', res.params print 'diff ', res.params - resls.params print 'OLS ', resols.params print 'GMMOLS', resgmmols.params print '\nbse' print 'IV2SLS', resls.bse print 'GMM ', mod.bse #bse currently only attached to model not results print 'diff ', mod.bse - resls.bse
def setupClass(cls): data = longley.load() data.exog = add_constant(data.exog) res1 = OLS(data.endog, data.exog).fit() R2 = [[0,1,-1,0,0,0,0],[0, 0, 0, 0, 1, -1, 0]] cls.Ftest1 = res1.f_test(R2)
def test_all(self): d = macrodata.load().data #import datasetswsm.greene as g #d = g.load('5-1') #growth rates gs_l_realinv = 400 * np.diff(np.log(d['realinv'])) gs_l_realgdp = 400 * np.diff(np.log(d['realgdp'])) #simple diff, not growthrate, I want heteroscedasticity later for testing endogd = np.diff(d['realinv']) exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]], prepend=True) endogg = gs_l_realinv exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]], prepend=True) res_ols = OLS(endogg, exogg).fit() #print res_ols.params mod_g1 = GLSAR(endogg, exogg, rho=-0.108136) res_g1 = mod_g1.fit() #print res_g1.params mod_g2 = GLSAR(endogg, exogg, rho=-0.108136) #-0.1335859) from R res_g2 = mod_g2.iterative_fit(maxiter=5) #print res_g2.params rho = -0.108136 # coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL partable = np.array([ [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670], # *** [4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086], # *** [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346] ]) # ** #Statistics based on the rho-differenced data: result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.113973), endog_std=("S.D. dependent var", 18.67447), ssr=("Sum squared resid", 22530.90), mse_resid_sqrt=("S.E. of regression", 10.66735), rsquared=("R-squared", 0.676973), rsquared_adj=("Adjusted R-squared", 0.673710), fvalue=("F(2, 198)", 221.0475), f_pvalue=("P-value(F)", 3.56e-51), resid_acf1=("rho", -0.003481), dw=("Durbin-Watson", 1.993858)) #fstatistic, p-value, df1, df2 reset_2_3 = [5.219019, 0.00619, 2, 197, "f"] reset_2 = [7.268492, 0.00762, 1, 198, "f"] reset_3 = [5.248951, 0.023, 1, 198, "f"] #LM-statistic, p-value, df arch_4 = [7.30776, 0.120491, 4, "chi2"] #multicollinearity vif = [1.002, 1.002] cond_1norm = 6862.0664 determinant = 1.0296049e+009 reciprocal_condition_number = 0.013819244 #Chi-square(2): test-statistic, pvalue, df normality = [20.2792, 3.94837e-005, 2] #tests res = res_g1 #with rho from Gretl #basic assert_almost_equal(res.params, partable[:, 0], 4) assert_almost_equal(res.bse, partable[:, 1], 6) assert_almost_equal(res.tvalues, partable[:, 2], 2) assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2) #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5) assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4) assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=2) #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO #arch #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None) sm_arch = smsdia.het_arch(res.wresid, maxlag=4) assert_almost_equal(sm_arch[0], arch_4[0], decimal=4) assert_almost_equal(sm_arch[1], arch_4[1], decimal=6) #tests res = res_g2 #with estimated rho #estimated lag coefficient assert_almost_equal(res.model.rho, rho, decimal=3) #basic assert_almost_equal(res.params, partable[:, 0], 4) assert_almost_equal(res.bse, partable[:, 1], 3) assert_almost_equal(res.tvalues, partable[:, 2], 2) assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2) #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5) assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) assert_almost_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], decimal=6) #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO c = oi.reset_ramsey(res, degree=2) compare_ftest(c, reset_2, decimal=(2, 4)) c = oi.reset_ramsey(res, degree=3) compare_ftest(c, reset_2_3, decimal=(2, 4)) #arch #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None) sm_arch = smsdia.het_arch(res.wresid, maxlag=4) assert_almost_equal(sm_arch[0], arch_4[0], decimal=1) assert_almost_equal(sm_arch[1], arch_4[1], decimal=2) ''' Performing iterative calculation of rho... ITER RHO ESS 1 -0.10734 22530.9 2 -0.10814 22530.9 Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv rho = -0.108136 coefficient std. error t-ratio p-value ------------------------------------------------------------- const -9.50990 0.990456 -9.602 3.65e-018 *** ds_l_realgdp 4.37040 0.208146 21.00 2.93e-052 *** realint_1 -0.579253 0.268009 -2.161 0.0319 ** Statistics based on the rho-differenced data: Mean dependent var 3.113973 S.D. dependent var 18.67447 Sum squared resid 22530.90 S.E. of regression 10.66735 R-squared 0.676973 Adjusted R-squared 0.673710 F(2, 198) 221.0475 P-value(F) 3.56e-51 rho -0.003481 Durbin-Watson 1.993858 ''' ''' RESET test for specification (squares and cubes) Test statistic: F = 5.219019, with p-value = P(F(2,197) > 5.21902) = 0.00619 RESET test for specification (squares only) Test statistic: F = 7.268492, with p-value = P(F(1,198) > 7.26849) = 0.00762 RESET test for specification (cubes only) Test statistic: F = 5.248951, with p-value = P(F(1,198) > 5.24895) = 0.023: ''' ''' Test for ARCH of order 4 coefficient std. error t-ratio p-value -------------------------------------------------------- alpha(0) 97.0386 20.3234 4.775 3.56e-06 *** alpha(1) 0.176114 0.0714698 2.464 0.0146 ** alpha(2) -0.0488339 0.0724981 -0.6736 0.5014 alpha(3) -0.0705413 0.0737058 -0.9571 0.3397 alpha(4) 0.0384531 0.0725763 0.5298 0.5968 Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491: ''' ''' Variance Inflation Factors Minimum possible value = 1.0 Values > 10.0 may indicate a collinearity problem ds_l_realgdp 1.002 realint_1 1.002 VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient between variable j and the other independent variables Properties of matrix X'X: 1-norm = 6862.0664 Determinant = 1.0296049e+009 Reciprocal condition number = 0.013819244 ''' ''' Test for ARCH of order 4 - Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491 Test of common factor restriction - Null hypothesis: restriction is acceptable Test statistic: F(2, 195) = 0.426391 with p-value = P(F(2, 195) > 0.426391) = 0.653468 Test for normality of residual - Null hypothesis: error is normally distributed Test statistic: Chi-square(2) = 20.2792 with p-value = 3.94837e-005: ''' #no idea what this is ''' Augmented regression for common factor test OLS, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv coefficient std. error t-ratio p-value --------------------------------------------------------------- const -10.9481 1.35807 -8.062 7.44e-014 *** ds_l_realgdp 4.28893 0.229459 18.69 2.40e-045 *** realint_1 -0.662644 0.334872 -1.979 0.0492 ** ds_l_realinv_1 -0.108892 0.0715042 -1.523 0.1294 ds_l_realgdp_1 0.660443 0.390372 1.692 0.0923 * realint_2 0.0769695 0.341527 0.2254 0.8219 Sum of squared residuals = 22432.8 Test of common factor restriction Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468 ''' ################ with OLS, HAC errors #Model 5: OLS, using observations 1959:2-2009:3 (T = 202) #Dependent variable: ds_l_realinv #HAC standard errors, bandwidth 4 (Bartlett kernel) #coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL #for confidence interval t(199, 0.025) = 1.972 partable = np.array([ [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049], # *** [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258], #*** [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939] ]) # ** result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.257395), endog_std=("S.D. dependent var", 18.73915), ssr=("Sum squared resid", 22799.68), mse_resid_sqrt=("S.E. of regression", 10.70380), rsquared=("R-squared", 0.676978), rsquared_adj=("Adjusted R-squared", 0.673731), fvalue=("F(2, 199)", 90.79971), f_pvalue=("P-value(F)", 9.53e-29), llf=("Log-likelihood", -763.9752), aic=("Akaike criterion", 1533.950), bic=("Schwarz criterion", 1543.875), hqic=("Hannan-Quinn", 1537.966), resid_acf1=("rho", -0.107341), dw=("Durbin-Watson", 2.213805)) linear_logs = [1.68351, 0.430953, 2, "chi2"] #for logs: dropping 70 nan or incomplete observations, T=133 #(res_ols.model.exog <=0).any(1).sum() = 69 ?not 70 linear_squares = [7.52477, 0.0232283, 2, "chi2"] #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4 lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"] lm2_acorr4 = [4.771043, 0.312, 4, "chi2"] acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"] #break cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"] #stats.t.sf(0.494432, 198)*2 #see cusum results in files break_qlr = [3.01985, 0.1, 3, 196, "maxF"] #TODO check this, max at 2001:4 break_chow = [13.1897, 0.00424384, 3, "chi2"] # break at 1984:1 arch_4 = [3.43473, 0.487871, 4, "chi2"] normality = [23.962, 0.00001, 2, "chi2"] het_white = [33.503723, 0.000003, 5, "chi2"] het_breush_pagan = [1.302014, 0.521520, 2, "chi2"] #TODO: not available het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"] reset_2_3 = [5.219019, 0.00619, 2, 197, "f"] reset_2 = [7.268492, 0.00762, 1, 198, "f"] reset_3 = [5.248951, 0.023, 1, 198, "f"] #not available cond_1norm = 5984.0525 determinant = 7.1087467e+008 reciprocal_condition_number = 0.013826504 vif = [1.001, 1.001] names = 'date residual leverage influence DFFITS'.split( ) cur_dir = os.path.abspath(os.path.dirname(__file__)) fpath = os.path.join(cur_dir, 'results/leverage_influence_ols_nostars.txt') lev = np.genfromtxt(fpath, skip_header=3, skip_footer=1, converters={0: lambda s: s}) #either numpy 1.6 or python 3.2 changed behavior if np.isnan(lev[-1]['f1']): lev = np.genfromtxt(fpath, skip_header=3, skip_footer=2, converters={0: lambda s: s}) lev.dtype.names = names res = res_ols #for easier copying cov_hac, bse_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False) assert_almost_equal(res.params, partable[:, 0], 5) assert_almost_equal(bse_hac, partable[:, 1], 5) #TODO assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2) #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=6) #FAIL assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=6) #FAIL assert_almost_equal(np.sqrt(res.mse_resid), result_gretl_g1['mse_resid_sqrt'][1], decimal=5) #f-value is based on cov_hac I guess #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO c = oi.reset_ramsey(res, degree=2) compare_ftest(c, reset_2, decimal=(6, 5)) c = oi.reset_ramsey(res, degree=3) compare_ftest(c, reset_2_3, decimal=(6, 5)) linear_sq = smsdia.linear_lm(res.resid, res.model.exog) assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6) assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7) hbpk = smsdia.het_breushpagan(res.resid, res.model.exog) assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6) assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6) hw = smsdia.het_white(res.resid, res.model.exog) assert_almost_equal(hw[:2], het_white[:2], 6) #arch #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None) sm_arch = smsdia.het_arch(res.resid, maxlag=4) assert_almost_equal(sm_arch[0], arch_4[0], decimal=5) assert_almost_equal(sm_arch[1], arch_4[1], decimal=6) vif2 = [ oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2] ] infl = oi.OLSInfluence(res_ols) #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0])) #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag)) #print np.max(np.abs(lev['influence'] - infl.influence)) #just added this based on Gretl #just rough test, low decimal in Gretl output, assert_almost_equal(lev['residual'], res.resid, decimal=3) assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3) assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3) assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def grangercausalitytests(x, maxlag, addconst=True, verbose=True): '''four tests for granger causality of 2 timeseries all four tests give similar results `params_ftest` and `ssr_ftest` are equivalent based of F test which is identical to lmtest:grangertest in R Parameters ---------- x : array, 2d, (nobs,2) data for test whether the time series in the second column Granger causes the time series in the first column maxlag : integer the Granger causality test results are calculated for all lags up to maxlag verbose : bool print results if true Returns ------- results : dictionary all test results, dictionary keys are the number of lags. For each lag the values are a tuple, with the first element a dictionary with teststatistic, pvalues, degrees of freedom, the second element are the OLS estimation results for the restricted model, the unrestricted model and the restriction (contrast) matrix for the parameter f_test. Notes ----- TODO: convert to class and attach results properly The Null hypothesis for grangercausalitytests is that the time series in the second column, x2, Granger causes the time series in the first column, x1. This means that past values of x2 have a statistically significant effect on the current value of x1, taking also past values of x1 into account, as regressors. We reject the null hypothesis of x2 Granger causing x1 if the pvalues are below a desired size of the test. 'params_ftest', 'ssr_ftest' are based on F test 'ssr_chi2test', 'lrtest' are based on chi-square test ''' from scipy import stats # lazy import resli = {} for mlg in range(1, maxlag+1): result = {} if verbose: print '\nGranger Causality' print 'number of lags (no zero)', mlg mxlg = mlg #+ 1 # Note number of lags starting at zero in lagmat # create lagmat of both time series dta = lagmat2ds(x, mxlg, trim='both', dropex=1) #add constant if addconst: dtaown = add_constant(dta[:,1:mxlg+1]) dtajoint = add_constant(dta[:,1:]) else: raise ValueError('Not Implemented') dtaown = dta[:,1:mxlg] dtajoint = dta[:,1:] #run ols on both models without and with lags of second variable res2down = OLS(dta[:,0], dtaown).fit() res2djoint = OLS(dta[:,0], dtajoint).fit() #print results #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm #the other tests are made-up # Granger Causality test using ssr (F statistic) fgc1 = (res2down.ssr-res2djoint.ssr)/res2djoint.ssr/(mxlg)*res2djoint.df_resid if verbose: print 'ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \ (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg) result['ssr_ftest'] = (fgc1, stats.f.sf(fgc1, mxlg, res2djoint.df_resid), res2djoint.df_resid, mxlg) # Granger Causality test using ssr (ch2 statistic) fgc2 = res2down.nobs*(res2down.ssr-res2djoint.ssr)/res2djoint.ssr if verbose: print 'ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, df=%d' % \ (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg) result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg) #likelihood ratio test pvalue: lr = -2*(res2down.llf-res2djoint.llf) if verbose: print 'likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' % \ (lr, stats.chi2.sf(lr, mxlg), mxlg) result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg) # F test that all lag coefficients of exog are zero rconstr = np.column_stack((np.zeros((mxlg-1,mxlg-1)), np.eye(mxlg-1, mxlg-1),\ np.zeros((mxlg-1, 1)))) rconstr = np.column_stack((np.zeros((mxlg,mxlg)), np.eye(mxlg, mxlg),\ np.zeros((mxlg, 1)))) ftres = res2djoint.f_test(rconstr) if verbose: print 'parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \ (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num) result['params_ftest'] = (np.squeeze(ftres.fvalue)[()], np.squeeze(ftres.pvalue)[()], ftres.df_denom, ftres.df_num) resli[mxlg] = (result, [res2down, res2djoint, rconstr]) return resli