def get_griliches76_data():
    """Load the Griliches (1976) data set and build regression matrices.

    Reads ``griliches76.dta`` from the directory that contains this module,
    adds one 0/1 dummy column ``D_<year>`` for every distinct value of the
    ``year`` column plus a ``const`` column, and selects the columns used
    as regressors and instruments.

    Returns
    -------
    Y : the ``lw`` column (dependent variable)
    X : regressors ``s, iq, expr, tenure, rns, smsa`` and the year dummies,
        with a constant prepended
    Z : instruments ``expr, tenure, rns, smsa``, the year dummies and
        ``med, kww, age, mrt``, with a constant added at ``add_constant``'s
        default position
    """
    import os

    curdir = os.path.split(__file__)[0]
    path = os.path.join(curdir, 'griliches76.dta')
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    griliches76_data = iolib.genfromdta(path, missing_flt=np.nan,
                                        pandas=True)

    # create year dummies
    # Vectorized comparison instead of a per-row loop through the ``.ix``
    # indexer (removed in pandas 1.0); the result is the same float 0/1 column.
    for yr in griliches76_data['year'].unique():
        is_year = griliches76_data['year'] == yr
        griliches76_data['D_%i' % yr] = is_year.astype(float)

    griliches76_data['const'] = 1

    year_dummies = ['D_67', 'D_68', 'D_69', 'D_70', 'D_71', 'D_73']

    X = add_constant(
        griliches76_data[['s', 'iq', 'expr', 'tenure', 'rns', 'smsa']
                         + year_dummies],
        prepend=True)  # for R comparison
    # prepend=False)  # for Stata comparison
    Z = add_constant(
        griliches76_data[['expr', 'tenure', 'rns', 'smsa']
                         + year_dummies
                         + ['med', 'kww', 'age', 'mrt']])
    Y = griliches76_data['lw']

    return Y, X, Z
def setup_class(cls):
    """Fit the three OLS growth-rate regressions shared by the tests.

    Stores the cleaned data frame as ``cls.d``, the fitted results as
    ``cls.res``, ``cls.res2`` and ``cls.res3``, and the arrays of the first
    model as ``cls.endog`` and ``cls.exog``.
    """
    frame = macrodata.load_pandas().data

    # growth rates: 400 * first difference of the log level
    frame['gs_l_realinv'] = 400 * np.log(frame['realinv']).diff()
    frame['gs_l_realgdp'] = 400 * np.log(frame['realgdp']).diff()
    # both interest-rate columns enter shifted by one row
    frame['lint'] = frame['realint'].shift(1)
    frame['tbilrate'] = frame['tbilrate'].shift(1)

    # diff/shift leave a missing first row
    frame = frame.dropna()
    cls.d = frame

    response = frame['gs_l_realinv']
    designs = [
        add_constant(frame[['gs_l_realgdp', 'lint']]),
        add_constant(frame[['gs_l_realgdp', 'tbilrate']]),
        add_constant(frame[['gs_l_realgdp']]),
    ]
    cls.res, cls.res2, cls.res3 = [OLS(response, design).fit()
                                   for design in designs]

    cls.endog = cls.res.model.endog
    cls.exog = cls.res.model.exog
def __init__(self):
    """Fit three OLS growth-rate regressions and keep them on the instance.

    Sets ``self.d`` (cleaned data), ``self.res`` / ``self.res2`` /
    ``self.res3`` (fitted results) and ``self.endog`` / ``self.exog``
    (arrays of the first model).
    """
    data = macrodata.load_pandas().data

    # growth rates: 400 * first difference of the log level
    data["gs_l_realinv"] = 400 * np.log(data["realinv"]).diff()
    data["gs_l_realgdp"] = 400 * np.log(data["realgdp"]).diff()
    # interest-rate columns are shifted by one row
    data["lint"] = data["realint"].shift(1)
    data["tbilrate"] = data["tbilrate"].shift(1)

    data = data.dropna()
    self.d = data

    target = data["gs_l_realinv"]
    with_lint = add_constant(data[["gs_l_realgdp", "lint"]])
    with_tbill = add_constant(data[["gs_l_realgdp", "tbilrate"]])
    gdp_only = add_constant(data[["gs_l_realgdp"]])

    self.res = OLS(target, with_lint).fit()
    self.res2 = OLS(target, with_tbill).fit()
    self.res3 = OLS(target, gdp_only).fit()

    first_model = self.res.model
    self.endog = first_model.endog
    self.exog = first_model.exog
def test_add_constant_has_constant2d(self):
    """Check each ``has_constant`` mode on a 2d array with a ones column."""
    rows = [[1, 1, 1, 1], [1, 2, 3, 4.0]]
    x = np.asarray(rows).T

    # "skip": the input is returned without an extra column
    skipped = tools.add_constant(x, has_constant="skip")
    assert_equal(x, skipped)

    # "raise": an existing constant is an error
    assert_raises(ValueError, tools.add_constant, x, has_constant="raise")

    # "add": a ones column is prepended regardless
    added = tools.add_constant(x, has_constant="add")
    assert_equal(added, np.column_stack((np.ones(4), x)))
def test_add_constant_has_constant1d(self):
    """Check each ``has_constant`` mode on a 1d vector of ones."""
    skipped = tools.add_constant(np.ones(5), has_constant="skip")
    assert_equal(skipped, np.ones(5))

    # the remaining checks reuse the result of the "skip" call
    assert_raises(ValueError, tools.add_constant, skipped,
                  has_constant="raise")

    added = tools.add_constant(skipped, has_constant="add")
    assert_equal(added, np.ones((5, 2)))
def notyet_atst():
    # Draft comparison of OLS diagnostics on the macro data against stored
    # reference values. Only the Goldfeld-Quandt result is actually checked;
    # the other reference dicts/arrays below are defined but not used yet.
    # NOTE(review): the name suggests this is not yet an active test - confirm.
    d = macrodata.load().data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint],prepend=True)
    # regression in levels
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    # drop the last observation so lengths match the differenced series
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint], prepend=True)
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate], prepend=True)

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data

    params = np.array([-272.3986041341653, 0.1779455206941112,
                       0.2149432424658157])
    # flat values are reshaped column-major (order='F') into 3x3 matrices
    cov_hac_4 = np.array([1321.569466333051, -0.2318836566017612,
                37.01280466875694, -0.2318836566017614, 4.602339488102263e-05,
                -0.0104687835998635, 37.012804668757, -0.0104687835998635,
                21.16037144168061]).reshape(3,3, order='F')
    cov_hac_10 = np.array([2027.356101193361, -0.3507514463299015,
        54.81079621448568, -0.350751446329901, 6.953380432635583e-05,
        -0.01268990195095196, 54.81079621448564, -0.01268990195095195,
        22.92512402151113]).reshape(3,3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47,
                          pvalue=2.154956842194898e-33, distr='f')

    # NOTE(review): split=0.5 is compared with het_gq_greater, not with
    # het_gq_greater_2 ("fraction = 0.5") - confirm which reference is meant.
    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')

    harvey_collier = dict(stat=2.28042114041313, df=199,
                          pvalue=0.02364236161988260, distr='t')
    #hc = harvtest(fm, order.by=ggdp , data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783, df=199,
                          pvalue=0.4531244858006127, distr='t')
def coint(y1, y2, regression="c"):
    """
    This is a simple cointegration test. Uses unit-root test on residuals to
    test for cointegrated relationship

    See Hamilton (1994) 19.2

    Parameters
    ----------
    y1 : array_like, 1d
        first element in cointegrating vector
    y2 : array_like
        remaining elements in cointegrating vector
    regression : str {'c'}
        Included in regression

        * 'c' : Constant

        The values 'nc', 'ct' and 'ctt' pass validation, but a constant is
        only added to `y2` for 'c'.

    Returns
    -------
    coint_t : float
        t-statistic of unit-root test on residuals
    pvalue : float
        MacKinnon's approximate p-value based on MacKinnon (1994)
    crit_value : dict
        Critical values for the test statistic at the 1 %, 5 %, and 10 %
        levels.

    Raises
    ------
    ValueError
        If `regression` is not one of 'c', 'nc', 'ct', 'ctt'.

    Notes
    -----
    The Null hypothesis is that there is no cointegration, the alternative
    hypothesis is that there is cointegrating relationship. If the pvalue is
    small, below a critical size, then we can reject the hypothesis that there
    is no cointegrating relationship.

    P-values are obtained through regression surface approximation from
    MacKinnon 1994.

    References
    ----------
    MacKinnon, J.G. 1994. "Approximate asymptotic distribution functions for
        unit-root and cointegration tests." `Journal of Business and Economic
        Statistics` 12, 167-76.
    """
    regression = regression.lower()
    if regression not in ['c', 'nc', 'ct', 'ctt']:
        # Format the message before constructing the exception; applying
        # ``%`` to the ValueError instance raised a TypeError instead.
        raise ValueError("regression option %s not understood" % regression)
    y1 = np.asarray(y1)
    y2 = np.asarray(y2)
    if regression == 'c':
        y2 = add_constant(y2, prepend=False)
    st1_resid = OLS(y1, y2).fit().resid  # stage one residuals
    # regress the residual on its own first lag (plus a constant)
    lgresid_cons = add_constant(st1_resid[0:-1], prepend=False)
    uroot_reg = OLS(st1_resid[1:], lgresid_cons).fit()
    # t-statistic for the hypothesis that the lag coefficient equals 1
    coint_t = (uroot_reg.params[0] - 1) / uroot_reg.bse[0]
    # NOTE(review): the p-value and critical values are always requested for
    # regression="c", whatever the ``regression`` argument is - confirm.
    pvalue = mackinnonp(coint_t, regression="c", N=2, lags=None)
    crit_value = mackinnoncrit(N=1, regression="c", nobs=len(y1))
    return coint_t, pvalue, crit_value
def test_add_constant_has_constant1d(self):
    """Check each ``has_constant`` mode, starting from a 1d vector of ones."""
    ones_col = tools.add_constant(np.ones(5), has_constant='skip')
    assert_equal(ones_col, np.ones((5, 1)))

    # the remaining checks reuse the result of the 'skip' call
    assert_raises(ValueError, tools.add_constant, ones_col,
                  has_constant='raise')

    with_const = tools.add_constant(ones_col, has_constant='add')
    assert_equal(with_const, np.ones((5, 2)))
def test_add_constant_has_constant2d(self):
    """Check each ``has_constant`` mode on a 2d array with a ones column."""
    data = [[1, 1, 1, 1], [1, 2, 3, 4.]]
    x = np.asarray(data).T

    # 'skip' leaves the array as it is
    result_skip = tools.add_constant(x, has_constant='skip')
    assert_equal(x, result_skip)

    # 'raise' rejects an input that already has a constant
    with pytest.raises(ValueError):
        tools.add_constant(x, has_constant='raise')

    # 'add' prepends another ones column
    result_add = tools.add_constant(x, has_constant='add')
    assert_equal(result_add, np.column_stack((np.ones(4), x)))
def test_wls_tss():
    """Centered TSS of a weighted fit must match OLS on the repeated rows.

    The WLS data holds each distinct row once, with the weights equal to
    the number of times that row appears in the OLS data.
    """
    y_full = np.array([22, 22, 22, 23, 23, 23])
    x_full = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]]
    ols_fit = OLS(y_full, add_constant(x_full, prepend=False)).fit()

    y_unique = np.array([22, 22, 23.])
    x_unique = [[1, 0], [1, 1], [0, 1]]
    counts = np.array([2, 1, 3.])
    wls_fit = WLS(y_unique, add_constant(x_unique, prepend=False),
                  weights=counts).fit()

    assert_equal(ols_fit.centered_tss, wls_fit.centered_tss)
def test_summarycol(self):
    # Test for latex output of summary_col object
    # NOTE(review): the line breaks and column padding inside ``desired``
    # were reconstructed from a whitespace-collapsed copy; they must match
    # the output of ``as_latex()`` character for character - verify.
    desired = r'''
\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lcc}
\hline
& y I & y II \\
\midrule
const & 7.7500 & 12.4231 \\
& (1.1058) & (3.1872) \\
x1 & -0.7500 & -1.5769 \\
& (0.2368) & (0.6826) \\
\hline
\end{tabular}
\end{center}
\end{table}
'''
    x = [1,5,7,3,5]
    x = add_constant(x)
    y1 = [6,4,2,7,4]
    y2 = [8,5,0,12,4]
    # two regressions on the same design, summarized side by side
    reg1 = OLS(y1,x).fit()
    reg2 = OLS(y2,x).fit()
    actual = summary_col([reg1,reg2]).as_latex()
    # wrap in newlines to line up with the triple-quoted literal above
    actual = '\n%s\n' % actual
    assert_equal(desired, actual)
def test_cov_cluster_2groups():
    """Compare cluster-robust standard errors with Petersen's numbers.

    Requires Petersen's ``test_data.txt`` next to this module:
    http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.txt
    """
    import os

    here = os.path.abspath(os.path.dirname(__file__))
    pet = np.genfromtxt(os.path.join(here, "test_data.txt"))

    # columns: 0 = group id, 1 = time id, 2 = regressor, last = response
    group = pet[:, 0].astype(int)
    time = pet[:, 1].astype(int)
    res = OLS(pet[:, -1], add_constant(pet[:, 2])).fit()

    cov01, covg, covt = sw.cov_cluster_2groups(res, group, group2=time)

    # Reference numbers from Petersen
    # http://www.kellogg.northwestern.edu/faculty/petersen/htm/papers/se/test_data.htm
    bse_petw = [0.0284, 0.0284]
    bse_pet0 = [0.0670, 0.0506]
    bse_pet1 = [0.0234, 0.0334]  # year
    bse_pet01 = [0.0651, 0.0536]  # firm and year

    bse_0, bse_1, bse_01 = [sw.se_cov(cov) for cov in (covg, covt, cov01)]

    assert_almost_equal(bse_petw, res.HC0_se, decimal=4)
    assert_almost_equal(bse_0, bse_pet0, decimal=4)
    assert_almost_equal(bse_1, bse_pet1, decimal=4)
    assert_almost_equal(bse_01, bse_pet01, decimal=4)
def pacf_ols(x, nlags=40):
    '''Calculate partial autocorrelations by OLS

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Largest lag for which the pacf is computed.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags+1 elements; the first element is
        the lag-0 value, fixed at 1.

    Notes
    -----
    A separate OLS regression is run for every lag k = 1..nlags; the last
    estimated coefficient of each regression is the lag-k value.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    lagged, original = lagmat(x, nlags, original='sep')
    design = add_constant(lagged)

    # regression k drops the first k rows and uses the first k+1 columns
    higher_lags = [
        OLS(original[k:], design[k:, :k + 1]).fit().params[-1]
        for k in range(1, nlags + 1)
    ]
    return np.array([1.] + higher_lags)
def test_hac_simple():
    """Compare ``cov_hac_simple`` with R ``NeweyWest`` results for 4 lags."""
    from statsmodels.datasets import macrodata

    data = macrodata.load().data
    gdp_growth = 400 * np.diff(np.log(data['realgdp']))
    inv_growth = 400 * np.diff(np.log(data['realinv']))
    design = add_constant(np.c_[gdp_growth, data['realint'][:-1]],
                          prepend=True)
    fit = OLS(inv_growth, design).fit()

    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=TRUE)
    #Lag truncation parameter chosen: 4
    #     (Intercept)                   ggdp                  lint
    cov1_r = [
        [1.40643899878678802, -0.3180328707083329709, -0.060621111216488610],
        [-0.31803287070833292, 0.1097308348999818661, 0.000395311760301478],
        [-0.06062111121648865, 0.0003953117603014895, 0.087511528912470993],
    ]

    #> NeweyWest(fm, lag = 4, prewhite = FALSE, sandwich = TRUE, verbose=TRUE, adjust=FALSE)
    #Lag truncation parameter chosen: 4
    #    (Intercept)                  ggdp                  lint
    cov2_r = [
        [1.3855512908840137, -0.313309610252268500, -0.059720797683570477],
        [-0.3133096102522685, 0.108101169035130618, 0.000389440793564339],
        [-0.0597207976835705, 0.000389440793564336, 0.086211852740503622],
    ]

    # the standard errors returned alongside are not checked here
    cov_corrected, _ = sw.cov_hac_simple(fit, nlags=4, use_correction=True)
    cov_plain, _ = sw.cov_hac_simple(fit, nlags=4, use_correction=False)

    assert_almost_equal(cov_corrected, cov1_r, decimal=14)
    assert_almost_equal(cov_plain, cov2_r, decimal=14)
def mglm_Levenberg(y, design, dispersion=0, offset=0, coef_start=None,
                   start_method='null'):
    """
    Fit genewise negative binomial glms with log-link using Levenberg
    dampening for convergence.

    NOTE(review): this port is unfinished. Only the starting values are
    computed; nothing is returned, and the Levenberg iterations are missing.

    Parameters
    ----------
    y : matrix
    design : dataframe
        A constant column is added with ``add_constant``.
    dispersion : optional
        Not used yet.
    offset : optional
    coef_start : array_like, optional
        Starting coefficients. When omitted (or empty), starting values are
        derived according to `start_method`.
    start_method : {'null', 'y'}
        How starting values are derived when `coef_start` is not given.
        Any other value raises IndexError.

    Adapted from Gordon Smyth's and Yunshun Chen's algorithm in R.
    """
    design = add_constant(design)

    # ``not coef_start`` raises ValueError for arrays with more than one
    # element, so "no starting values" is tested explicitly instead.
    has_start = coef_start is not None and np.size(coef_start) > 0

    if has_start:
        coef_start = asarray(coef_start)
        beta = coef_start.T
    else:
        # keeps the original validation: unknown methods raise IndexError
        start_method = [i for i in ['null', 'y'] if i == start_method][0]
        if start_method == 'null':
            N = exp(offset)
            beta_mean = np.log(np.average(y, axis=1, weights=offset))
        else:
            # Built-in min of two scalars: np.min's second positional
            # argument is an axis, not a second operand. 1.0 / 6 avoids
            # integer division on Python 2.
            delta = min(np.max(y), 1.0 / 6)
            y1 = np.maximum(y, delta)  # elementwise maximum, like R's pmax
            # NOTE(review): ``lstsq`` comes from outside this block; if it is
            # numpy/scipy ``lstsq`` it returns a tuple and has no ``.fit()``
            # or ``.params`` - confirm the intended estimator.
            fit = lstsq(design, np.log(y1 - offset)).fit()
            beta = fit.params
            mu = np.exp(beta + offset)
def setup_class(cls):
    """Fit OLS of investment growth on GDP growth and ``realint``.

    The constant is appended as the last column; the result is stored as
    ``cls.res1``.
    """
    data = macrodata.load().data
    gdp_growth = 400 * np.diff(np.log(data['realgdp']))
    inv_growth = 400 * np.diff(np.log(data['realinv']))
    # drop the last observation so the length matches the differenced series
    regressors = np.c_[gdp_growth, data['realint'][:-1]]
    design = add_constant(regressors, prepend=False)
    res_ols = OLS(inv_growth, design).fit()
    cls.res1 = res_ols
def test_poisson_residuals():
    """Residuals of a Poisson GLM: ``exposure`` versus ``var_weights``.

    Fitting counts with an exposure and fitting rates (counts / exposure)
    with the exposure as variance weights must give matching residuals.
    """
    nobs, k_exog = 100, 5
    np.random.seed(987125)
    x = add_constant(np.random.randn(nobs, k_exog - 1))

    y_true = x.sum(1) / 2
    # not used below, but the draw keeps the random stream unchanged
    y = y_true + 2 * np.random.randn(nobs)
    exposure = 1 + np.arange(nobs) // 4

    yp = np.random.poisson(np.exp(y_true) * exposure)
    yp[10:15] += 10

    fam = sm.families.Poisson()
    res_poi_e = GLM(yp, x, family=fam, exposure=exposure).fit()
    res_poi_w = GLM(yp / exposure, x, family=fam,
                    var_weights=exposure).fit()

    # response residuals are on the count scale in the exposure model
    assert_allclose(res_poi_e.resid_response / exposure,
                    res_poi_w.resid_response)
    assert_allclose(res_poi_e.resid_pearson, res_poi_w.resid_pearson)
    assert_allclose(res_poi_e.resid_deviance, res_poi_w.resid_deviance)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        assert_allclose(res_poi_e.resid_anscombe, res_poi_w.resid_anscombe)
    assert_allclose(res_poi_e.resid_anscombe_unscaled,
                    res_poi_w.resid_anscombe)
def setupClass(cls):
    """Fit GLS and OLS on the Longley data for comparison.

    ``cls.res1`` holds the GLS result and ``cls.res2`` the OLS result.
    """
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)

    ols_fit = OLS(dataset.endog, dataset.exog).fit()
    gls_fit = GLS(dataset.endog, dataset.exog).fit()

    cls.res1, cls.res2 = gls_fit, ols_fit
def test_missing(self):
    """``missing='drop'`` must drop the three rows with a NaN response."""
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    dataset.endog[[3, 7, 14]] = np.nan

    model = OLS(dataset.endog, dataset.exog, missing='drop')

    n_endog = model.endog.shape[0]
    n_exog = model.exog.shape[0]
    assert_equal(n_endog, 13)
    assert_equal(n_exog, 13)
def test_pandas_const_df_prepend():
    """A prepended constant is column 0, named "const", with zero variance."""
    frame = longley.load_pandas().exog
    # regression test for #1025
    frame["UNEMP"] /= frame["UNEMP"].std()

    frame = tools.add_constant(frame, prepend=True)

    first_name = frame.columns[0]
    assert_string_equal("const", first_name)
    assert_equal(frame.var(0)[0], 0)
def setupClass(cls):
    """Store the t-test of coefficient 5 minus coefficient 6 as ``Ttest1``.

    The model is an OLS fit on the Longley data with the constant last.
    """
    # contrast: +1 on index 4, -1 on index 5, zero elsewhere
    contrast = np.zeros(7)
    contrast[4] = 1
    contrast[5] = -1

    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    fit = OLS(dataset.endog, dataset.exog).fit()

    cls.Ttest1 = fit.t_test(contrast)
def calculateStat(y, x):
    """Cointegration signal and OLS line of ``y`` on ``x``.

    Returns
    -------
    tuple
        ``(signal, b0, b1, rmse)``: ``signal`` is 1 when the second element
        of the ``coint`` result is below 0.05 and 0 otherwise; ``b0`` and
        ``b1`` are the first two fitted parameters; ``rmse`` is the square
        root of the residual mean squared error.
    """
    coint_result = coint(y, x)
    signal = int(coint_result[1] < 0.05)

    fit = OLS(y, add_constant(x)).fit()
    b0 = float(fit.params[0])
    b1 = float(fit.params[1])
    rmse = float(math.sqrt(fit.mse_resid))

    # returns bo,b1,rmse
    return (signal, b0, b1, rmse)
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Draws a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot shows the effect of one regressor on the response while
    taking the other independent variables into account. The partial
    residuals plot is Residuals + B_i*X_i versus X_i. The component adds
    B_i*X_i versus X_i to show where the fitted line would lie. Take care
    if X_i is highly correlated with any other independent variable: the
    variance visible in the plot then underestimates the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    regressor = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    component = regressor * results._results.params[exog_idx]

    # component plus residual, as points
    ax.plot(regressor, component + results.resid, 'o')

    # the component line is recovered by regressing it on the regressor
    from statsmodels.tools.tools import add_constant
    line_fit = OLS(component, add_constant(regressor)).fit()
    fig = abline_plot(*line_fit.params, ax=ax)
    #ax.plot(x1, x1beta, '-')

    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def test_const_indicator():
    """R-squared: full dummy set with ``hasconst`` vs. constant + dummies."""
    np.random.seed(12345)
    codes = np.random.randint(0, 3, size=30)
    dummies = categorical(codes, drop=True)
    y = np.dot(dummies, [1., 2., 3.]) + np.random.normal(size=30)

    # explicit constant, first dummy column left out
    reduced_design = add_constant(dummies[:, 1:], prepend=True)
    fit_with_const = OLS(y, reduced_design).fit()
    # all dummy columns, constant declared as implied
    fit_indicator = OLS(y, dummies, hasconst=True).fit()

    assert_almost_equal(fit_with_const.rsquared, fit_indicator.rsquared, 12)
def test_add_constant_dataframe(self):
    """``add_constant`` on a mixed-dtype frame inserts a leading 'const'."""
    records = [[1.0, 'a', 4], [2.0, 'bc', 9], [3.0, 'def', 16]]
    frame = pd.DataFrame(records)
    result = tools.add_constant(frame)

    # the new column is a float Series of ones called 'const'
    const_col = pd.Series([1.0, 1.0, 1.0], name='const')
    assert_series_equal(const_col, result['const'])

    # and the whole frame equals the input with 'const' inserted first
    reference = frame.copy()
    reference.insert(0, 'const', np.ones(3))
    assert_frame_equal(reference, result)
def setupClass(cls):
    """Store one joint F-test in matrix form and in string form.

    ``cls.Ftest1`` uses a restriction matrix, ``cls.NewFtest1`` the
    hypothesis string; both come from one OLS fit on the Longley data.
    """
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    fit = OLS(dataset.endog, dataset.exog).fit()

    restrictions = [
        [0, 1, -1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, -1, 0],
    ]
    cls.Ftest1 = fit.f_test(restrictions)

    hypothesis = 'x2 = x3, x5 = x6'
    cls.NewFtest1 = fit.f_test(hypothesis)
def setupClass(cls):
    """Store t-tests of every coefficient, in matrix and in string form.

    ``cls.res1`` is the OLS fit on the Longley data (constant last),
    ``cls.Ttest`` uses a 7x7 identity restriction matrix and
    ``cls.NewTTest`` the matching hypothesis string.
    """
    dataset = longley.load()
    dataset.exog = add_constant(dataset.exog, prepend=False)
    cls.res1 = OLS(dataset.endog, dataset.exog).fit()

    cls.Ttest = cls.res1.t_test(np.eye(7))

    hypothesis = 'x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0'
    cls.NewTTest = cls.res1.t_test(hypothesis)
def setupClass(cls):
    """Fit WLS with a scalar weight and with the same weight as a list.

    ``cls.res1`` uses ``weights=1./3``; ``cls.res2`` uses a list repeating
    that value once per observation.
    """
    from statsmodels.datasets.longley import load

    dataset = load()
    dataset.exog = add_constant(dataset.exog, prepend=True)

    fit_scalar = WLS(dataset.endog, dataset.exog, weights=1./3).fit()

    repeated = [1/3.] * len(dataset.endog)
    fit_array = WLS(dataset.endog, dataset.exog, weights=repeated).fit()

    cls.res1, cls.res2 = fit_scalar, fit_array
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.
    fmt : str, optional
        Line format string passed to `plot`.

    Raises
    ------
    ValueError
        If `line` is not '45' and either `x` or `y` is None.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        # list(...) because a zip object cannot be indexed on Python 3
        end_pts = list(zip(ax.get_xlim(), ax.get_ylim()))
        # smallest lower limit and largest upper limit of the two axes
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?

    # every remaining option needs both coordinates; ``and`` let a single
    # missing argument slip through to a later, obscure failure
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")

    # accept plain sequences as well as arrays
    x = np.asarray(x)
    y = np.asarray(y)

    if line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        # line through the (theoretical, sample) quartile pairs
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
def test_wls_example():
    """WLS docstring example, checked against R ``lm`` summary values."""
    #example from the docstring, there was a note about a bug, should
    #be fixed now
    response = [1, 3, 4, 5, 2, 3, 4]
    design = add_constant(lrange(1, 8), prepend=False)
    fit = WLS(response, design, weights=lrange(1, 8)).fit()

    #taken from R lm.summary
    expected_fvalue = 0.127337843215
    expected_scale = 2.44608530786**2
    assert_almost_equal(fit.fvalue, expected_fvalue, 6)
    assert_almost_equal(fit.scale, expected_scale, 6)
def test_add_constant_has_constant2d(self):
    """``add_constant`` with default options must return this array as is."""
    rows = [[1, 1, 1, 1], [1, 2, 3, 4.]]
    original = np.asarray(rows)
    result = tools.add_constant(original)
    assert_equal(original, result)
def test_pandas_const_series_prepend():
    """A constant prepended to a Series is column 0, named 'const'."""
    dataset = longley.load_pandas()
    gnp = dataset.exog['GNP']

    with_const = tools.add_constant(gnp, prepend=True)

    first_name = with_const.columns[0]
    assert_string_equal('const', first_name)
    # a constant column has zero variance
    assert_equal(with_const.var(0)[0], 0)
# Persist the Granger-causality results, then plot one histogram per model
# family.
# NOTE(review): statement nesting below was reconstructed from a
# whitespace-collapsed copy - confirm the loop body boundaries.
with open('results_Granger1', 'wb') as f:
    pickle.dump(results_Granger1, f)

plot_hist(results_NN1, lags)
plot_hist(results_LSTM1, lags)
plot_hist(results_GRU1, lags)

#%% AR models performance on test set
from statsmodels.tsa.tsatools import lagmat2ds
from statsmodels.tools.tools import add_constant

#results_Granger1 = grangercausalitytests(data[:7000,:],lags,verbose=False)

# For every lag order, evaluate the two stored models on rows 7000 onward.
# Presumably that is the held-out part: the commented-out fit above used
# data[:7000] - verify.
for l in lags:
    # element [1] of each entry holds the two fitted models
    mdl1 = results_Granger1[l][1][0]
    mdl2 = results_Granger1[l][1][1]

    data_gr = lagmat2ds(data[7000:, :], l, trim="both", dropex=1)
    # columns 1..l feed the first model; all columns after the first feed
    # the second one (constant appended last in both cases)
    dtaown = add_constant(data_gr[:, 1:(l + 1)], prepend=False)
    dtajoint = add_constant(data_gr[:, 1:], prepend=False)

    x_pred1 = mdl1.predict(dtaown)
    x_pred2 = mdl2.predict(dtajoint)

    # targets start l rows later because lagging trims the first l rows
    error1 = x_pred1 - data[7000 + l:, 0]
    error2 = x_pred2 - data[7000 + l:, 0]

    # residual sums of squares of both models
    rss_x1 = sum(error1**2)
    rss_x2 = sum(error2**2)

    RSS1['Granger'][l] = rss_x1
    RSS2['Granger'][l] = rss_x2

    print('RSS1 = %0.2f' % rss_x1)
    print('RSS2 = %0.2f' % rss_x2)

    # one-sided Wilcoxon test on the absolute errors of the two models
    S, p_value = stats.wilcoxon(np.abs(error1), np.abs(error2),
                                alternative='greater')
    print(p_value)
def notyet_atst():
    # FIXME: make this a test or move/remove
    # Draft comparison of OLS diagnostics on the macro data against stored
    # reference values. Only the Goldfeld-Quandt result is actually checked;
    # the other reference dicts/arrays below are defined but not used yet.
    d = macrodata.load(as_pandas=False).data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint])
    # regression in levels
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    # drop the last observation so lengths match the differenced series
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint])
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data

    params = np.array(
        [-272.3986041341653, 0.1779455206941112, 0.2149432424658157])
    # flat values are reshaped column-major (order='F') into 3x3 matrices
    cov_hac_4 = np.array([
        1321.569466333051, -0.2318836566017612, 37.01280466875694,
        -0.2318836566017614, 4.602339488102263e-05, -0.0104687835998635,
        37.012804668757, -0.0104687835998635, 21.16037144168061
    ]).reshape(3, 3, order='F')
    cov_hac_10 = np.array([
        2027.356101193361, -0.3507514463299015, 54.81079621448568,
        -0.350751446329901, 6.953380432635583e-05, -0.01268990195095196,
        54.81079621448564, -0.01268990195095195, 22.92512402151113
    ]).reshape(3, 3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082, df1=99, df2=98,
                          pvalue=1.246141976112324e-30, distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082, df1=99, df2=98,
                         pvalue=1.246141976112324e-30, distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124, df1=48, df2=47,
                            pvalue=2.154956842194898e-33, distr='f')

    # NOTE(review): split=0.5 is compared with het_gq_greater, not with
    # het_gq_greater_2 ("fraction = 0.5") - confirm which reference is meant.
    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')

    harvey_collier = dict(stat=2.28042114041313, df=199,
                          pvalue=0.02364236161988260, distr='t')
    #hc = harvtest(fm, order.by=ggdp , data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783, df=199,
                            pvalue=0.4531244858006127, distr='t')
import pandas as pd import statsmodels.api as sm from statsmodels.stats.outliers_influence import variance_inflation_factor from statsmodels.tools.tools import add_constant import numpy as np import plotly.graph_objs as go QUANT_FEATURES = np.array([ 'age', 'mri_baseline', 'mri_dac', 'mri_interreg', 'mri_presurg', 'rfs', 'rcb' ]) df = pd.read_csv('clean_data.csv', index_col=[0]) X = add_constant(df.drop('rfs', axis=1)) res = pd.Series( [variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns) res = res[res != np.inf] res = res.sort_values(ascending=False)[:8] values = np.around(res.values, 2) values = np.hstack((np.array(res.index).reshape(-1, 1), values.reshape(-1, 1))).T layout = go.Layout(autosize=True, margin={'l': 0, 'r': 0, 't': 20, 'b': 0}) fig = go.Figure(layout=layout, data=[ go.Table(header=dict(values=['', 'VIF'],
def fit_arparams_iter(outputs, inputs, p, q, r, l2_reg=0.0):
  """Iterative regression for estimating AR params in ARMAX(p, q, r) model.

  The iterative AR regression process provides consistent estimates for the
  AR parameters of an ARMAX(p, q, r) model after q iterative steps.

  It first fits an ARMAX(p, 0, r) model with least squares regression, then
  ARMAX(p, 1, r), and so on, ..., til ARMAX(p, q, r). At the i-th step, it
  fits an ARMAX(p, i, r) model, according to estimated error terms from the
  previous step.

  For description of the iterative regression method, see Section 2 of
  `Consistent Estimates of Autoregressive Parameters and Extended Sample
  Autocorrelation Function for Stationary and Nonstationary ARMA Models` at
  https://www.jstor.org/stable/2288340.

  The implementation here is a generalization of the method mentioned in the
  paper. We adapt the method for multidimensional outputs, exogenous inputs,
  nan handling, and also add regularization on the MA parameters.

  Args:
    outputs: Array with the output values from the LDS, nans allowed. Indexed
      as 2-d (``outputs.shape[1]`` is the number of output dimensions).
    inputs: Array with exogenous inputs values, nans allowed. Could be None.
    p: AR order, i.e. max lag of the autoregressive part.
    q: MA order, i.e. max lag of the error terms.
    r: Max lag of the exogenous inputs.
    l2_reg: L2 regularization coefficient, to be applied on MA coefficients.

  Returns:
    Fitted AR coefficients. For multi-dimensional outputs this is the mean
    of the coefficients fitted on each output dimension separately.

  Raises:
    ValueError: If the design matrix has fewer columns than parameters, or
      the final fit does not have the expected number of parameters.
  """
  if outputs.shape[1] > 1:
    # If there are multiple output dimensions, fit autoregressive params on
    # each dimension separately and average.
    params_list = [
        fit_arparams_iter(outputs[:, j:j+1], inputs, p, q, r, l2_reg=l2_reg) \
        for j in xrange(outputs.shape[1])]
    return np.mean(
        np.concatenate([a.reshape(1, -1) for a in params_list]), axis=0)
  # We include a constant term in regression.
  k_const = 1
  # Input dim. If inputs is None, then in_dim = 0.
  in_dim = 0
  if inputs is not None:
    in_dim = inputs.shape[1]
    # Lag the inputs to obtain [?, r], column j means series x_{t-j}.
    # Use trim to drop rows with unknown values both at beginning and end.
    lagged_in = np.concatenate(
        [lagmat(inputs[:, i], maxlag=r, trim='both') for i in xrange(in_dim)],
        axis=1)
    # Since we trim in beginning, the offset is r.
    lagged_in_offset = r
  # Lag the series itself to p-th order.
  lagged_out = lagmat(outputs, maxlag=p, trim='both')
  lagged_out_offset = p
  y = outputs
  y_offset = 0
  # Estimated residuals, initialized to 0.
  res = np.zeros_like(outputs)
  for i in xrange(q + 1):
    # Lag the residuals to i-th order in i-th iteration.
    lagged_res = lagmat(res, maxlag=i, trim='both')
    # `res` from the previous step starts at the previous y_offset, so its
    # lagged version starts i rows later.
    lagged_res_offset = y_offset + i
    # Compute offset in regression, since lagged_in, lagged_out, and lagged_res
    # have different offsets. Align them.
    if inputs is None:
      y_offset = max(lagged_out_offset, lagged_res_offset)
    else:
      y_offset = max(lagged_out_offset, lagged_res_offset, lagged_in_offset)
    y = outputs[y_offset:, :]
    # Concatenate all variables in regression.
    x = np.concatenate([
        lagged_out[y_offset - lagged_out_offset:, :],
        lagged_res[y_offset - lagged_res_offset:, :]
    ], axis=1)
    if inputs is not None:
      x = np.concatenate([lagged_in[y_offset - lagged_in_offset:, :], x],
                         axis=1)
    # Add constant term as the first variable.
    # Column order is now: constant, lagged inputs, lagged outputs, lagged
    # residuals.
    x = add_constant(x, prepend=True)
    if x.shape[1] < k_const + in_dim * r + p + i:
      raise ValueError('Insufficient sequence length for model fitting.')
    # Drop rows with nans.
    arr = np.concatenate([y, x], axis=1)
    arr = arr[~np.isnan(arr).any(axis=1)]
    y_dropped_na = arr[:, 0:1]
    x_dropped_na = arr[:, 1:]
    # Only regularize the MA part.
    alpha = np.concatenate(
        [np.zeros(k_const + in_dim * r + p), l2_reg * np.ones(i)], axis=0)
    # When L1_wt = 0, it's ridge regression.
    olsfit = OLS(y_dropped_na, x_dropped_na).fit_regularized(
        alpha=alpha, L1_wt=0.0)
    # Update estimated residuals.
    # Residuals are computed on all rows of y and x, including rows that were
    # dropped from the fit because of nans.
    res = y - np.matmul(x, olsfit.params.reshape(-1, 1))
  if len(olsfit.params) != k_const + in_dim * r + p + q:
    raise ValueError('Expected param len %d, got %d.'
                     % (k_const + in_dim * r + p + q, len(olsfit.params)))
  # The AR block sits right before the q trailing MA coefficients; with q == 0
  # the slice [-(p + q):-q] would be empty, hence the special case.
  if q == 0:
    return olsfit.params[-p:]
  return olsfit.params[-(p + q):-q]
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Draws a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot shows the effect of one regressor on the response while
    taking the other independent variables into account. The partial
    residuals plot is Residuals + B_i*X_i versus X_i. The component adds
    B_i*X_i versus X_i to show where the fitted line would lie. Take care
    if X_i is highly correlated with any other independent variable: the
    variance visible in the plot then underestimates the true variance.

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and
    rate of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    regressor = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    component = regressor * results.params[exog_idx]

    # component plus residual, as points
    ax.plot(regressor, component + results.resid, 'o')

    # the component line is recovered by regressing it on the regressor
    from statsmodels.tools.tools import add_constant
    line_fit = OLS(component, add_constant(regressor)).fit()
    fig = abline_plot(*line_fit.params, ax=ax)
    #ax.plot(x1, x1beta, '-')

    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
def qqline(ax, line, x=None, y=None, dist=None, fmt="r-", **lineoptions):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {"45","r","s","q"}
        Options for the reference line to which the data is compared.:

        - "45" - 45-degree line
        - "s"  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - "r"  - A regression line is fit
        - "q"  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : ndarray
        X data for plot. Not needed if line is "45".
    y : ndarray
        Y data for plot. Not needed if line is "45".
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is "q".
    fmt : str, optional
        Line format string passed to `plot`. It is split into a linestyle,
        a marker and a color; each is only used as a default, so explicit
        ``linestyle``/``marker``/``color`` entries in `lineoptions` win.
    **lineoptions
        Additional arguments to be passed to the `plot` command.

    Raises
    ------
    ValueError
        If `line` is not "45" and `x` or `y` is None.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on
    x-axis and household income on y-axis.  Use qqline to add regression line
    into the plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load(as_pandas=False)
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, "r", x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    # Work on a copy so the caller's keyword dict is never modified.
    lineoptions = lineoptions.copy()

    # Two-character styles must be tested before "-": otherwise "--" and "-."
    # are both recognised as a solid line (and the left-over "." of "-." is
    # then mistaken for a point marker).
    for ls in ("--", "-.", "-", ":"):
        if ls in fmt:
            lineoptions.setdefault("linestyle", ls)
            fmt = fmt.replace(ls, "")
            break
    for marker in (
        ".",
        ",",
        "o",
        "v",
        "^",
        "<",
        ">",
        "1",
        "2",
        "3",
        "4",
        "8",
        "s",
        "p",
        "P",
        "*",
        "h",
        "H",
        "+",
        "x",
        "X",
        "D",
        "d",
        "|",
        "_",
    ):
        if marker in fmt:
            lineoptions.setdefault("marker", marker)
            fmt = fmt.replace(marker, "")
            break
    # Whatever is left of the format string is treated as the color.
    if fmt:
        lineoptions.setdefault("color", fmt)

    if line == "45":
        # Span the union of the current x and y limits so the diagonal
        # covers the whole visible area, then make both axes share it.
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, **lineoptions)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?

    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")

    x = np.array(x)
    y = np.array(y)
    if line == "r":
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are "clean"
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, **lineoptions)
    elif line == "s":
        m, b = np.std(y), np.mean(y)
        ref_line = x * m + b
        ax.plot(x, ref_line, **lineoptions)
    elif line == "q":
        _check_for(dist, "ppf")
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        # Line through the (theoretical, sample) first and third quartiles.
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, **lineoptions)
def test_add_constant_has_constant1d(self):
    """``add_constant`` on a 1-d vector of ones must compare equal to that vector."""
    ones = np.ones(5)
    augmented = tools.add_constant(ones)
    assert_equal(augmented, np.ones(5))
def statespace(endog, exog=None, order=(0, 0, 0),
               seasonal_order=(0, 0, 0, 0), include_constant=True,
               enforce_stationarity=True, enforce_invertibility=True,
               concentrate_scale=False, start_params=None, fit_kwargs=None):
    """
    Estimate SARIMAX parameters using state space methods.

    Parameters
    ----------
    endog : array_like
        Input time series array.
    exog : array_like, optional
        Exogenous regressors passed on to the specification and the state
        space model. Default is None.
    order : tuple, optional
        The (p,d,q) order of the model for the number of AR parameters,
        differences, and MA parameters. Default is (0, 0, 0).
    seasonal_order : tuple, optional
        The (P,D,Q,s) order of the seasonal component of the model for the
        AR parameters, differences, MA parameters, and periodicity. Default
        is (0, 0, 0, 0).
    include_constant : bool, optional
        Whether to add a constant term in `exog` if it's not already there.
        The estimate of the constant will then appear as one of the `exog`
        parameters. If `exog` is None, then the constant will represent the
        mean of the process.
    enforce_stationarity : bool, optional
        Whether or not to transform the AR parameters to enforce stationarity
        in the autoregressive component of the model. Default is True.
    enforce_invertibility : bool, optional
        Whether or not to transform the MA parameters to enforce invertibility
        in the moving average component of the model. Default is True.
    concentrate_scale : bool, optional
        Whether or not to concentrate the scale (variance of the error term)
        out of the likelihood. This reduces the number of parameters estimated
        by maximum likelihood by one.
    start_params : array_like, optional
        Initial guess of the solution for the loglikelihood maximization. The
        AR polynomial must be stationary. If `enforce_invertibility=True` the
        MA polynomial must be invertible. If not provided, default starting
        parameters are computed using the Hannan-Rissanen method.
    fit_kwargs : dict, optional
        Arguments to pass to the state space model's `fit` method. The dict
        is copied, so the caller's object is not modified; ``disp=0`` is used
        unless `disp` is given.

    Returns
    -------
    parameters : SARIMAXParams object
    other_results : Bunch
        Includes two components, `spec`, containing the `SARIMAXSpecification`
        instance corresponding to the input arguments; and
        `statespace_results`, corresponding to the results from the underlying
        state space model and Kalman filter / smoother.

    Raises
    ------
    ValueError
        If `start_params` imply a non-stationary AR process while
        stationarity is enforced, or a non-invertible MA process while
        invertibility is enforced.

    Notes
    -----
    The primary reference is [1]_.

    References
    ----------
    .. [1] Durbin, James, and Siem Jan Koopman. 2012.
       Time Series Analysis by State Space Methods: Second Edition.
       Oxford University Press.
    """
    # Handle including the constant (need to do it now so that the constant
    # parameter can be included in the specification as part of `exog`.)
    if include_constant:
        exog = np.ones_like(endog) if exog is None else add_constant(exog)

    # Create the specification
    spec = SARIMAXSpecification(
        endog, exog=exog, order=order, seasonal_order=seasonal_order,
        enforce_stationarity=enforce_stationarity,
        enforce_invertibility=enforce_invertibility,
        concentrate_scale=concentrate_scale)
    endog = spec.endog
    exog = spec.exog
    p = SARIMAXParams(spec=spec)

    # Check start parameters
    if start_params is not None:
        sp = SARIMAXParams(spec=spec)
        sp.params = start_params

        if spec.enforce_stationarity and not sp.is_stationary:
            raise ValueError('Given starting parameters imply a non-stationary'
                             ' AR process with `enforce_stationarity=True`.')

        if spec.enforce_invertibility and not sp.is_invertible:
            raise ValueError('Given starting parameters imply a non-invertible'
                             ' MA process with `enforce_invertibility=True`.')

    # Create and fit the state space model
    mod = SARIMAX(endog, exog=exog, order=spec.order,
                  seasonal_order=spec.seasonal_order,
                  enforce_stationarity=spec.enforce_stationarity,
                  enforce_invertibility=spec.enforce_invertibility,
                  concentrate_scale=spec.concentrate_scale)
    # Copy before inserting the default so the caller's dict is left untouched.
    fit_kwargs = {} if fit_kwargs is None else dict(fit_kwargs)
    fit_kwargs.setdefault('disp', 0)
    res_ss = mod.fit(start_params=start_params, **fit_kwargs)

    # Construct results
    p.params = res_ss.params
    res = Bunch({
        'spec': spec,
        'statespace_results': res_ss,
    })

    return p, res
def grangercausalitytests(x, maxlag, addconst=True, verbose=True):
    '''four tests for granger non causality of 2 timeseries

    all four tests give similar results
    `params_ftest` and `ssr_ftest` are equivalent based on F test which is
    identical to lmtest:grangertest in R

    Parameters
    ----------
    x : array, 2d, (nobs,2)
        data for test whether the time series in the second column Granger
        causes the time series in the first column
    maxlag : integer
        the Granger causality test results are calculated for all lags up to
        maxlag
    addconst : bool
        include a constant in both regressions. Only True is implemented;
        False raises ValueError.
    verbose : bool
        print results if true

    Returns
    -------
    results : dictionary
        all test results, dictionary keys are the number of lags. For each
        lag the values are a tuple, with the first element a dictionary with
        teststatistic, pvalues, degrees of freedom, the second element are
        the OLS estimation results for the restricted model, the unrestricted
        model and the restriction (contrast) matrix for the parameter f_test.

    Raises
    ------
    ValueError
        If `addconst` is False (not implemented).

    Notes
    -----
    TODO: convert to class and attach results properly

    The Null hypothesis for grangercausalitytests is that the time series in
    the second column, x2, does NOT Granger cause the time series in the first
    column, x1. Granger causality means that past values of x2 have a
    statistically significant effect on the current value of x1, taking past
    values of x1 into account as regressors. We reject the null hypothesis
    that x2 does not Granger cause x1 if the pvalues are below a desired size
    of the test.

    The null hypothesis for all four test is that the coefficients
    corresponding to past values of the second time series are zero.

    'params_ftest', 'ssr_ftest' are based on F distribution

    'ssr_chi2test', 'lrtest' are based on chi-square distribution

    References
    ----------
    http://en.wikipedia.org/wiki/Granger_causality
    Greene: Econometric Analysis

    '''
    from scipy import stats  # lazy import

    resli = {}

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            # Single-argument print calls behave the same on Python 2 and 3.
            print('\nGranger Causality')
            print('number of lags (no zero) %s' % mlg)
        mxlg = mlg

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        if not addconst:
            raise ValueError('Not Implemented')

        # add constant; own lags only (restricted) vs. own + other lags
        dtaown = add_constant(dta[:, 1:mxlg + 1], prepend=False)
        dtajoint = add_constant(dta[:, 1:], prepend=False)

        # run ols on both models without and with lags of second variable
        res2down = OLS(dta[:, 0], dtaown).fit()
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        # for ssr based tests see:
        # http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        # the other tests are made-up

        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) / res2djoint.ssr / (mxlg)
                * res2djoint.df_resid)
        fgc1_pvalue = stats.f.sf(fgc1, mxlg, res2djoint.df_resid)
        if verbose:
            print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (fgc1, fgc1_pvalue, res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1, fgc1_pvalue, res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (ch2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        fgc2_pvalue = stats.chi2.sf(fgc2, mxlg)
        if verbose:
            print('ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (fgc2, fgc2_pvalue, mxlg))
        result['ssr_chi2test'] = (fgc2, fgc2_pvalue, mxlg)

        # likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        lr_pvalue = stats.chi2.sf(lr, mxlg)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                  (lr, lr_pvalue, mxlg))
        result['lrtest'] = (lr, lr_pvalue, mxlg)

        # F test that all lag coefficients of exog are zero: the restriction
        # matrix selects the mxlg columns following the own lags and leaves
        # out the trailing constant.
        rconstr = np.column_stack((np.zeros((mxlg, mxlg)),
                                   np.eye(mxlg, mxlg),
                                   np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' %
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()],
                                  ftres.df_denom, ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
def setup_class(cls):
    """Load the Longley data, append a constant column and build the OLS model."""
    dataset = longley.load(as_pandas=False)
    # prepend=False puts the constant in the last column.
    dataset.exog = add_constant(dataset.exog, prepend=False)
    cls.endog, cls.exog = dataset.endog, dataset.exog
    cls.ols_model = OLS(cls.endog, cls.exog)
def get_VIF(X, target):
    """Return the variance inflation factor of every non-target column of `X`.

    Parameters
    ----------
    X : pandas.DataFrame
        Data containing the predictors and, possibly, the target column.
    target : str
        Name of the column to leave out before computing the VIFs.
        Previously this argument was ignored and a fixed column name was
        dropped instead.

    Returns
    -------
    pandas.Series
        One VIF per column of the constant-augmented predictor matrix,
        indexed by column name (this includes the constant column that
        ``add_constant`` adds).
    """
    predictors = add_constant(X.loc[:, X.columns != target])
    values = predictors.values
    return pd.Series(
        [variance_inflation_factor(values, i)
         for i in range(predictors.shape[1])],
        index=predictors.columns,
    )
def factor_alpha_beta(factor_data,
                      returns=None,
                      demeaned=True,
                      group_adjust=False,
                      equal_weight=False):
    """
    Compute the annualized alpha (excess returns) and beta (market exposure)
    of a factor. A regression is run with the period wise factor universe
    mean return as the independent variable and mean period wise return from
    a portfolio weighted by factor values as the dependent variable.

    Parameters
    ----------
    factor_data : pd.DataFrame - MultiIndex
        A MultiIndex DataFrame indexed by date (level 0) and asset (level 1),
        containing the values for a single alpha factor, forward returns for
        each period, the factor quantile/bin that factor value belongs to, and
        (optionally) the group the asset belongs to.
        - See full explanation in utils.get_clean_factor_and_forward_returns
    returns : pd.DataFrame, optional
        Period wise factor returns. If this is None then it will be computed
        with 'factor_returns' function and the passed flags: 'demeaned',
        'group_adjust', 'equal_weight'. A pd.Series is also accepted and is
        treated as the returns of the first forward-returns period; the
        passed object is not modified.
    demeaned : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_return for a full explanation
    group_adjust : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_return for a full explanation
    equal_weight : bool, optional
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_return for a full explanation

    Returns
    -------
    alpha_beta : pd.DataFrame
        Rows 'Ann. alpha' and 'beta', one column per forward-returns period.
        Both entries are NaN for a period whose regression does not yield
        exactly two parameters.
    """
    if returns is None:
        returns = \
            factor_returns(factor_data, demeaned, group_adjust, equal_weight)

    universe_ret = factor_data.groupby(level='date')[
        utils.get_forward_returns_columns(factor_data.columns)] \
        .mean().loc[returns.index]

    if isinstance(returns, pd.Series):
        # Build a new one-column frame rather than renaming the caller's
        # Series in place.
        returns = returns.to_frame(name=universe_ret.columns.values[0])

    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        x = universe_ret[period].values
        y = returns[period].values
        x = add_constant(x)

        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            # The fit did not produce exactly (constant, slope).
            alpha_beta.loc['Ann. alpha', period] = np.nan
            alpha_beta.loc['beta', period] = np.nan
        else:
            # Number of `period`-long intervals in 252 days, used to
            # compound the per-period alpha to an annual figure.
            freq_adjust = pd.Timedelta('252Days') / pd.Timedelta(period)

            alpha_beta.loc['Ann. alpha', period] = \
                (1 + alpha) ** freq_adjust - 1
            alpha_beta.loc['beta', period] = beta

    return alpha_beta
def setup_class(cls):
    """Fit GLS and OLS on the Longley data with a trailing constant column."""
    dataset = longley.load(as_pandas=False)
    # prepend=False puts the constant in the last column.
    dataset.exog = add_constant(dataset.exog, prepend=False)
    endog, exog = dataset.endog, dataset.exog
    cls.res1 = GLS(endog, exog).fit()
    cls.res2 = OLS(endog, exog).fit()
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared.:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.
    fmt : str, optional
        Format string passed to `ax.plot`.

    Raises
    ------
    ValueError
        If `line` is not '45' and either `x` or `y` is None.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot annual food expenditure on
    x-axis and household income on y-axis.  Use qqline to add regression line
    into the plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load(as_pandas=False)
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, 'r', x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    if line == '45':
        # Span the union of the current x and y limits so the diagonal
        # covers the whole visible area, then make both axes share it.
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?

    # Every remaining line type needs both samples; reject a single missing
    # one as well instead of failing later with an unrelated error.
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")

    if line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        # Line through the (theoretical, sample) first and third quartiles.
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, fmt)
"cd", "firecomp", "schooldist", "council", "zipcode", "policeprct", "healtharea", "sanitboro", "sanitsub", "zonedist1", "spdist1", "ltdheight", "landuse", "ext", "proxcode", "irrlotcode", "lottype", "borocode", "edesignum", "sanitdistrict", "healthcenterdistrict", "pfirm15_flag" ] # Deleting block and lot since it provides too much details: df = df.drop(['lot', 'block'], axis=1) # Removing factors: X = df.drop(to_factors, axis=1) # Since we predict assessland and assesstot: X = X.drop('assesstot', axis=1) X = X.drop('assessland', axis=1) X = add_constant(X) l = [] X.info() # We don't consider index = 0 since it is the const VIF value while True: vif = pd.DataFrame() vif["VIF Factor"] = [ variance_inflation_factor(X.values, j) for j in range(X.shape[1]) ] vif["features"] = X.columns if max(vif["VIF Factor"][1:] > 5): k = vif.index[vif['VIF Factor'] == max(vif['VIF Factor'][1:])] k = k.tolist() k = k[0] l.append(vif["features"][k])
def multicollinearity_assumption():
    """
    Multicollinearity: Assumes that predictors are not correlated with each
    other. If there is correlation among the predictors, then either remove
    predictors with high Variance Inflation Factor (VIF) values or perform
    dimensionality reduction.

    This assumption being violated causes issues with interpretability of the
    coefficients and the standard errors of the coefficients.

    Reads `features` and `feature_names` from the enclosing scope, shows a
    correlation heatmap and prints one VIF per feature followed by a verdict.
    The VIFs are computed on the constant-augmented design matrix, but the
    constant's own VIF is neither printed nor counted.
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from statsmodels.tools.tools import add_constant

    print(
        '\n======================================================================================='
    )
    print('Assumption 3: Little to no multicollinearity among predictors')

    # One labelled frame is used for both the heatmap and the VIFs so the
    # names always line up with the columns.
    feature_frame = pd.DataFrame(features, columns=feature_names)

    # Plotting the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(feature_frame.corr(), annot=True)
    plt.title('Correlation of Variables')
    plt.show()

    print('Variance Inflation Factors (VIF)')
    print('> 10: An indication that multicollinearity may be present')
    print('> 100: Certain multicollinearity among the variables')
    print('-------------------------------------')

    # Gathering the VIF for each variable. Columns that `add_constant`
    # introduced (i.e. not present in the feature frame) are skipped, which
    # keeps every VIF paired with its own feature name.
    design = add_constant(feature_frame)
    design_values = design.values
    VIF = [
        (name, variance_inflation_factor(design_values, col))
        for col, name in enumerate(design.columns)
        if name in feature_frame.columns
    ]
    for name, vif in VIF:
        print('{0}: {1}'.format(name, vif))

    # Gathering and printing total cases of possible or definite multicollinearity
    possible_multicollinearity = sum(1 for _, vif in VIF if vif > 10)
    definite_multicollinearity = sum(1 for _, vif in VIF if vif > 100)
    print()
    print('{0} cases of possible multicollinearity'.format(
        possible_multicollinearity))
    print('{0} cases of definite multicollinearity'.format(
        definite_multicollinearity))
    print()

    if definite_multicollinearity == 0:
        if possible_multicollinearity == 0:
            print('Assumption satisfied')
        else:
            print('Assumption possibly satisfied')
            print()
            print('Coefficient interpretability may be problematic')
            print(
                'Consider removing variables with a high Variance Inflation Factor (VIF)'
            )
    else:
        print('Assumption not satisfied')
        print()
        print('Coefficient interpretability will be problematic')
        print(
            'Consider removing variables with a high Variance Inflation Factor (VIF)'
        )
def test_add_constant_1d(self):
    """``add_constant`` on a 1-d vector gives two columns: ones, then the data."""
    expected = np.asarray([[1, 1, 1, 1], [1, 2, 3, 4.]]).T
    actual = tools.add_constant(np.arange(1, 5))
    assert_equal(actual, expected)
def test_all(self):
    """Compare GLSAR and OLS fits on the macrodata set with Gretl output.

    The regression is investment growth on a constant, GDP growth and the
    lagged real interest rate. Three fits are checked against hard-coded
    reference numbers (the Gretl output they were taken from is pasted in
    the bare string blocks below):

    1. GLSAR with the AR(1) coefficient fixed at the Gretl value,
    2. GLSAR with the coefficient estimated by ``iterative_fit``,
    3. plain OLS, with HAC standard errors and several diagnostic tests,
       plus leverage/influence measures read from a results file.

    Several reference values are recorded but not asserted (see the inline
    notes).
    """

    d = macrodata.load().data
    #import datasetswsm.greene as g
    #d = g.load('5-1')

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))

    #simple diff, not growthrate, I want heteroscedasticity later for testing
    # NOTE: endogd / exogd are not used anywhere below in this test.
    endogd = np.diff(d['realinv'])
    exogd = add_constant(np.c_[np.diff(d['realgdp']), d['realint'][:-1]])

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, d['realint'][:-1]])

    res_ols = OLS(endogg, exogg).fit()
    #print res_ols.params

    mod_g1 = GLSAR(endogg, exogg, rho=-0.108136)
    res_g1 = mod_g1.fit()
    #print res_g1.params

    mod_g2 = GLSAR(endogg, exogg, rho=-0.108136)  #-0.1335859) from R
    res_g2 = mod_g2.iterative_fit(maxiter=5)
    #print res_g2.params

    rho = -0.108136

    # coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL
    partable = np.array([
        [-9.50990, 0.990456, -9.602, 3.65e-018, -11.4631, -7.55670],  # ***
        [4.37040, 0.208146, 21.00, 2.93e-052, 3.95993, 4.78086],  # ***
        [-0.579253, 0.268009, -2.161, 0.0319, -1.10777, -0.0507346]
    ])  # **

    #Statistics based on the rho-differenced data:
    result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.113973),
                           endog_std=("S.D. dependent var", 18.67447),
                           ssr=("Sum squared resid", 22530.90),
                           mse_resid_sqrt=("S.E. of regression", 10.66735),
                           rsquared=("R-squared", 0.676973),
                           rsquared_adj=("Adjusted R-squared", 0.673710),
                           fvalue=("F(2, 198)", 221.0475),
                           f_pvalue=("P-value(F)", 3.56e-51),
                           resid_acf1=("rho", -0.003481),
                           dw=("Durbin-Watson", 1.993858))

    #fstatistic, p-value, df1, df2
    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]
    #LM-statistic, p-value, df
    arch_4 = [7.30776, 0.120491, 4, "chi2"]

    #multicollinearity
    # NOTE: vif, cond_1norm, determinant, reciprocal_condition_number and
    # normality are reference values only; nothing below asserts on them.
    vif = [1.002, 1.002]
    cond_1norm = 6862.0664
    determinant = 1.0296049e+009
    reciprocal_condition_number = 0.013819244

    #Chi-square(2): test-statistic, pvalue, df
    normality = [20.2792, 3.94837e-005, 2]

    #tests
    res = res_g1  #with rho from Gretl

    #basic
    # columns of partable: 0 = coefficient, 1 = std. error, 2 = t-ratio
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 6)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1],
                        decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=4)
    assert_approx_equal(res.f_pvalue,
                        result_gretl_g1['f_pvalue'][1],
                        significant=2)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=4)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    #tests
    res = res_g2  #with estimated rho

    #estimated lag coefficient
    assert_almost_equal(res.model.rho, rho, decimal=3)

    #basic
    # Same reference table as for res_g1, but with looser tolerances for
    # bse, fvalue and the ARCH test below.
    assert_almost_equal(res.params, partable[:, 0], 4)
    assert_almost_equal(res.bse, partable[:, 1], 3)
    assert_almost_equal(res.tvalues, partable[:, 2], 2)

    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    #assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=7) #not in gretl
    #assert_almost_equal(res.rsquared, result_gretl_g1['rsquared'][1], decimal=7) #FAIL
    #assert_almost_equal(res.rsquared_adj, result_gretl_g1['rsquared_adj'][1], decimal=7) #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1],
                        decimal=5)
    assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0)
    assert_almost_equal(res.f_pvalue,
                        result_gretl_g1['f_pvalue'][1],
                        decimal=6)
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(2, 4))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(2, 4))

    #arch
    #sm_arch = smsdia.acorr_lm(res.wresid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.wresid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=1)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=2)

    # The bare string blocks below are pasted program output kept for
    # reference; they are expression statements with no effect.
    ''' Performing iterative calculation of rho... ITER RHO ESS 1 -0.10734 22530.9 2 -0.10814 22530.9 Model 4: Cochrane-Orcutt, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv rho = -0.108136 coefficient std. error t-ratio p-value ------------------------------------------------------------- const -9.50990 0.990456 -9.602 3.65e-018 *** ds_l_realgdp 4.37040 0.208146 21.00 2.93e-052 *** realint_1 -0.579253 0.268009 -2.161 0.0319 ** Statistics based on the rho-differenced data: Mean dependent var 3.113973 S.D. dependent var 18.67447 Sum squared resid 22530.90 S.E. of regression 10.66735 R-squared 0.676973 Adjusted R-squared 0.673710 F(2, 198) 221.0475 P-value(F) 3.56e-51 rho -0.003481 Durbin-Watson 1.993858 '''
    ''' RESET test for specification (squares and cubes) Test statistic: F = 5.219019, with p-value = P(F(2,197) > 5.21902) = 0.00619 RESET test for specification (squares only) Test statistic: F = 7.268492, with p-value = P(F(1,198) > 7.26849) = 0.00762 RESET test for specification (cubes only) Test statistic: F = 5.248951, with p-value = P(F(1,198) > 5.24895) = 0.023: '''
    ''' Test for ARCH of order 4 coefficient std. error t-ratio p-value -------------------------------------------------------- alpha(0) 97.0386 20.3234 4.775 3.56e-06 *** alpha(1) 0.176114 0.0714698 2.464 0.0146 ** alpha(2) -0.0488339 0.0724981 -0.6736 0.5014 alpha(3) -0.0705413 0.0737058 -0.9571 0.3397 alpha(4) 0.0384531 0.0725763 0.5298 0.5968 Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491: '''
    ''' Variance Inflation Factors Minimum possible value = 1.0 Values > 10.0 may indicate a collinearity problem ds_l_realgdp 1.002 realint_1 1.002 VIF(j) = 1/(1 - R(j)^2), where R(j) is the multiple correlation coefficient between variable j and the other independent variables Properties of matrix X'X: 1-norm = 6862.0664 Determinant = 1.0296049e+009 Reciprocal condition number = 0.013819244 '''
    ''' Test for ARCH of order 4 - Null hypothesis: no ARCH effect is present Test statistic: LM = 7.30776 with p-value = P(Chi-square(4) > 7.30776) = 0.120491 Test of common factor restriction - Null hypothesis: restriction is acceptable Test statistic: F(2, 195) = 0.426391 with p-value = P(F(2, 195) > 0.426391) = 0.653468 Test for normality of residual - Null hypothesis: error is normally distributed Test statistic: Chi-square(2) = 20.2792 with p-value = 3.94837e-005: '''

    #no idea what this is
    ''' Augmented regression for common factor test OLS, using observations 1959:3-2009:3 (T = 201) Dependent variable: ds_l_realinv coefficient std. error t-ratio p-value --------------------------------------------------------------- const -10.9481 1.35807 -8.062 7.44e-014 *** ds_l_realgdp 4.28893 0.229459 18.69 2.40e-045 *** realint_1 -0.662644 0.334872 -1.979 0.0492 ** ds_l_realinv_1 -0.108892 0.0715042 -1.523 0.1294 ds_l_realgdp_1 0.660443 0.390372 1.692 0.0923 * realint_2 0.0769695 0.341527 0.2254 0.8219 Sum of squared residuals = 22432.8 Test of common factor restriction Test statistic: F(2, 195) = 0.426391, with p-value = 0.653468 '''

    ################ with OLS, HAC errors

    #Model 5: OLS, using observations 1959:2-2009:3 (T = 202)
    #Dependent variable: ds_l_realinv
    #HAC standard errors, bandwidth 4 (Bartlett kernel)

    #coefficient std. error t-ratio p-value 95% CONFIDENCE INTERVAL
    #for confidence interval t(199, 0.025) = 1.972

    # From here on partable / result_gretl_g1 and the reference lists are
    # rebound to the values for the OLS model.
    partable = np.array([
        [-9.48167, 1.17709, -8.055, 7.17e-014, -11.8029, -7.16049],  # ***
        [4.37422, 0.328787, 13.30, 2.62e-029, 3.72587, 5.02258],  #***
        [-0.613997, 0.293619, -2.091, 0.0378, -1.19300, -0.0349939]
    ])  # **

    result_gretl_g1 = dict(endog_mean=("Mean dependent var", 3.257395),
                           endog_std=("S.D. dependent var", 18.73915),
                           ssr=("Sum squared resid", 22799.68),
                           mse_resid_sqrt=("S.E. of regression", 10.70380),
                           rsquared=("R-squared", 0.676978),
                           rsquared_adj=("Adjusted R-squared", 0.673731),
                           fvalue=("F(2, 199)", 90.79971),
                           f_pvalue=("P-value(F)", 9.53e-29),
                           llf=("Log-likelihood", -763.9752),
                           aic=("Akaike criterion", 1533.950),
                           bic=("Schwarz criterion", 1543.875),
                           hqic=("Hannan-Quinn", 1537.966),
                           resid_acf1=("rho", -0.107341),
                           dw=("Durbin-Watson", 2.213805))

    # NOTE: of the reference lists below only linear_squares, arch_4,
    # het_white, het_breush_pagan_konker, reset_2 and reset_2_3 are asserted.
    linear_logs = [1.68351, 0.430953, 2, "chi2"]
    #for logs: dropping 70 nan or incomplete observations, T=133
    #(res_ols.model.exog <=0).any(1).sum() = 69 ?not 70
    linear_squares = [7.52477, 0.0232283, 2, "chi2"]

    #Autocorrelation, Breusch-Godfrey test for autocorrelation up to order 4
    lm_acorr4 = [1.17928, 0.321197, 4, 195, "F"]
    lm2_acorr4 = [4.771043, 0.312, 4, "chi2"]
    acorr_ljungbox4 = [5.23587, 0.264, 4, "chi2"]

    #break
    cusum_Harvey_Collier = [0.494432, 0.621549, 198, "t"]  #stats.t.sf(0.494432, 198)*2
    #see cusum results in files
    break_qlr = [3.01985, 0.1, 3, 196, "maxF"]  #TODO check this, max at 2001:4
    break_chow = [13.1897, 0.00424384, 3, "chi2"]  # break at 1984:1

    arch_4 = [3.43473, 0.487871, 4, "chi2"]

    normality = [23.962, 0.00001, 2, "chi2"]

    het_white = [33.503723, 0.000003, 5, "chi2"]
    het_breush_pagan = [1.302014, 0.521520, 2, "chi2"]  #TODO: not available
    het_breush_pagan_konker = [0.709924, 0.701200, 2, "chi2"]

    reset_2_3 = [5.219019, 0.00619, 2, 197, "f"]
    reset_2 = [7.268492, 0.00762, 1, 198, "f"]
    reset_3 = [5.248951, 0.023, 1, 198, "f"]  #not available

    cond_1norm = 5984.0525
    determinant = 7.1087467e+008
    reciprocal_condition_number = 0.013826504
    vif = [1.001, 1.001]

    # Column names assigned to the structured array read from the
    # leverage/influence results file.
    names = 'date residual leverage influence DFFITS'.split()
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    fpath = os.path.join(cur_dir,
                         'results/leverage_influence_ols_nostars.txt')
    lev = np.genfromtxt(fpath,
                        skip_header=3,
                        skip_footer=1,
                        converters={0: lambda s: s})
    #either numpy 1.6 or python 3.2 changed behavior
    # If the last parsed row is NaN in its second field, re-read the file
    # skipping one more footer line.
    if np.isnan(lev[-1]['f1']):
        lev = np.genfromtxt(fpath,
                            skip_header=3,
                            skip_footer=2,
                            converters={0: lambda s: s})

    lev.dtype.names = names

    res = res_ols  #for easier copying

    cov_hac = sw.cov_hac_simple(res, nlags=4, use_correction=False)
    bse_hac = sw.se_cov(cov_hac)

    assert_almost_equal(res.params, partable[:, 0], 5)
    assert_almost_equal(bse_hac, partable[:, 1], 5)
    #TODO

    # NOTE(review): the trailing "#not in gretl" / "#FAIL" markers on the
    # next three lines sit on active assertions; they look like leftovers.
    assert_almost_equal(res.ssr, result_gretl_g1['ssr'][1], decimal=2)
    assert_almost_equal(res.llf, result_gretl_g1['llf'][1], decimal=4)  #not in gretl
    assert_almost_equal(res.rsquared,
                        result_gretl_g1['rsquared'][1],
                        decimal=6)  #FAIL
    assert_almost_equal(res.rsquared_adj,
                        result_gretl_g1['rsquared_adj'][1],
                        decimal=6)  #FAIL
    assert_almost_equal(np.sqrt(res.mse_resid),
                        result_gretl_g1['mse_resid_sqrt'][1],
                        decimal=5)
    #f-value is based on cov_hac I guess
    #assert_almost_equal(res.fvalue, result_gretl_g1['fvalue'][1], decimal=0) #FAIL
    #assert_approx_equal(res.f_pvalue, result_gretl_g1['f_pvalue'][1], significant=1) #FAIL
    #assert_almost_equal(res.durbin_watson, result_gretl_g1['dw'][1], decimal=7) #TODO

    c = oi.reset_ramsey(res, degree=2)
    compare_ftest(c, reset_2, decimal=(6, 5))
    c = oi.reset_ramsey(res, degree=3)
    compare_ftest(c, reset_2_3, decimal=(6, 5))

    linear_sq = smsdia.linear_lm(res.resid, res.model.exog)
    assert_almost_equal(linear_sq[0], linear_squares[0], decimal=6)
    assert_almost_equal(linear_sq[1], linear_squares[1], decimal=7)

    hbpk = smsdia.het_breushpagan(res.resid, res.model.exog)
    assert_almost_equal(hbpk[0], het_breush_pagan_konker[0], decimal=6)
    assert_almost_equal(hbpk[1], het_breush_pagan_konker[1], decimal=6)

    hw = smsdia.het_white(res.resid, res.model.exog)
    assert_almost_equal(hw[:2], het_white[:2], 6)

    #arch
    #sm_arch = smsdia.acorr_lm(res.resid**2, maxlag=4, autolag=None)
    sm_arch = smsdia.het_arch(res.resid, maxlag=4)
    assert_almost_equal(sm_arch[0], arch_4[0], decimal=5)
    assert_almost_equal(sm_arch[1], arch_4[1], decimal=6)

    # NOTE: vif2 is computed but never compared with the `vif` reference.
    vif2 = [
        oi.variance_inflation_factor(res.model.exog, k) for k in [1, 2]
    ]

    infl = oi.OLSInfluence(res_ols)
    #print np.max(np.abs(lev['DFFITS'] - infl.dffits[0]))
    #print np.max(np.abs(lev['leverage'] - infl.hat_matrix_diag))
    #print np.max(np.abs(lev['influence'] - infl.influence)) #just added this based on Gretl

    #just rough test, low decimal in Gretl output,
    assert_almost_equal(lev['residual'], res.resid, decimal=3)
    assert_almost_equal(lev['DFFITS'], infl.dffits[0], decimal=3)
    assert_almost_equal(lev['leverage'], infl.hat_matrix_diag, decimal=3)
    assert_almost_equal(lev['influence'], infl.influence, decimal=4)
def print_vif(df):
    """Print the variance inflation factor of each regressor whose VIF exceeds 10.

    ``add_constant`` is applied to ``df`` and a VIF is computed for every
    column of the resulting design matrix except the first one.

    Parameters
    ----------
    df : pandas.DataFrame
        Regressors, one per column.

    Returns
    -------
    None
        One line of the form ``<column> VIF: <value>`` (value rounded to
        three decimals) is printed per flagged column.
    """
    X = add_constant(df)
    design = X.values
    # Column 0 is skipped; presumably it is the constant that add_constant
    # prepends by default -- confirm against the add_constant version in use.
    for col in range(1, X.shape[1]):
        vif = variance_inflation_factor(design, col)
        if vif > 10:
            # Take the label from X (the matrix actually evaluated), not from
            # df: the two are offset-aligned only when a constant column was
            # really inserted in front.  str() keeps non-string labels usable.
            print(str(X.columns[col]) + ' VIF: ' + str(round(vif, 3)))
signal_to_predict = np.array(x).reshape(len(x)) helping_signal = np.array(y).reshape(len(y)) # Concatenate the two signals in a (nobs,2) array X = np.array([signal_to_predict, helping_signal]).T # Arrays that will contain BIC or AIC values according to the given criterion : C_r = np.zeros((self._max_lag, 1)) C_u = np.zeros((self._max_lag, 1)) # Computing OLS models for both 'restricted' and 'unrestricted' models, for each lag between 1 and 'max_lag' for lag in range(1, self._max_lag + 1): # Adapting datas : data = lagmat2ds(X, lag, trim='both', dropex=1) dataown = add_constant(data[:, 1:(lag + 1)], prepend=False) datajoint = add_constant(data[:, 1:], prepend=False) # OLS models : OLS_restricted = OLS(data[:, 0], dataown).fit() OLS_unrestricted = OLS(data[:, 0], datajoint).fit() # Saving AIC or BIC values : if self._criterion == 'bic': C_r[lag - 1] = OLS_restricted.bic C_u[lag - 1] = OLS_unrestricted.bic elif self._criterion == 'aic': C_r[lag - 1] = OLS_restricted.aic C_u[lag - 1] = OLS_unrestricted.aic # Determine the optimal 'lag' according to 'bic' or 'aic' criterion :
def test_pandas_const_df():
    """Check that ``add_constant(..., prepend=False)`` appends a 'const' column.

    The Longley exog DataFrame is extended with a constant; the last column
    must be named 'const' and must have zero variance.
    """
    dta = longley.load_pandas().exog
    dta = tools.add_constant(dta, prepend=False)
    assert_string_equal('const', dta.columns[-1])
    # Use .iloc for positional access: the variance Series is labelled by
    # column name, and integer keys in Series[...] being treated as positions
    # is deprecated pandas behaviour.
    assert_equal(dta.var(0).iloc[-1], 0)
import pandas as pd import patsy import pytest from statsmodels.discrete.discrete_model import Poisson from statsmodels.discrete.discrete_model import Logit from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod import families from statsmodels.base._constraints import fit_constrained from statsmodels.tools.tools import add_constant from statsmodels import datasets spector_data = datasets.spector.load() spector_data.exog = add_constant(spector_data.exog, prepend=False) from .results import results_poisson_constrained as results from .results import results_glm_logit_constrained as reslogit DEBUG = False ss = '''\ agecat smokes deaths pyears 1 1 32 52407 2 1 104 43248 3 1 206 28612 4 1 186 12663 5 1 102 5317 1 0 2 18790 2 0 12 10673
def setup_class(cls):
    """Fit OLS on the Longley data and store an F-test result on the class.

    A constant is appended as the last exog column.  The fitted results are
    stored as ``cls.res1``; ``cls.Ftest`` holds the result of ``f_test`` for a
    restriction matrix made of the first six rows of the 7x7 identity.
    """
    longley_data = longley.load(as_pandas=False)
    longley_data.exog = add_constant(longley_data.exog, prepend=False)
    fitted = OLS(longley_data.endog, longley_data.exog).fit()
    cls.res1 = fitted
    # All rows of the identity except the last one.
    restrictions = np.identity(7)[:-1, :]
    cls.Ftest = fitted.f_test(restrictions)
def kpss(x, regression='c', lags=None, store=False):
    """
    Kwiatkowski-Phillips-Schmidt-Shin test for stationarity.

    Computes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null
    hypothesis that x is level or trend stationary.

    Parameters
    ----------
    x : array_like, 1d
        Data series
    regression : str{'c', 'ct'}
        Indicates the null hypothesis for the KPSS test (case-insensitive)
        * 'c' : The data is stationary around a constant (default)
        * 'ct' : The data is stationary around a trend
    lags : int
        Indicates the number of lags to be used. If None (default),
        lags is set to int(ceil(12 * (n / 100)**(1 / 4))), following
        Schwert (1989).
    store : bool
        If True, then a result instance is returned in addition to the
        KPSS statistic (default is False).

    Returns
    -------
    kpss_stat : float
        The KPSS test statistic
    p_value : float
        The p-value of the test. The p-value is interpolated from
        Table 1 in Kwiatkowski et al. (1992), and a boundary point
        is returned if the test statistic is outside the table of
        critical values, that is, if the p-value is outside the
        interval (0.01, 0.1).
    lags : int
        The truncation lag parameter. Only returned when `store` is False;
        with `store=True` it is available as ``resstore.lags`` instead.
    crit : dict
        The critical values at 10%, 5%, 2.5% and 1%. Based on
        Kwiatkowski et al. (1992).
    resstore : (optional) instance of ResultsStore
        Only returned when `store` is True. A dummy class instance with
        the attributes ``lags``, ``nobs``, ``H0`` and ``HA`` attached.

    Raises
    ------
    ValueError
        If ``len(x)`` differs from the total number of elements of `x`
        (e.g. a 2d input with more than one column), or if `regression`
        is neither 'c' nor 'ct'.

    Notes
    -----
    The return value is ``(kpss_stat, p_value, lags, crit)`` when `store` is
    False and ``(kpss_stat, p_value, crit, resstore)`` when `store` is True.

    sigma^2 is estimated by ``_sigma_est_kpss`` (documented as a Newey-West
    estimator). If lags is None, the truncation lag parameter is set to
    int(ceil(12 * (n / 100) ** (1 / 4))), following Schwert (1989). The
    p-values are interpolated from Table 1 of Kwiatkowski et al. (1992). If
    the computed statistic is outside the table of critical values, then an
    InterpolationWarning is issued.

    Missing values are not handled.

    References
    ----------
    D. Kwiatkowski, P. C. B. Phillips, P. Schmidt, and Y. Shin (1992): Testing
    the Null Hypothesis of Stationarity against the Alternative of a Unit Root.
    `Journal of Econometrics` 54, 159-178.
    """
    from warnings import warn

    nobs = len(x)
    x = np.asarray(x)
    hypo = regression.lower()

    # if m is not one, n != m * n
    if nobs != x.size:
        raise ValueError("x of shape {0} not understood".format(x.shape))

    if hypo == 'ct':
        # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t,
        # where beta is the trend, r_t a random walk and e_t a stationary
        # error term.
        resids = OLS(x, add_constant(np.arange(1, nobs + 1))).fit().resid
        crit = [0.119, 0.146, 0.176, 0.216]
    elif hypo == 'c':
        # special case of the model above, where beta = 0 (so the null
        # hypothesis is that the data is stationary around r_0).
        resids = x - x.mean()
        crit = [0.347, 0.463, 0.574, 0.739]
    else:
        raise ValueError("hypothesis '{0}' not understood".format(hypo))

    if lags is None:
        # from Kwiatkowski et al. referencing Schwert (1989)
        lags = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))

    # p-values matching the entries of `crit`, in the same order
    pvals = [0.10, 0.05, 0.025, 0.01]

    # eq. 11, p. 165; np.sum keeps the reduction vectorized
    eta = np.sum(resids.cumsum() ** 2) / (nobs ** 2)
    s_hat = _sigma_est_kpss(resids, nobs, lags)

    kpss_stat = eta / s_hat
    # np.interp clamps to the end points of `pvals` when the statistic lies
    # outside the tabulated critical values; that case is flagged below.
    p_value = np.interp(kpss_stat, crit, pvals)

    # stacklevel=2 attributes the warning to the caller of kpss
    if p_value == pvals[-1]:
        warn("p-value is smaller than the indicated p-value",
             InterpolationWarning, stacklevel=2)
    elif p_value == pvals[0]:
        warn("p-value is greater than the indicated p-value",
             InterpolationWarning, stacklevel=2)

    crit_dict = {'10%': crit[0], '5%': crit[1], '2.5%': crit[2],
                 '1%': crit[3]}

    if store:
        rstore = ResultsStore()
        rstore.lags = lags
        rstore.nobs = nobs

        stationary_type = "level" if hypo == 'c' else "trend"
        rstore.H0 = "The series is {0} stationary".format(stationary_type)
        rstore.HA = "The series is not {0} stationary".format(stationary_type)

        # NOTE: `lags` is not part of this tuple; it is carried by rstore.
        return kpss_stat, p_value, crit_dict, rstore
    else:
        return kpss_stat, p_value, lags, crit_dict
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Draws a CCPR (component and component-plus-residual) plot for a single
    explanatory variable of a fitted regression.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : int or string
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    fig : Matplotlib figure instance
        The figure returned by ``abline_plot`` for the axes that were drawn
        on.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm
    """
    from statsmodels.tools.tools import add_constant

    fig, ax = utils.create_mpl_ax(ax)
    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    # Selected regressor and its fitted contribution B_i * X_i.
    regressor = results.model.exog[:, exog_idx]
    component = regressor * results.params[exog_idx]

    # Scatter of the component-plus-residual values against the regressor.
    partial_residuals = component + results.resid
    ax.plot(regressor, partial_residuals, 'o')

    # Regress the component on the regressor (plus constant) and draw the
    # fitted line; the fitted parameters are handed to abline_plot in order.
    component_fit = OLS(component, add_constant(regressor)).fit()
    fig = abline_plot(*component_fit.params, ax=ax)

    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)
    return fig
# load data into module namespace from statsmodels.datasets.cpunish import load from statsmodels.discrete.discrete_model import ( NegativeBinomial, NegativeBinomialP, Poisson, ) import statsmodels.discrete.tests.results.results_count_margins as res_stata from statsmodels.tools.tools import add_constant cpunish_data = load() cpunish_data.exog = np.asarray(cpunish_data.exog) cpunish_data.endog = np.asarray(cpunish_data.endog) cpunish_data.exog[:, 3] = np.log(cpunish_data.exog[:, 3]) exog = add_constant(cpunish_data.exog, prepend=False) endog = cpunish_data.endog - 1 # avoid zero-truncation exog /= np.round(exog.max(0), 3) class CheckMarginMixin(object): rtol_fac = 1 def test_margins_table(self): res1 = self.res1 sl = self.res1_slice rf = self.rtol_fac assert_allclose(self.margeff.margeff, self.res1.params[sl], rtol=1e-5 * rf) assert_allclose(self.margeff.margeff_se,
def predict(self, params, exog=None):
    """Return the linear prediction for `params` with a constant column added.

    Parameters
    ----------
    params : array_like
        Coefficients, passed as the second operand of ``np.dot``.
    exog : array_like, optional
        Regressors without the constant. Defaults to ``self.exog``.

    Returns
    -------
    The dot product of the constant-augmented regressors and `params`.
    """
    regressors = self.exog if exog is None else exog
    # add_constant is asked to put the constant in front (prepend=True).
    design = add_constant(regressors, prepend=True)
    return np.dot(design, params)
import statsmodels.tools._testing as smt # get data and results as module global for now, TODO: move to class from .results import results_count_robust_cluster as results_st cur_dir = os.path.dirname(os.path.abspath(__file__)) filepath = os.path.join(cur_dir, "results", "ships.csv") data_raw = pd.read_csv(filepath, index_col=False) data = data_raw.dropna() #mod = smd.Poisson.from_formula('accident ~ yr_con + op_75_79', data=dat) # Don't use formula for tests against Stata because intercept needs to be last endog = data['accident'] exog_data = data['yr_con op_75_79'.split()] exog = add_constant(exog_data, prepend=False) group = np.asarray(data['ship'], int) exposure = np.asarray(data['service']) # TODO get the test methods from regression/tests class CheckCountRobustMixin(object): def test_basic(self): res1 = self.res1 res2 = self.res2 if len(res1.params) == (len(res2.params) - 1): # Stata includes lnalpha in table for NegativeBinomial mask = np.ones(len(res2.params), np.bool_) mask[-2] = False res2_params = res2.params[mask]