def test_formula_predict():
    """Check that in-sample predictions reproduce the fitted values.

    Fits the Longley formula model and compares ``results.fittedvalues``
    with ``results.predict`` evaluated on the dataset's ``exog`` frame.
    """
    # `log` is referenced by name inside the formula string below, so it
    # has to be importable from this function's namespace.
    from numpy import log  # noqa:F401

    formula = """TOTEMP ~ log(GNPDEFL) + log(GNP) + UNEMP + ARMED + POP + YEAR"""
    # Load the dataset once; both the full frame and `exog` come from it.
    data = load_pandas()
    results = ols(formula, data.data).fit()
    # Compare as plain arrays with a tolerance: exact equality between two
    # floating-point computations of the same quantity is brittle.
    npt.assert_almost_equal(
        results.fittedvalues.values, results.predict(data.exog), 8
    )
def test_formula_predict():
    """Fitted values must agree with predictions on the original regressors."""
    # The formula string refers to `log` by name, hence the local import.
    from numpy import log  # noqa:F401

    formula = """TOTEMP ~ log(GNPDEFL) + log(GNP) + UNEMP + ARMED + POP + YEAR"""
    data = load_pandas()
    dta = load_pandas().data
    fitted = ols(formula, dta).fit()

    in_sample = fitted.fittedvalues.values
    predicted = fitted.predict(data.exog)
    # Agreement is required to 8 decimal places.
    npt.assert_almost_equal(in_sample, predicted, 8)
def test_formula_predict():
    """Predicting on the training regressors must return the fitted values."""
    # patsy needs to find `log` in this namespace when it evaluates the
    # formula, so the otherwise-unused import is intentional.
    from numpy import log  # noqa:F401

    formula = """TOTEMP ~ log(GNPDEFL) + log(GNP) + UNEMP + ARMED + POP + YEAR"""
    dataset = load_pandas()
    frame = load_pandas().data
    res = ols(formula, frame).fit()

    actual = res.fittedvalues.values
    desired = res.predict(dataset.exog)
    npt.assert_almost_equal(actual, desired, decimal=8)
def test_pandas_const_df_prepend():
    """Check that ``add_constant(..., prepend=True)`` puts ``const`` first.

    Regression test for #1025: the ``UNEMP`` column is divided by its own
    standard deviation before the constant is added.
    """
    dta = longley.load_pandas().exog
    # regression test for #1025
    dta["UNEMP"] /= dta["UNEMP"].std()
    dta = tools.add_constant(dta, prepend=True)
    assert_string_equal("const", dta.columns[0])
    # The per-column variance is indexed by column label, so select the
    # first entry by position with `.iloc`; a bare `[0]` relies on the
    # deprecated integer-position fallback of `Series.__getitem__`.
    assert_equal(dta.var(axis=0).iloc[0], 0)
def test_pandas_const_df_prepend():
    """Verify a prepended constant lands in column 0 of a DataFrame.

    Regression test for #1025: ``UNEMP`` is rescaled by its standard
    deviation before ``add_constant`` is called.
    """
    dta = longley.load_pandas().exog
    # regression test for #1025
    dta['UNEMP'] /= dta['UNEMP'].std()
    dta = tools.add_constant(dta, prepend=True)
    assert_string_equal('const', dta.columns[0])
    # Positional access must go through `.iloc`: the variance Series has
    # string labels, and integer `[0]` on it is deprecated in pandas.
    column_variances = dta.var(axis=0)
    assert_equal(column_variances.iloc[0], 0)
# Regression test for GH#734: fits OLS on the Longley data with an added
# "constant" column, renders the summary as LaTeX (a UserWarning is expected
# while doing so), replaces the Date/Time cells with fixed values, and
# compares the result against the stored expected table.
# NOTE(review): the code below is kept byte-identical because the expected
# LaTeX literal is whitespace-sensitive.
def test_summary_as_latex(): # GH#734 import re dta = longley.load_pandas() X = dta.exog X["constant"] = 1 y = dta.endog res = OLS(y, X).fit() with pytest.warns(UserWarning): table = res.summary().as_latex() # replace the date and time table = re.sub("(?<=\n\\\\textbf\\{Date:\\} &).+?&", " Sun, 07 Apr 2013 &", table) table = re.sub("(?<=\n\\\\textbf\\{Time:\\} &).+?&", " 13:46:07 &", table) expected = """\\begin{center} \\begin{tabular}{lclc} \\toprule \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lcccccc} & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\ \\midrule \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\ \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\ \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\ \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\ \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\ \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\ \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lclc} \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ \\textbf{Kurtosis:} & 2.434 & 
\\textbf{ Cond. No. } & 4.86e+09 \\\\ \\bottomrule \\end{tabular} %\\caption{OLS Regression Results} \\end{center} Warnings: \\newline [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline strong multicollinearity or other numerical problems.""" assert_equal(table, expected)
def test_tests():
    """Check the restriction matrices built from a hypothesis string.

    The three comma-separated hypotheses must produce one row each in the
    coefficient matrix and in the constants vector.
    """
    formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
    test_formula = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)'

    fitted = ols(formula, load_pandas().data).fit()
    hypotheses = make_hypotheses_matrices(fitted, test_formula)
    coefs = hypotheses.coefs
    constants = hypotheses.constants

    # One row per hypothesis, one column per model parameter.
    expected_coefs = [
        [0, 1, -1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1. / 1829],
    ]
    expected_constants = [[0], [2], [1]]

    npt.assert_almost_equal(coefs, expected_coefs, 8)
    npt.assert_array_equal(constants, expected_constants)
def test_tests():
    """Hypothesis strings must translate into the expected R and Q matrices."""
    formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
    dta = load_pandas().data
    results = ols(formula, dta).fit()

    test_formula = '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)'
    lin_constraint = make_hypotheses_matrices(results, test_formula)
    restriction_rows = lin_constraint.coefs
    restriction_values = lin_constraint.constants

    # Rows correspond to the three hypotheses in `test_formula`, in order.
    gnpdefl_equals_gnp = [0, 1, -1, 0, 0, 0, 0]
    unemp_equals_two = [0, 0, 0, 1, 0, 0, 0]
    scaled_year_equals_one = [0, 0, 0, 0, 0, 0, 1. / 1829]

    npt.assert_almost_equal(
        restriction_rows,
        [gnpdefl_equals_gnp, unemp_equals_two, scaled_year_equals_one],
        8,
    )
    npt.assert_array_equal(restriction_values, [[0], [2], [1]])
# Test for issue 734: fits OLS on the Longley data with an added "constant"
# column under warnings.catch_warnings(record=True), renders the summary as
# LaTeX, replaces the Date/Time cells with fixed values, and compares the
# result against the stored expected table.
# NOTE(review): the two regex literals contain `\{` and `\}` inside non-raw
# strings; these are invalid string escape sequences and newer Python
# versions warn about them. Doubling the backslashes (`\\{`, `\\}`) keeps the
# same pattern without the warning. The code is left byte-identical here
# because the expected LaTeX literal is whitespace-sensitive and the extent
# of the `with` block cannot be recovered from this view.
def test_summary(): # test 734 import re dta = longley.load_pandas() X = dta.exog X["constant"] = 1 y = dta.endog with warnings.catch_warnings(record=True): res = OLS(y, X).fit() table = res.summary().as_latex() # replace the date and time table = re.sub("(?<=\n\\\\textbf\{Date:\} &).+?&", " Sun, 07 Apr 2013 &", table) table = re.sub("(?<=\n\\\\textbf\{Time:\} &).+?&", " 13:46:07 &", table) expected = """\\begin{center} \\begin{tabular}{lclc} \\toprule \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lccccc} & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]} \\\\ \\midrule \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 207.153 \\\\ \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 0.040 \\\\ \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 -0.915 \\\\ \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 -0.549 \\\\ \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 0.460 \\\\ \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 2859.515 \\\\ \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 -1.47e+06 \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lclc} \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ \\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. 
No. } & 4.86e+09 \\\\ \\bottomrule \\end{tabular} %\\caption{OLS Regression Results} \\end{center}""" assert_equal(table, expected)
def test_pandas_const_series_prepend():
    """Check that ``add_constant`` on a Series prepends a ``const`` column.

    The single ``GNP`` column is passed in; the result is expected to expose
    ``columns`` with ``const`` first and zero variance in that column.
    """
    dta = longley.load_pandas()
    series = dta.exog["GNP"]
    series = tools.add_constant(series, prepend=True)
    assert_string_equal("const", series.columns[0])
    # The variance Series is indexed by column label; `.iloc[0]` selects
    # the first entry by position without relying on the deprecated
    # integer fallback of `Series.__getitem__`.
    assert_equal(series.var(axis=0).iloc[0], 0)
def setup_class(cls):
    """Build the formula model from a plain dict of lists, then run the parent setup."""
    frame = load_pandas().data
    # Convert every column to a Python list so the model is fed a dict
    # rather than a DataFrame.
    data = {name: column.tolist() for name, column in iteritems(frame)}
    cls.model = ols(longley_formula, data)
    super(TestFormulaDict, cls).setup_class()
res3 = sm.OLS(y, X).fit()
print(res3.f_test(R))
print(res3.f_test("x2 = x3 = 0"))


# ### Multicollinearity
#
# The Longley dataset is well known to have high multicollinearity, i.e. its
# exogenous predictors are highly correlated. This is problematic because it
# can affect the stability of the coefficient estimates when the model
# specification changes only slightly.

from statsmodels.datasets.longley import load_pandas

y = load_pandas().endog
# Load the regressors and add the intercept column in one step.
X = sm.add_constant(load_pandas().exog)


# Fit and summary:

ols_model = sm.OLS(y, X)
ols_results = ols_model.fit()
print(ols_results.summary())


# #### Condition number
#
# One way to assess multicollinearity is to compute the condition number.
# Values over 20 are worrisome (see Greene 4.9). The first step is to
# normalize the independent variables to have unit length:
def test_pandas_const_df():
    """Check that ``add_constant(..., prepend=False)`` appends ``const`` last.

    The last column of the returned DataFrame must be named ``const`` and
    have zero variance.
    """
    dta = longley.load_pandas().exog
    dta = tools.add_constant(dta, prepend=False)
    assert_string_equal('const', dta.columns[-1])
    # Use `.iloc[-1]` for the last entry: the variance Series carries
    # column labels, and integer `[-1]` on it is deprecated in pandas.
    assert_equal(dta.var(axis=0).iloc[-1], 0)
def test_pandas_const_df_prepend():
    """Check that ``add_constant(..., prepend=True)`` inserts ``const`` first.

    The first column of the returned DataFrame must be named ``const`` and
    have zero variance.
    """
    dta = longley.load_pandas().exog
    dta = tools.add_constant(dta, prepend=True)
    assert_string_equal('const', dta.columns[0])
    # `.iloc[0]` is explicit positional access; plain `[0]` on the
    # label-indexed variance Series is a deprecated fallback.
    assert_equal(dta.var(axis=0).iloc[0], 0)
def setup_class(cls):
    """Build the formula model from the pandas data, then run the parent setup."""
    cls.model = ols(longley_formula, load_pandas().data)
    super(TestFormulaPandas, cls).setup_class()
def test_pandas_const_series_prepend():
    """Verify a constant prepended to a Series becomes the first column.

    ``add_constant`` is given the ``GNP`` Series; the result must have
    ``const`` as its first column, with zero variance.
    """
    dta = longley.load_pandas()
    series = dta.exog['GNP']
    series = tools.add_constant(series, prepend=True)
    assert_string_equal('const', series.columns[0])
    # Positional lookup via `.iloc`; integer `[0]` on a label-indexed
    # Series is deprecated in pandas.
    variances = series.var(axis=0)
    assert_equal(variances.iloc[0], 0)
def test_pandas_const_series():
    """Check that ``add_constant`` on a Series appends ``const`` second.

    With ``prepend=False`` the ``GNP`` column stays first, so ``const`` is
    expected at position 1 with zero variance.
    """
    dta = longley.load_pandas()
    series = dta.exog['GNP']
    series = tools.add_constant(series, prepend=False)
    assert_string_equal('const', series.columns[1])
    # `.iloc[1]` selects by position; integer `[1]` on the label-indexed
    # variance Series relies on a deprecated pandas fallback.
    assert_equal(series.var(axis=0).iloc[1], 0)
def setupClass(cls):
    """Attach the formula model fitted on the pandas data, then chain to the parent."""
    longley_frame = load_pandas().data
    model = ols(longley_formula, longley_frame)
    cls.model = model
    super(TestFormulaPandas, cls).setupClass()
plt.show()


# In[155]:

# Daily percentage change of the closing prices.
file_tcs['pct_change'] = file_tcs['Close Price'].pct_change()
file_nifty['pct_change'] = file_nifty['Close'].pct_change()


# In[158]:

y = file_nifty['pct_change'].dropna()
x = file_tcs['pct_change'].dropna()
# NOTE(review): the two assignments above are overwritten immediately, so
# the regression below runs on the Longley data rather than on the
# percentage changes. Confirm which data set was intended.
y = load_pandas().endog
x = sm.add_constant(load_pandas().exog)
myModel = sm.OLS(y, x).fit()
myModel.summary()


# In[160]:

tcs = pd.read_csv('TCS.csv', index_col='Date', parse_dates=True)
nifty50 = pd.read_csv('Nifty50.csv', index_col='Date', parse_dates=True)


# In[162]:
# Example-script fragment: configures a small dummy-variable OLSExample,
# fits it and prints two F-tests, then fits the Longley data (with an added
# constant) to illustrate multicollinearity, printing parameters, standard
# errors and R^2, and finally starts normalizing the regressors for the
# condition-number computation.
# NOTE(review): the Longley fit calls `ols.fit_data(...)`, but `ols` is not
# defined in this fragment; confirm it refers to an OLSExample instance.
# NOTE(review): the trailing `for` loop continues beyond this fragment.
ols_dummy_vars_small = OLSExample() ols_dummy_vars_small.set_config(x_start=0, x_stop=20, n_samples=50, beta=[1.0, 0.3, -0.0, 10], dummy_slices=[20, 40]) (y, X) = ols_dummy_vars_small.make_dummy_vars() res_ols_dummy_small = ols_dummy_vars_small.fit_data(y, X) print(res_ols_dummy_small.f_test(R)) print(res_ols_dummy_small.f_test("x2 = x3 = 0")) """ Multicollinearity: the exogenous predictors are highly correlated. This is problematic because it can affect the stability of our coefficient estimates as we make minor changes to model specification. """ y_multicol = load_pandas().endog X_multicol = load_pandas().exog X_multicol = sm.add_constant(X_multicol) print("X: ", X_multicol) res_ols_multicollinearity = ols.fit_data(y_multicol, X_multicol) print("Parameters :", res_ols_multicollinearity.params) print("Standard errors: ", res_ols_multicollinearity.bse) print("R^2 :", res_ols_multicollinearity.rsquared) """ condition number: to assess multicollinearity - Values over 20 are worrisome (see Greene 4.9) """ #The first step is to normalize the independent variables to have unit length norm_x = X_multicol.values for i, name in enumerate(X_multicol):
# Notebook-export fragment: simulates `y` as `X @ beta` plus standard normal
# noise, fits OLS and prints two F-tests, then refits on the Longley data
# (with an added constant), prints the summary, and begins iterating over the
# columns of X for the unit-length normalization used in the condition-number
# computation.
# NOTE(review): the trailing `for`/`if` continues beyond this fragment.
beta = [1., 0.3, -0.0, 10] y_true = np.dot(X, beta) y = y_true + np.random.normal(size=nsample) res3 = sm.OLS(y, X).fit() print(res3.f_test(R)) print(res3.f_test("x2 = x3 = 0")) # ### Multicollinearity # # The Longley dataset is well known to have high multicollinearity. That is, the exogenous predictors are highly correlated. This is problematic because it can affect the stability of our coefficient estimates as we make minor changes to model specification. from statsmodels.datasets.longley import load_pandas y = load_pandas().endog X = load_pandas().exog X = sm.add_constant(X) # Fit and summary: ols_model = sm.OLS(y, X) ols_results = ols_model.fit() print(ols_results.summary()) # #### Condition number # # One way to assess multicollinearity is to compute the condition number. Values over 20 are worrisome (see Greene 4.9). The first step is to normalize the independent variables to have unit length: for i, name in enumerate(X): if name == "const":
def setupClass(cls):
    """Build the formula model from a plain dict of lists, then run the parent setup.

    Each column of the Longley DataFrame is converted to a Python list so
    the model is constructed from a dict rather than a DataFrame.
    """
    frame = load_pandas().data
    # `DataFrame.iteritems()` was removed in pandas 2.0; `items()` yields
    # the same (column name, Series) pairs.
    data = {k: v.tolist() for k, v in frame.items()}
    cls.model = ols(longley_formula, data)
    super(TestFormulaDict, cls).setupClass()
def test_pandas_const_df():
    """Verify a constant appended to a DataFrame becomes the last column.

    With ``prepend=False`` the final column must be named ``const`` and
    have zero variance.
    """
    dta = longley.load_pandas().exog
    dta = tools.add_constant(dta, prepend=False)
    assert_string_equal("const", dta.columns[-1])
    # Positional lookup via `.iloc`; integer `[-1]` on the label-indexed
    # variance Series is a deprecated pandas fallback.
    column_variances = dta.var(axis=0)
    assert_equal(column_variances.iloc[-1], 0)
A = A[1:, :]
print(A)

# This tests that each coefficient is jointly statistically significantly
# different from zero.
print(results.f_test(A))
print(results.fvalue)  # 330.2853392346658
print(results.f_pvalue)  # 4.98403096572e-10

B = np.array([
    [0, 0, 1, -1, 0, 0, 0],
    [0, 0, 0, 0, 0, 1, -1],
])
# This tests that the coefficients on the 2nd and 3rd regressors are equal
# and, jointly, that the coefficients on the 5th and 6th regressors are equal.
print(results.f_test(B))

from statsmodels.datasets import longley
from statsmodels.formula.api import ols

dta = longley.load_pandas().data
print(dta.describe())

formula = 'TOTEMP ~ GNPDEFL + GNP + UNEMP + ARMED + POP + YEAR'
results = ols(formula, dta).fit()

# Run the same F-test for each hypothesis string in turn; `hypotheses` and
# `f_test` end up bound to the last one.
for hypotheses in (
    '(GNPDEFL = GNP), (UNEMP = 2), (YEAR/1829 = 1)',
    '(GNPDEFL = GNP), (UNEMP = 2)',
    '(GNPDEFL = GNP),(YEAR/1829 = 1)',
):
    print(hypotheses)
    f_test = results.f_test(hypotheses)
    print(f_test)
def test_pandas_const_series():
    """Verify a constant appended to a Series becomes the second column.

    ``add_constant`` is given the ``GNP`` Series with ``prepend=False``;
    ``const`` is expected at position 1 with zero variance.
    """
    dta = longley.load_pandas()
    series = dta.exog["GNP"]
    series = tools.add_constant(series, prepend=False)
    assert_string_equal("const", series.columns[1])
    # Select by position with `.iloc`; integer `[1]` on a label-indexed
    # Series is deprecated in pandas.
    variances = series.var(axis=0)
    assert_equal(variances.iloc[1], 0)