import numpy as np
from numpy.testing import assert_, assert_allclose, assert_almost_equal

from statsmodels.regression.linear_model import OLS
from statsmodels.stats import outliers_influence as oi
from statsmodels.tools.tools import add_constant


def test_influence_dtype():
    # see #2148: bug when endog is integer
    y = np.ones(20)
    np.random.seed(123)
    x = np.random.randn(20, 3)
    res1 = OLS(y, x).fit()

    res2 = OLS(y * 1., x).fit()
    cr1 = res1.get_influence().cov_ratio
    cr2 = res2.get_influence().cov_ratio
    assert_allclose(cr1, cr2, rtol=1e-14)
    # regression test for values
    cr3 = np.array([
        1.22239215, 1.31551021, 1.52671069, 1.05003921, 0.89099323,
        1.57405066, 1.03230092, 0.95844196, 1.15531836, 1.21963623,
        0.87699564, 1.16707748, 1.10481391, 0.98839447, 1.08999334,
        1.35680102, 1.46227715, 1.45966708, 1.13659521, 1.22799038])
    assert_almost_equal(cr1, cr3, decimal=8)
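# For reference, cov_ratio above is the COVRATIO influence measure: the
# determinant of the leave-one-out parameter covariance matrix relative to
# the full-sample one. A minimal sketch of that textbook definition,
# assuming the imports above; covratio_manual is a hypothetical helper for
# illustration, not a statsmodels function:

def covratio_manual(y, x, i):
    # Refit without observation i, then take the ratio of determinants of
    # s^2 * (X'X)^{-1}, which is what results.cov_params() returns for OLS.
    res_full = OLS(y, x).fit()
    mask = np.arange(len(y)) != i
    res_loo = OLS(y[mask], x[mask]).fit()
    return (np.linalg.det(res_loo.cov_params())
            / np.linalg.det(res_full.cov_params()))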
def test_outlier_influence_funcs():
    # smoke test
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    oi.summary_table(res, alpha=0.05)

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
def test_outlier_influence_funcs(reset_randomstate):
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    out_05 = oi.summary_table(res)
    # GH3344: check that alpha has an effect
    out_01 = oi.summary_table(res, alpha=0.01)
    assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
    assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
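# The assertions above index into out_05[1] and out_01[1], relying on
# summary_table returning a (SimpleTable, data array, column-name list)
# triple. A minimal sketch unpacking that return value, assuming columns 6
# and 7 hold the lower/upper prediction-interval bounds as the GH3344
# assertions imply; the _demo variables are illustrative only:

x_demo = add_constant(np.random.randn(10, 2))
y_demo = x_demo.sum(1) + np.random.randn(10)
res_demo = OLS(y_demo, x_demo).fit()
st, data, ss2 = oi.summary_table(res_demo, alpha=0.05)
print(ss2)           # column labels
print(data[:, 6:8])  # prediction-interval bounds; wider for alpha=0.01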
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS


def ols_sm(X_train, y_train, X_test, y_test):
    # add a column of ones for the intercept coefficient in the OLS model
    X_train = sm.add_constant(X_train)
    ols = OLS(y_train, X_train).fit()

    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot true vs. predicted values to examine whether a linear model is a
    # good fit. If not, consider adding a quadratic term to X or taking the
    # log of y.
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()

    # Discern whether a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high-leverage points.
    # a. Identify outliers (typically, data points with studentized residuals
    #    outside of +/- 3 stdev). Temporarily remove these from the data set
    #    and re-run the model. Do the model metrics improve considerably?
    #    Does this give you cause for more confidence in the model?
    # b. Identify outliers that are also high-leverage points
    #    (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of the residual terms).
    # If the residuals exhibit a "funnel-shaped" pattern, consider transforming
    # the data into logarithmic space.
    studentized_residuals = ols.outlier_test()[:, 0]
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test whether the residuals are normally distributed with a QQ plot:
    # plot quantiles of the normal distribution against the studentized
    # residuals; if the sample quantiles are normally distributed, the dots
    # align with the 45-degree line.
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in the data.
    # DFBETAS: a standardized measure of how much each coefficient changes
    # when that observation is left out; a common cutoff is 2 / sqrt(n).
    threshold = 2. / len(X_train)**.5
    infl = ols.get_influence()
    df = pd.DataFrame(infl.summary_frame().filter(regex="dfb"))
    inf = df[df > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)
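# A minimal usage sketch for ols_sm on synthetic data, assuming the imports
# above; the coefficients and the 80/20 train/test split are illustrative
# only.

import numpy as np

np.random.seed(0)
X = np.random.randn(100, 3)
y_all = X @ np.array([1.5, -2.0, 0.5]) + np.random.randn(100)
X_train, X_test = X[:80], X[80:]
y_train, y_test = y_all[:80], y_all[80:]
ols_sm(X_train, y_train, X_test, y_test)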