# Imports assumed by this excerpt (the file header is not shown here);
# lrplot / lrp refer to a project-local helper module that provides
# plot_R_graphs() and related R-style diagnostic plots.
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as rp


def simple_regression(self):
    '''
    The answers for exercise 03-08:
    (a)
        (i)   Yes, there is a relationship -- see the F-statistic.
        (ii)  Judge the strength of the relationship from the RSE and
              the R^2 statistic.
        (iii) Negative.
        (iv)  See the code below; the prediction interval is not computed here.
    (b) See the code below.
    (c) The residuals-vs-fitted plot shows non-linearity.
    '''
    # model = smf.ols(formula="mpg ~ horsepower", data=self.df)
    y = self.df['mpg']
    X = self.df[['horsepower']]
    X = sm.add_constant(X)
    print(X)
    res = sm.OLS(y, X).fit()
    # res = model.fit()
    print(res.summary())
    print("The prediction is: ", res.predict(exog=[[1, 98]]))
    print("The prediction interval is: ")  # see the sketch after this method
    '''
    # Alternative plot: scatter of the data plus the fitted line on a grid.
    self.df.plot(kind="scatter", x='horsepower', y='mpg', c='w')
    graph_x = np.linspace(min(self.df['horsepower']), 200)
    graph_y = res.predict(sm.add_constant(graph_x))
    plt.plot(graph_x, graph_y)
    '''
    fig = rp.abline_plot(model_results=res)
    ax = fig.axes[0]
    ax.scatter(X['horsepower'], y, c='w')
    plt.show()
    lrplot.plot_R_graphs(res)
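
# Hedged sketch (not part of the original solution): how the prediction
# interval that simple_regression() only prints a placeholder for could be
# computed with statsmodels' get_prediction / summary_frame API. The helper
# name prediction_interval_sketch and its defaults (hp=98 from the exercise,
# alpha=0.05 for 95% intervals) are hypothetical.
def prediction_interval_sketch(self, hp=98, alpha=0.05):
    y = self.df['mpg']
    X = sm.add_constant(self.df[['horsepower']])
    res = sm.OLS(y, X).fit()
    pred = res.get_prediction([[1, hp]])
    frame = pred.summary_frame(alpha=alpha)
    # mean_ci_*: confidence interval for the mean response at hp;
    # obs_ci_*:  prediction interval for an individual new observation.
    print(frame[['mean_ci_lower', 'mean_ci_upper',
                 'obs_ci_lower', 'obs_ci_upper']])
    return frame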
def multi_variate_regression(self):
    """
    The answers for exercise 03-09:
    (f)
    From the correlation matrix in 9(a), displacement, horsepower and weight
    show a similar nonlinear pattern against our response mpg. This nonlinear
    pattern is very close to a log form, so in the next attempt we use
    log(mpg) as our response variable.
    """
    # Why choose these predictors? By brute force, keeping the smallest p-values?
    mod = smf.ols(
        formula="np.log(mpg) ~ cylinders+displacement+horsepower+weight+acceleration+year+origin",
        data=self.df
    )
    res = mod.fit()
    print(res.summary())
    return res


if __name__ == "__main__":
    ex09 = Exec09()
    # ex09.plot_scatter_matrix()
    # ex09.show_covariance()
    # res is a statsmodels RegressionResults instance.
    res = ex09.multi_variate_regression()
    # lrp.plot_scale_location(res)
    # lrp.plot_qq(res)
    # lrp.plot_fitted_student_residual(ex09.df, res)
    # ex09.get_leverages_resid(res)
    # ex09.get_vifs(ex09.X)  # see the VIF sketch at the bottom of the file
    # ex09.regress_with_interaction()
    # res = ex09.regress_with_poly_2()
    lrp.plot_R_graphs(res)
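

# Hedged sketch (not from the original solution): the commented-out
# ex09.get_vifs(ex09.X) call above hints at a collinearity check. A minimal
# version of such a helper could look like the following; it assumes X is a
# pandas DataFrame of predictors that already includes the constant column
# from sm.add_constant. The name get_vifs_sketch is hypothetical.
from statsmodels.stats.outliers_influence import variance_inflation_factor


def get_vifs_sketch(X):
    """Print the variance inflation factor (VIF) for each predictor column.

    A VIF above ~10 is a common rule of thumb for problematic collinearity.
    """
    for i, name in enumerate(X.columns):
        print("%s: %.2f" % (name, variance_inflation_factor(X.values, i)))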