def test_plot_influence(self, close_figures):
    """Check the figure and marker sizes produced by ``influence_plot``.

    For the default criterion and for ``criterion='DFFITS'`` the test
    asserts that a ``plt.Figure`` is returned and that the marker sizes are
    an exact affine function of the matching influence measure (Cook's
    distance, respectively ``|DFFITS|``): regressing the sizes on the
    measure plus a constant must leave a residual sum of squares below
    ``1e-12``.  An unknown criterion must raise ``ValueError``.

    ``close_figures`` is requested by name only and is not used in the body.
    NOTE(review): presumably a pytest fixture that closes figures after the
    test -- confirm against the test suite's conftest.
    """
    import warnings

    infl = self.res.get_influence()
    # (extra keyword arguments, measure the marker sizes must track)
    cases = [
        ({}, infl.cooks_distance[0]),
        ({'criterion': 'DFFITS'}, np.abs(infl.dffits[0])),
    ]

    for kwargs, measure in cases:
        fig = influence_plot(self.res, **kwargs)
        assert_equal(isinstance(fig, plt.Figure), True)

        # test that we have the correct criterion for sizes #3103
        # ``_sizes`` is a private matplotlib attribute, so only this lookup
        # is guarded; every case warns instead of skipping silently.
        try:
            sizes = fig.axes[0].get_children()[0]._sizes
        except AttributeError:
            warnings.warn('test not compatible with matplotlib version')
            continue

        ex = sm.add_constant(measure)
        ssr = sm.OLS(sizes, ex).fit().ssr
        assert_array_less(ssr, 1e-12)

    assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
def test_plot_influence(self):
    """Check the figure and marker sizes produced by ``influence_plot``.

    Runs the plot with the default criterion and with
    ``criterion='DFFITS'``.  Each call must return a ``plt.Figure`` whose
    marker sizes are an exact affine function of the matching influence
    measure (Cook's distance, respectively ``|DFFITS|``): an OLS fit of the
    sizes on the measure plus a constant must have a residual sum of squares
    below ``1e-12``.  An unknown criterion must raise ``ValueError``.

    Every figure is closed even when an assertion fails, so a failing run
    does not leave figures open for later tests.
    """
    import warnings

    infl = self.res.get_influence()
    # (extra keyword arguments, measure the marker sizes must track)
    cases = [
        ({}, infl.cooks_distance[0]),
        ({'criterion': 'DFFITS'}, np.abs(infl.dffits[0])),
    ]

    for kwargs, measure in cases:
        fig = influence_plot(self.res, **kwargs)
        try:
            assert_equal(isinstance(fig, plt.Figure), True)

            # test that we have the correct criterion for sizes #3103
            # ``_sizes`` is a private matplotlib attribute; only its lookup
            # is treated as a compatibility problem.
            try:
                sizes = fig.axes[0].get_children()[0]._sizes
            except AttributeError:
                warnings.warn('test not compatible with matplotlib version')
            else:
                ex = sm.add_constant(measure)
                ssr = sm.OLS(sizes, ex).fit().ssr
                assert_array_less(ssr, 1e-12)
        finally:
            plt.close(fig)

    assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
def plot_student_residual_leverage(self, res):
    """Answer to exercise 03-09, part (d): residual / leverage diagnostics.

    Draws the leverage vs. studentized residuals (influence) plot for the
    fitted result ``res`` so that outliers and high-leverage points can be
    read off the figure, then displays it.

    Parameters
    ----------
    res
        Fitted regression result passed straight to ``rp.influence_plot``.
    """
    # Markers are scaled by the DFFITS criterion with a size setting of 20.
    plot_options = {"criterion": "DFFITS", "size": 20}
    rp.influence_plot(res, **plot_options)
    plt.show()
def test_plot_influence(self):
    """Smoke-test ``influence_plot`` for the supported criteria.

    The plot is built once with the default criterion and once with
    ``criterion='DFFITS'``; each call must return a ``plt.Figure``.  An
    unknown criterion must raise ``ValueError``.

    Each figure is closed in a ``finally`` clause so that a failing
    assertion does not leave it open for later tests.
    """
    for kwargs in ({}, {'criterion': 'DFFITS'}):
        fig = influence_plot(self.res, **kwargs)
        try:
            assert_equal(isinstance(fig, plt.Figure), True)
        finally:
            plt.close(fig)

    assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
def plot_regress_analysis(model, influence=True, annotate=True):
    """Draw a 3x2 grid of regression diagnostic plots on a new figure.

    Panels, in drawing order: residuals vs fitted values, Q-Q plot of the
    Pearson residuals, scale-location plot, residuals vs leverage and,
    optionally, an influence plot spanning the bottom row.  The two
    scatter panels against fitted values are overlaid with a degree-2
    polynomial fit to reveal non-linearity.

    Parameters
    ----------
    model
        Fitted result exposing ``predict()``, ``resid`` and
        ``resid_pearson``.
    influence : bool
        When true, add the influence plot in the bottom row.
    annotate : bool
        Forwarded as ``annotate=`` to ``plot_leverage_resid2``.
        NOTE(review): that helper is called with the axes as second
        positional argument -- presumably a project-local wrapper; confirm.

    Returns
    -------
    None
        The plots are drawn on the current pyplot figure.
    """
    grid = (3, 2)
    plt.figure(figsize=(15, 16))

    fitted = model.predict()
    residuals = model.resid

    # x positions for the trend curves: unit steps from the smallest fitted
    # value, with the largest fitted value appended as the final point.
    max_fitted = np.max(fitted)
    xs = np.append(np.arange(np.min(fitted), max_fitted), max_fitted)

    def _quadratic_trend(values):
        """Evaluate the degree-2 polynomial fit of ``values`` on ``xs``."""
        return np.poly1d(np.polyfit(fitted, values, 2))(xs)

    # Residuals vs Fitted
    resid_ax = plt.subplot2grid(grid, (0, 0))
    resid_ax.set_title("Residuals vs Fitted")
    resid_ax.set_xlabel('Fitted values')
    resid_ax.set_ylabel('Residuals')
    resid_ax.plot(fitted, residuals, marker='.', linestyle='')
    resid_ax.plot(xs, _quadratic_trend(residuals), linewidth=2.5)

    # Q-Q plot
    qq_ax = plt.subplot2grid(grid, (0, 1))
    qq_ax.set_title("Q-Q")
    qqplot(model.resid_pearson, dist="norm", line='r', ax=qq_ax)

    # Scale-Location
    scale_ax = plt.subplot2grid(grid, (1, 0))
    scale_ax.set_title("Scale-Location")
    scale_ax.set_xlabel('Fitted values')
    scale_ax.set_ylabel('$|$Normalized residuals$|^{1/2}$')
    std_residuals = np.sqrt(np.abs(model.resid_pearson))
    scale_ax.plot(fitted, std_residuals, linestyle='', marker='.')
    scale_ax.plot(xs, _quadratic_trend(std_residuals), linewidth=2.5)

    # Residuals vs Leverage
    leverage_ax = plt.subplot2grid(grid, (1, 1))
    plot_leverage_resid2(model, leverage_ax, annotate=annotate)

    # Influence plot
    if not influence:
        return
    influence_ax = plt.subplot2grid(grid, (2, 0), colspan=2)
    influence_plot(model, ax=influence_ax)
# Residuals against each regressor, one diagnostic figure per column.
# Author's notes: 'RnD' shows a pattern ("pat"), 'Market' a slight one.
for regressor in ('RnD', 'Administration', 'Market', 'St_F', 'St_N'):
    sm.graphics.plot_regress_exog(model, regressor)

##### Deletion diagnostics #######
# Cook's distance for every observation (second tuple element is unused)
c, _ = model.get_influence().cooks_distance
fig = plt.subplots(figsize=(20, 7))
observation_index = np.arange(len(startup))
rounded_distance = np.round(c, 3)
plt.stem(observation_index, rounded_distance)
# Position and value of the largest Cook's distance.
# Recorded result: (49, 0.2639594358718258) - no outlier
(np.argmax(c), np.max(c))

# High influence points (author's note: observation 49 stands out)
influence_plot(model)

# Leverage cutoff 3 * (k + 1) / n from the shape of ``startup``
n = startup.shape[0]
k = startup.shape[1]
leverage_cutoff = ((k + 1) / n) * 3
leverage_cutoff

# Check the influencer/outlier row
startup[startup.index.isin([49])]
startup.head(10)

########## improve model #########
# Removing the categorical variable as it is highly insignificant and not
# adding any value
###### Iteration 1 ############
ipl_model_3.summary2()

"""*Residual Analysis in Multiple Regression* **P-P plot** """


def draw_pp_plot(model, title):
    """Show a P-P plot of ``model``'s residuals with a 45-degree line.

    Parameters
    ----------
    model
        Fitted result exposing ``resid``.
    title : str
        Title placed on the plot.
    """
    probplot = sm.ProbPlot(model.resid)
    # Create the axes up front and hand them to ``ppplot``: when no ``ax``
    # is given it opens a figure of its own, leaving the sized one empty.
    _, ax = plt.subplots(figsize=(8, 6))
    # The reference-line option must be exactly '45'; a padded string is
    # rejected as an unknown option.
    probplot.ppplot(line='45', ax=ax)
    ax.set_title(title)
    plt.show()


draw_pp_plot(ipl_model_3,
             "Figure - Normal P-P Plot of Regression Standardized Residuals")

# Leverage cutoff 3 * (k + 1) / n from the shape of the training matrix.
k = train_X.shape[1]
n = train_X.shape[0]
print("Number of Variables: ", k, " and number of observations: ", n)
leverage_cutoff = 3 * ((k + 1) / n)
print("cutoff for leverage value: ", round(leverage_cutoff, 3))

from statsmodels.graphics.regressionplots import influence_plot

# Influence plot drawn on explicitly sized axes.
fig, ax = plt.subplots(figsize=(8, 6))
influence_plot(ipl_model_3, ax=ax)
plt.title("Fig - Leverage Value vs Residuals")
plt.show()
# From the above plot, it is evident that data point 19 and 47 are the influencers # In[30]: #index and value of influencer where c is more than .5 (np.argmax(c), np.max(c)) # ## Since the value is <1 , we can stop the diagnostic process and finalize the model # # High Influence points # # In[31]: from statsmodels.graphics.regressionplots import influence_plot influence_plot(model) plt.show() # In[32]: k = data2.shape[1] n = data2.shape[0] leverage_cutoff = 3 * ((k + 1) / n) # In[33]: leverage_cutoff # In[34]: data2[data2.index.isin([19, 47])]
# Residual diagnostics against each regressor, one figure per column.
regressors = ['HP', 'cc', 'Doors', 'Gears', 'Quarterly_Tax', 'Weight']
for regressor in regressors:
    sm.graphics.plot_regress_exog(model, regressor)

##### Deletion diagnostics #######
# Cook's distance for every observation (second tuple element is unused)
c, _ = model.get_influence().cooks_distance
fig = plt.subplots(figsize=(20, 7))
observation_index = np.arange(len(toyota))
plt.stem(observation_index, np.round(c, 3))
# Position and value of the largest Cook's distance
(np.argmax(c), np.max(c))

# High influence points
influence_plot(model)

# Leverage cutoff 3 * (k + 1) / n from the shape of ``toyota``
n = toyota.shape[0]
k = toyota.shape[1]
leverage_cutoff = ((k + 1) / n) * 3
leverage_cutoff

# Check the influencer/outlier row
toyota[toyota.index.isin([78])]
toyota.head(10)

########## improve model #########
# Drop row 78, then rebuild the index and discard the 'index' column that
# reset_index() adds.
toyota1 = (
    toyota.drop([78])
    .reset_index()
    .drop(['index'], axis=1)
)

###### Iteration 1 ############
""" ■ トライ&エラーを補助してくれる可視化ツール 回帰分析にはトライ&エラーが付き物です。むしろ、ほとんどの経済現象は、線形式で完全に記述できるはずがありませんから、色々な回帰式を当てはめたり、サンプル期間変えたりして初めて、経済現象の全体像を掴むことができるのだと思います。 statsmodelsには、こうしたトライ&エラーをサポートする可視化ツールが付いています。以下のモジュールをimportしてみましょう。 """ # /// Graphical Diagnostic Tools /// --------------------------------- import statsmodels.graphics.regressionplots as regplot """ まず、以下のinfluence_plot()は、サンプルの中にはずれ値的な動きをした期間があるかを検出してくれます。 """ # Checking Outlier effect regplot.influence_plot(rlt) # Studentized Residual regplot.plot_leverage_resid2(rlt) # Leverage vs. resid^2 """ また、plot_regress_exog()は、個別の説明変数ごとに、誤差項と説明変数の関係や、他の要因をコントロールした上での当該変数の説明力を見る偏回帰プロットが表示されます。誤差や説明力が説明変数の値に連動して変化するなら、他変数からの影響や非線形性が現れていると考えられます。 """ # Selected exog vs. other things controlled endog plot regplot.plot_regress_exog(rlt, 1) N = DD.shape[0] x = DD['Pic'].values * 100 y = DD['GAP'].values * 100 radii = np.random.random(size=N) / 10 colors = [
'resid': model.resid, 'std_resids': model.resid_pearson, 'fitted': model.predict() }) # residual vs fitted value residvsfitted = plt.plot(residual_plot_var['fitted'], residual_plot_var['resid'], '+') l = plt.axhline(y=0, color='black', linestyle='dashed') plt.xlabel('Fitted_values') plt.ylabel('Residuals') plt.title('Residuals vs Fitted_value') plt.show(residvsfitted) # Q-Q Plot qqplot = sm.qqplot(residual_plot_var['std_resids'], line='s') plt.show(qqplot) # Scalelocation plot scalelocplot = plt.plot(residual_plot_var['fitted'], abs(residual_plot_var['std_resids'])**0.5, 'o') plt.xlabel('Fitted_values') plt.ylabel('Square Root of |standardized residuals|') plt.title('Scale-Location') plt.show(scalelocplot) # Residual vs leverage plot from statsmodels.graphics import regressionplots residsvlevplot = regressionplots.influence_plot(model, criterion='Cooks') plt.show(residsvlevplot)
回帰分析にはトライ&エラーが付き物です。むしろ、ほとんどの経済現象は、線形式で完全に記述できるはずがありませんから、色々な回帰式を当てはめたり、サンプル期間変えたりして初めて、経済現象の全体像を掴むことができるのだと思います。 statsmodelsには、こうしたトライ&エラーをサポートする可視化ツールが付いています。以下のモジュールをimportしてみましょう。 """ # /// Graphical Diagnostic Tools /// --------------------------------- import statsmodels.graphics.regressionplots as regplot """ まず、以下のinfluence_plot()は、サンプルの中にはずれ値的な動きをした期間があるかを検出してくれます。 """ # Checking Outlier effect regplot.influence_plot(rlt) # Studentized Residual regplot.plot_leverage_resid2(rlt) # Leverage vs. resid^2 """ また、plot_regress_exog()は、個別の説明変数ごとに、誤差項と説明変数の関係や、他の要因をコントロールした上での当該変数の説明力を見る偏回帰プロットが表示されます。誤差や説明力が説明変数の値に連動して変化するなら、他変数からの影響や非線形性が現れていると考えられます。 """ # Selected exog vs. other things controlled endog plot regplot.plot_regress_exog(rlt,1) N = DD.shape[0]
# Ordinary least squares fit of Employed on GNP, followed by its summary.
est = smf.ols(formula='Employed ~ GNP', data=df).fit()
print(est.summary())

# Split the data into the two plot axes.
y = df.Employed
x = df.GNP
# Add a constant column alongside GNP.
# NOTE(review): est was fitted through the formula interface, so predict()
# presumably only needs the GNP column and the constant is unused -- confirm.
x = sm.add_constant(x)

# Regression line: 100 evenly spaced GNP values between the observed minimum
# and maximum.
x_1 = pd.DataFrame({'GNP': np.linspace(x.GNP.min(), x.GNP.max(), 100)})
# Add the constant column to the prediction frame, as done for x.
x_1 = sm.add_constant(x_1)
# print(x_1)  (debug print, left disabled)
y_pron = est.predict(x_1)

plt.scatter(x.GNP, y, alpha=0.3)  # alpha sets the marker transparency
plt.ylim(30, 100)  # limit the y-range of the plot
plt.xlabel('PIB')
plt.ylabel('Tasas de Empleo')
plt.title('Ajuste de Regresion')
plt.plot(x_1.GNP, y_pron, 'r', alpha=0.9)
plt.savefig('../out/lineal_simple_gdp.png')
plt.show()

# Influence plot: its return value is saved to disk and then shown.
inf = influence_plot(est)
inf.savefig('../out/influencia.png')
inf.show()

# Descriptive statistics
# Leverage and residuals, influence plot, size of the circles