def test_plot_influence(self, close_figures):
        """Check the figure and marker sizes produced by ``influence_plot``.

        For the default criterion and for ``criterion='DFFITS'`` the test
        asserts that a matplotlib ``Figure`` is returned and, when the marker
        sizes can be read, that they are an exact affine function of the
        influence measure (regression test for #3103).  An unknown criterion
        must raise ``ValueError``.

        ``close_figures`` is requested but never referenced in the body;
        presumably it is a fixture that cleans up figures -- confirm against
        its definition.
        """
        import warnings

        infl = self.res.get_influence()

        def check_sizes(fig, measure):
            # Regressing the marker sizes on ``measure`` (plus a constant)
            # must leave essentially no residual.
            try:
                # ``_sizes`` is a private attribute.  Only this lookup is
                # guarded, so an AttributeError raised by the regression
                # below is not mistaken for a matplotlib incompatibility.
                sizes = fig.axes[0].get_children()[0]._sizes
            except AttributeError:
                # Report the skipped check instead of passing silently.
                warnings.warn('test not compatible with matplotlib version')
                return
            ex = sm.add_constant(measure)
            ssr = sm.OLS(sizes, ex).fit().ssr
            assert_array_less(ssr, 1e-12)

        fig = influence_plot(self.res)
        assert_equal(isinstance(fig, plt.Figure), True)
        # test that we have the correct criterion for sizes #3103
        check_sizes(fig, infl.cooks_distance[0])

        fig = influence_plot(self.res, criterion='DFFITS')
        assert_equal(isinstance(fig, plt.Figure), True)
        check_sizes(fig, np.abs(infl.dffits[0]))

        assert_raises(ValueError, influence_plot, self.res, criterion='unknown')
Esempio n. 2
0
    def test_plot_influence(self):
        """Check the figure and marker sizes produced by ``influence_plot``.

        For the default criterion and for ``criterion='DFFITS'`` the test
        asserts that a matplotlib ``Figure`` is returned and, when the marker
        sizes can be read, that they are an exact affine function of the
        influence measure (regression test for #3103).  Each figure is closed
        even when a check fails.  An unknown criterion must raise
        ``ValueError``.
        """
        import warnings

        infl = self.res.get_influence()

        def check_plot(fig, measure):
            assert_equal(isinstance(fig, plt.Figure), True)
            try:
                try:
                    # ``_sizes`` is a private attribute.  Only this lookup
                    # is guarded, so an AttributeError raised by the
                    # regression below is not silently absorbed.
                    sizes = fig.axes[0].get_children()[0]._sizes
                except AttributeError:
                    # Report the skipped check instead of passing silently.
                    warnings.warn(
                        'test not compatible with matplotlib version')
                else:
                    ex = sm.add_constant(measure)
                    ssr = sm.OLS(sizes, ex).fit().ssr
                    assert_array_less(ssr, 1e-12)
            finally:
                # Close the figure even if an assertion above failed.
                plt.close(fig)

        # test that we have the correct criterion for sizes #3103
        check_plot(influence_plot(self.res), infl.cooks_distance[0])
        check_plot(influence_plot(self.res, criterion='DFFITS'),
                   np.abs(infl.dffits[0]))

        assert_raises(ValueError,
                      influence_plot,
                      self.res,
                      criterion='unknown')
Esempio n. 3
0
 def plot_student_residual_leverage(self, res):
     """
     Answer to exercise 03-09 (d): the leverage vs. studentized-residuals plot.

     Draws ``rp.influence_plot`` for ``res`` using the DFFITS criterion and
     ``size=20``, then shows the figure so that outliers and high-leverage
     points can be inspected.
     """
     plot_options = {"criterion": "DFFITS", "size": 20}
     rp.influence_plot(res, **plot_options)
     plt.show()
Esempio n. 4
0
    def test_plot_influence(self):
        """Smoke-test ``influence_plot``.

        The default call and ``criterion='DFFITS'`` must each return a
        matplotlib ``Figure`` (closed right after the check); an unknown
        criterion must raise ``ValueError``.
        """
        for plot_kwargs in ({}, {'criterion': 'DFFITS'}):
            figure = influence_plot(self.res, **plot_kwargs)
            assert_equal(isinstance(figure, plt.Figure), True)
            plt.close(figure)

        assert_raises(ValueError,
                      influence_plot,
                      self.res,
                      criterion='unknown')
Esempio n. 5
0
def plot_regress_analysis(model, influence=True, annotate=True):
    """Draw a grid of regression diagnostic plots on a new pyplot figure.

    The 15x16 inch figure is laid out on a 3x2 grid: residuals vs. fitted
    values, a Q-Q plot of the Pearson residuals, a scale-location plot, a
    leverage plot and, optionally, an influence plot spanning the bottom
    row.  The residual and scale-location panels also show a quadratic fit
    as a visual check for non-linearity.

    Parameters
    ----------
    model
        Fitted regression results.  The function reads ``predict()``,
        ``resid`` and ``resid_pearson`` and passes the object on to
        ``qqplot``, ``plot_leverage_resid2`` and ``influence_plot``.
    influence : bool, default True
        Whether to draw the influence plot in the bottom row.
    annotate : bool, default True
        Forwarded to ``plot_leverage_resid2`` as ``annotate``.

    Returns
    -------
    None
    """
    grid_shape = (3, 2)
    plt.figure(figsize=(15, 16))

    # Panel (0, 0): residuals against fitted values.
    resid_ax = plt.subplot2grid(grid_shape, (0, 0))
    resid_ax.set_title("Residuals vs Fitted")
    resid_ax.set_xlabel('Fitted values')
    resid_ax.set_ylabel('Residuals')
    fitted_vals = model.predict()
    raw_resid = model.resid
    resid_ax.plot(fitted_vals, raw_resid, marker='.', linestyle='')

    # Quadratic fit of the residuals, evaluated on a unit-step grid over the
    # fitted range with the maximum appended so the curve reaches the edge.
    resid_curve = np.poly1d(np.polyfit(fitted_vals, raw_resid, 2))
    upper = np.max(fitted_vals)
    curve_x = np.append(np.arange(np.min(fitted_vals), upper), upper)
    resid_ax.plot(curve_x, resid_curve(curve_x), linewidth=2.5)

    # Panel (0, 1): Q-Q plot of the Pearson residuals.
    qq_ax = plt.subplot2grid(grid_shape, (0, 1))
    qq_ax.set_title("Q-Q")
    qqplot(model.resid_pearson, dist="norm", line='r', ax=qq_ax)

    # Panel (1, 0): scale-location.
    scale_ax = plt.subplot2grid(grid_shape, (1, 0))
    scale_ax.set_title("Scale-Location")
    scale_ax.set_xlabel('Fitted values')
    scale_ax.set_ylabel('$|$Normalized residuals$|^{1/2}$')
    root_abs_resid = np.sqrt(np.abs(model.resid_pearson))
    scale_ax.plot(fitted_vals, root_abs_resid, linestyle='', marker='.')

    # Quadratic fit for the scale-location panel, on the same x grid.
    scale_curve = np.poly1d(np.polyfit(fitted_vals, root_abs_resid, 2))
    scale_ax.plot(curve_x, scale_curve(curve_x), linewidth=2.5)

    # Panel (1, 1): residuals vs. leverage.
    leverage_ax = plt.subplot2grid(grid_shape, (1, 1))
    plot_leverage_resid2(model, leverage_ax, annotate=annotate)

    # Bottom row (both columns): influence plot, when requested.
    if influence:
        influence_ax = plt.subplot2grid(grid_shape, (2, 0), colspan=2)
        influence_plot(model, ax=influence_ax)
Esempio n. 6
0
# Residuals versus each regressor of `model`, one call per column name.
sm.graphics.plot_regress_exog(model, 'RnD')  # author's note "pat": presumably a pattern is visible
sm.graphics.plot_regress_exog(model, 'Administration')
sm.graphics.plot_regress_exog(model, 'Market')  # author's note "slight pat": presumably a slight pattern
sm.graphics.plot_regress_exog(model, 'St_F')
sm.graphics.plot_regress_exog(model, 'St_N')

##### Deletion diagnostics #######

# Cook's distance: keep the first element of `cooks_distance`, discard the
# second.
(c, _) = model.get_influence().cooks_distance
# NOTE: plt.subplots returns a (figure, axes) pair, so `fig` holds that tuple.
fig = plt.subplots(figsize=(20, 7))
# Stem plot of the distances (rounded to 3 decimals) against row position.
plt.stem(np.arange(len(startup)), np.round(c, 3))
# Position and value of the largest distance.  As a bare expression this is
# only displayed in an interactive session; a plain script discards it.
(np.argmax(c), np.max(c))  # author's recorded result: (49, 0.2639594358718258) - no outlier
# High-influence points
influence_plot(model)  # author's note: observation 49
# Leverage cut-off 3 * (k + 1) / n, with k and n taken from the column and
# row counts of `startup`.
# NOTE(review): k counts every column of `startup`; if the response column
# is among them, k overstates the number of predictors -- confirm.
k = startup.shape[1]
n = startup.shape[0]
leverage_cutoff = 3 * ((k + 1) / n)
leverage_cutoff

# Check the influencer/outlier data: the row whose index label is 49, then
# the first ten rows for comparison.
startup[startup.index.isin([49])]
startup.head(10)

########## improve model #########

# Author's plan: remove the categorical variable because it is highly
# insignificant and adds no value.
# NOTE(review): no statement in this fragment performs that removal.

###### Iteration 1 ############
# NOTE(review): `ipl_model_3` is not defined in the visible fragment.
ipl_model_3.summary2()
"""*Residual Analysis in Multiple Regression*

**P-P plot**
"""


def draw_pp_plot(model, title):
    """Show a P-P plot of the residuals of ``model``.

    Parameters
    ----------
    model
        Fitted model exposing a ``resid`` attribute.
    title : str
        Title placed above the plot.
    """
    probplot = sm.ProbPlot(model.resid)
    # Create the 8x6 inch axes first and hand them to ppplot.  Without an
    # explicit ``ax`` ppplot opens a figure of its own, which leaves the
    # sized figure blank.
    _, ax = plt.subplots(figsize=(8, 6))
    # The reference-line option must be exactly '45'; the previous value had
    # a trailing space, which is not one of the accepted options.
    probplot.ppplot(line='45', ax=ax)
    ax.set_title(title)
    plt.show()


# P-P plot of the residuals of `ipl_model_3`.
draw_pp_plot(ipl_model_3,
             "Figure - Normal P-P Plot of Regression Standardized Residuals")

# k = column count and n = row count of `train_X` (presumably the training
# predictors -- confirm where `train_X` is built).
k = train_X.shape[1]
n = train_X.shape[0]

print("Number of Variables: ", k, " and number of observations: ", n)

# Leverage cut-off 3 * (k + 1) / n, printed rounded to three decimals.
leverage_cutoff = 3 * ((k + 1) / n)
print("cutoff for leverage value: ", round(leverage_cutoff, 3))

# Influence plot drawn on explicitly created 8x6 inch axes, then titled and
# shown.
from statsmodels.graphics.regressionplots import influence_plot
fig, ax = plt.subplots(figsize=(8, 6))
influence_plot(ipl_model_3, ax=ax)
plt.title("Fig - Leverage Value vs Residuals")
plt.show()
Esempio n. 8
0
# Author's reading of a plot produced outside this fragment: data points 19
# and 47 are the influencers.

# In[30]:

# Index and value of the largest entry of `c`.
# NOTE(review): the original comment said "where c is more than .5", but the
# expression only reports the maximum; no 0.5 threshold is applied here.
(np.argmax(c), np.max(c))

# ## Author's conclusion: since the value is < 1, the diagnostic process can
# ## stop and the model can be finalized.

# # High influence points
#

# In[31]:

# Draw and show the influence plot for `model`.
from statsmodels.graphics.regressionplots import influence_plot
influence_plot(model)
plt.show()

# In[32]:

# Leverage cut-off 3 * (k + 1) / n, with k and n taken from the column and
# row counts of `data2`.
# NOTE(review): k counts every column of `data2`; if the response column is
# among them, k overstates the number of predictors -- confirm.
k = data2.shape[1]
n = data2.shape[0]
leverage_cutoff = 3 * ((k + 1) / n)

# In[33]:

# Bare expression: displayed in a notebook cell, discarded by a plain script.
leverage_cutoff

# In[34]:

# Rows of `data2` whose index label is 19 or 47 (the points named above).
data2[data2.index.isin([19, 47])]
Esempio n. 9
0
# Residuals versus each regressor of `model`, one call per column name.
sm.graphics.plot_regress_exog(model,'HP')
sm.graphics.plot_regress_exog(model,'cc')
sm.graphics.plot_regress_exog(model,'Doors')
sm.graphics.plot_regress_exog(model,'Gears')
sm.graphics.plot_regress_exog(model,'Quarterly_Tax')
sm.graphics.plot_regress_exog(model,'Weight')

##### Deletion diagnostics #######

# Cook's distance: keep the first element of `cooks_distance`, discard the
# second.
(c, _)=model.get_influence().cooks_distance
# NOTE: plt.subplots returns a (figure, axes) pair, so `fig` holds that tuple.
fig = plt.subplots(figsize=(20, 7))
# Stem plot of the distances (rounded to 3 decimals) against row position.
plt.stem(np.arange(len(toyota)), np.round(c, 3)) 
# Position and value of the largest distance.  As a bare expression this is
# only displayed in an interactive session; a plain script discards it.
(np.argmax(c),np.max(c))
# High-influence points
influence_plot(model)
# Leverage cut-off 3 * (k + 1) / n, with k and n taken from the column and
# row counts of `toyota`.
# NOTE(review): k counts every column of `toyota`; if the response column is
# among them, k overstates the number of predictors -- confirm.
k = toyota.shape[1]
n = toyota.shape[0]
leverage_cutoff = 3*((k + 1)/n)
leverage_cutoff

# Check the influencer/outlier data: the row whose index label is 78, then
# the first ten rows for comparison.
toyota[toyota.index.isin([78])] 
toyota.head(10)

########## improve model #########

# Drop row 78 and renumber the rows.  reset_index() keeps the old labels in
# a new 'index' column, which the second statement removes again.
toyota1=toyota.drop([78]).reset_index()
toyota1=toyota1.drop(['index'],axis=1)

###### Iteration 1 ############
Esempio n. 10
0
"""
■ トライ&エラーを補助してくれる可視化ツール

回帰分析にはトライ&エラーが付き物です。むしろ、ほとんどの経済現象は、線形式で完全に記述できるはずがありませんから、色々な回帰式を当てはめたり、サンプル期間変えたりして初めて、経済現象の全体像を掴むことができるのだと思います。

statsmodelsには、こうしたトライ&エラーをサポートする可視化ツールが付いています。以下のモジュールをimportしてみましょう。

"""

# /// Graphical Diagnostic Tools /// ---------------------------------
# Gist of the Japanese introduction that precedes this section: regression
# analysis involves trial and error (trying different specifications and
# sample periods), and statsmodels ships visualization tools that support
# this; the module below is imported for that purpose.
import statsmodels.graphics.regressionplots as regplot
# English gloss of the Japanese note in the string below: influence_plot()
# helps detect whether the sample contains periods that behaved like
# outliers.
"""
まず、以下のinfluence_plot()は、サンプルの中にはずれ値的な動きをした期間があるかを検出してくれます。
"""
# Checking Outlier effect
regplot.influence_plot(rlt)  # Studentized Residual
regplot.plot_leverage_resid2(rlt)  # Leverage vs. resid^2
# English gloss of the Japanese note in the string below: for a single
# explanatory variable, plot_regress_exog() shows the relation between the
# error term and that variable, plus partial-regression plots of its
# explanatory power with the other factors controlled for.  If errors or
# explanatory power move with the variable's value, influence from other
# variables or non-linearity is likely.
"""
また、plot_regress_exog()は、個別の説明変数ごとに、誤差項と説明変数の関係や、他の要因をコントロールした上での当該変数の説明力を見る偏回帰プロットが表示されます。誤差や説明力が説明変数の値に連動して変化するなら、他変数からの影響や非線形性が現れていると考えられます。
"""

# Selected exog vs. other things controlled endog plot
# (the second argument, 1, selects the explanatory variable to plot)
regplot.plot_regress_exog(rlt, 1)

# Number of rows in `DD`.
N = DD.shape[0]

# The 'Pic' and 'GAP' columns as arrays, multiplied by 100
# (presumably a conversion to percent -- confirm).
x = DD['Pic'].values * 100
y = DD['GAP'].values * 100

# One random radius per row, uniform on [0, 0.1).
radii = np.random.random(size=N) / 10
colors = [
Esempio n. 11
0
    'resid': model.resid,
    'std_resids': model.resid_pearson,
    'fitted': model.predict()
})

# Residuals vs. fitted values
residvsfitted = plt.plot(residual_plot_var['fitted'],
                         residual_plot_var['resid'], '+')
# Dashed horizontal line at zero as a visual reference for the residuals.
l = plt.axhline(y=0, color='black', linestyle='dashed')
plt.xlabel('Fitted_values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted_value')
# pyplot.show() renders every open figure and takes no artists: its only
# parameter is the keyword `block`.  The plotted objects used to be passed
# positionally, which current Matplotlib backends reject.
plt.show()

# Q-Q plot of the standardized residuals (line='s': standardized line)
qqplot = sm.qqplot(residual_plot_var['std_resids'], line='s')
plt.show()

# Scale-location plot: square root of |standardized residuals| vs. fitted
scalelocplot = plt.plot(residual_plot_var['fitted'],
                        abs(residual_plot_var['std_resids'])**0.5, 'o')
plt.xlabel('Fitted_values')
plt.ylabel('Square Root of |standardized residuals|')
plt.title('Scale-Location')
plt.show()

# Residuals vs. leverage: influence plot with the 'Cooks' criterion
from statsmodels.graphics import regressionplots
residsvlevplot = regressionplots.influence_plot(model, criterion='Cooks')
plt.show()
Esempio n. 12
0
回帰分析にはトライ&エラーが付き物です。むしろ、ほとんどの経済現象は、線形式で完全に記述できるはずがありませんから、色々な回帰式を当てはめたり、サンプル期間変えたりして初めて、経済現象の全体像を掴むことができるのだと思います。

statsmodelsには、こうしたトライ&エラーをサポートする可視化ツールが付いています。以下のモジュールをimportしてみましょう。

"""


# /// Graphical Diagnostic Tools /// ---------------------------------
# Gist of the Japanese introduction that precedes this section: regression
# analysis involves trial and error (trying different specifications and
# sample periods), and statsmodels ships visualization tools that support
# this; the module below is imported for that purpose.
import statsmodels.graphics.regressionplots as regplot


# English gloss of the Japanese note in the string below: influence_plot()
# helps detect whether the sample contains periods that behaved like
# outliers.
"""
まず、以下のinfluence_plot()は、サンプルの中にはずれ値的な動きをした期間があるかを検出してくれます。
"""
# Checking Outlier effect
regplot.influence_plot(rlt) # Studentized Residual
regplot.plot_leverage_resid2(rlt) # Leverage vs. resid^2


# English gloss of the Japanese note in the string below: for a single
# explanatory variable, plot_regress_exog() shows the relation between the
# error term and that variable, plus partial-regression plots of its
# explanatory power with the other factors controlled for.  If errors or
# explanatory power move with the variable's value, influence from other
# variables or non-linearity is likely.
"""
また、plot_regress_exog()は、個別の説明変数ごとに、誤差項と説明変数の関係や、他の要因をコントロールした上での当該変数の説明力を見る偏回帰プロットが表示されます。誤差や説明力が説明変数の値に連動して変化するなら、他変数からの影響や非線形性が現れていると考えられます。
"""

# Selected exog vs. other things controlled endog plot
# (the second argument, 1, selects the explanatory variable to plot)
regplot.plot_regress_exog(rlt,1)




# Number of rows in `DD`.
N = DD.shape[0]
Esempio n. 13
0
# Fit Employed ~ GNP with the formula interface and print the summary.
est = smf.ols(formula='Employed ~ GNP', data=df).fit()
print(est.summary())
# Ordinary least squares analysis
# Split the data into response (y) and predictor (x)
y = df.Employed
x = df.GNP
x = sm.add_constant(x)
# Author's note (translated): the constant is added to be used as a
# multiplicative value; predict needs it to know how many times to recompute.
# NOTE(review): `est` was fitted through a formula, so it is unclear that
# predict needs this column -- confirm.
# Regression
# 100 evenly spaced GNP values between the observed minimum and maximum.
x_1 = pd.DataFrame({'GNP': np.linspace(x.GNP.min(), x.GNP.max(), 100)})
# Author's note (translated): to get intervals, the constant is needed as in
# the original dataframe
x_1 = sm.add_constant(x_1)
# Author's note (translated): build a dataframe with the GNP data so it can
# be used for prediction
# print(x_1)
y_pron = est.predict(x_1)
plt.scatter(x.GNP, y, alpha=0.3)  # alpha sets the marker transparency
plt.ylim(30, 100)  # restrict the y-axis range of the plot
plt.xlabel('PIB')
plt.ylabel('Tasas de Empleo')
plt.title('Ajuste de Regresion')
# Fitted line over the evenly spaced GNP grid, drawn in red.
plt.plot(x_1.GNP, y_pron, 'r', alpha=0.9)
plt.savefig('../out/lineal_simple_gdp.png')
plt.show()
# Influence plot of the fitted model; the returned object is saved to disk
# and then shown.
inf = influence_plot(est)
inf.savefig('../out/influencia.png')
inf.show()

# Descriptive statistics
# Leverage and residuals, influence plot, size of the circles