# ## Multicollinearity
#
# Condition number:

np.linalg.cond(results.model.exog)
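
# The condition number is sensitive to column scaling; as a hedged sketch,
# normalizing each column of the design matrix first gives a scale-free
# value (`norm_exog` is an illustrative name, not from the original):

norm_exog = results.model.exog / np.linalg.norm(results.model.exog, axis=0)
np.linalg.cond(norm_exog)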


# ## Heteroskedasticity tests
# 
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)
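
# A small p-value here is evidence of heteroskedasticity, i.e. residual
# variance that depends on the regressors.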


# Goldfeld-Quandt test

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)
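
# Goldfeld-Quandt splits the ordered sample and compares residual variances
# across the two halves; a small p-value again points to heteroskedasticity.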


# ## Linearity
# 
# Harvey-Collier multiplier test for the null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)

# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Exporting results to Excel
#
# This block creates the objects for each statistic we'd like printed to Excel.
#
# It will report $R^2$, adjusted $R^2$, the residuals, the F-test p-value, the AIC, the fitted model parameters and their p-values, normality of the residuals (Jarque-Bera), the Breusch-Pagan test for heteroscedasticity, and the Harvey-Collier test for linearity.

import pandas as pd
import statsmodels.stats.api as sms
from xlwings import Range

# `model` is a previously fitted OLS results object.

r_squared = model.rsquared
r_square_adj = model.rsquared_adj
residuals = model.resid
p = model.f_pvalue
aic = model.aic
pvalues = pd.DataFrame(model.pvalues)
params = pd.DataFrame(model.params)
normality = sms.jarque_bera(model.resid)
breusch_pagan_hska = sms.het_breuschpagan(model.resid, model.model.exog)
harvey_collier = sms.linear_harvey_collier(model)

# #### Print the regression results to Excel

Range("Results", "O6").value = "R^2"
Range("Results", "P6").value = r_squared

Range("Results", "O7").value = "R^2 Adjusted"
Range("Results", "P7").value = r_square_adj

Range("Results", "O8").value = "p-value"
Range("Results", "P8").value = p

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict


# Assumes X_train, y_train, X_test, y_test exist in the enclosing scope
# (e.g. from sklearn.model_selection.train_test_split).
def check_rmse(df, response, list_of_list):
    # each element of list_of_list is a list of predictor column names
    for cols in list_of_list:
        print('\nUsing scikit:\n')
        # instantiate model
        lm = LinearRegression()
        # fit model
        lm.fit(X_train[cols], y_train)
        # make predictions
        y_pred = lm.predict(X_test[cols])
        print('MAE:', mean_absolute_error(y_test, y_pred))
        print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print('R-Squared:', r2_score(y_test, y_pred))
        print('Score', lm.score(X_test[cols], y_test))
        # normality check
        residual = y_test - y_pred
        sm.qqplot(residual, stats.distributions.norm, line='r')
        #plt.show()

        print('\nUsing statsmodels:\n')
        formula = "{0} ~ {1}".format(response, '+'.join(cols))
        print(formula)
        model = smf.ols(formula, data=df).fit()
        y_pred = model.predict(X_test[cols])
        print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print(model.params)
        print(model.summary())
        print('\nCHECKING RESIDUALS\n')
        residual = y_test - y_pred
        fig, ax = plt.subplots(1,2)
        # axes are in a two-dimensional array, indexed by [row, col]
        ax[0].hist(model.resid_pearson)
        ax[0].set_ylabel('Count')
        ax[0].set_xlabel('Normalized residuals')
        ax[1].scatter(residual, y_pred)

        # normality check
        res = model.resid
        sm.qqplot(res, stats.distributions.norm, line='r')
        #plt.show()

        # linearity check
        ##The Harvey-Collier test performs a t-test (with parameter degrees of freedom) on the recursive residuals.
        ##If the true relationship is not linear but convex or concave, the mean of the recursive residuals should differ from 0 significantly.
        try:
            harvey_collier = sms.linear_harvey_collier(model)
            print(harvey_collier)
        except np.linalg.LinAlgError as e:
            print(e)

        ## equal variance check (Breusch-Pagan)
        ### a small p-value indicates a violation of homoscedasticity
        _, pval, __, f_pval = het_breuschpagan(residual, sm.add_constant(X_test[cols]))
        print('Pval', pval)
        print('f_pval', f_pval)

        ## correlation analysis
        # Compute matrix of correlation coefficients
        corr_matrix = np.corrcoef(df[cols].T)
        print(pd.DataFrame(corr_matrix))
        ### https://matplotlib.org/gallery/images_contours_and_fields/image_annotated_heatmap.html
        # Display heat map
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.imshow(corr_matrix)
        ax.set_title('Heatmap of correlation matrix')
        # Show all ticks and label them with the respective list entries.
        ax.set_xticks(np.arange(len(cols)))
        ax.set_yticks(np.arange(len(cols)))
        ax.set_xticklabels(cols)
        ax.set_yticklabels(cols)
        # Rotate the tick labels and set their alignment (must come after
        # set_xticks, which replaces the tick label objects).
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")
        # Loop over data dimensions and create text annotations.
        for i in range(len(cols)):
            for j in range(len(cols)):
                text = ax.text(j, i, round(corr_matrix[i, j], 2),
                       ha="center", va="center", color="black")
        plt.show()


        print("\nCross validation using scikit:\n")
        # check cross validation predictions, 10 splits
        cv_predictions = cross_val_predict(lm, df[cols], df[response], cv=10)
        # check errors
        print('MAE:', mean_absolute_error(df[response], cv_predictions))
        print('RMSE:', np.sqrt(mean_squared_error(df[response], cv_predictions)))
        print('R-Squared:', r2_score(df[response],cv_predictions))
        #print('Score', accuracy_score(df[response], cv_predictions))
        # plot actual vs predicted
        fig3 = plt.figure(figsize=(6,6))
        plt.scatter(x=cv_predictions, y=df[response])
        plt.xlabel('Predictions')
        plt.ylabel('Appliances')
        df['predictions'] = cv_predictions
        # plotting fit results
        fig4, ax = plt.subplots()
        df.Appliances.plot(ax=ax, style='b-')
        # same ax as above since it's automatically added on the right
        df.predictions.plot(ax=ax, style='r-')
        ax.set_xlabel('Predictions')
        ax.set_ylabel('Appliances')
        #print(df.head())
        #df.plot(y=cv_predictions, color='red', linewidth=1)
        #fig4.tight_layout()
        #plt.show()
        print('\nCHECKING RESIDUALS\n')
        residual = df[response] - cv_predictions
        # axes are in a two-dimensional array, indexed by [row, col]
        fig4 = plt.figure(figsize=(6,6))
        plt.scatter(df.predictions, residual)
        plt.hlines(y=0, xmin=df.predictions.min(), xmax=df.predictions.max())
        plt.title('Residual Plot')
        plt.ylabel('Residuals')
        plt.show()
# Example 5
# (assumes a fitted OLS results object `lr`, plus `sms` and `lzip` as above)
name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
lzip(name3, test3)

#======================Goldfeld-Quandt test:
name5 = ['F statistic', 'p-value']
test5 = sms.het_goldfeldquandt(lr.resid, lr.model.exog)
lzip(name5, test5)


#================================================
#================================================
#==================================Linearity test
#======================Harvey-Collier:
name6 = ['t value', 'p value']
test6 = sms.linear_harvey_collier(lr)
lzip(name6, test6)

#======================Rainbow test:
import statsmodels.stats.diagnostic as ssd
name7 = ['F statistic', 'p value']
test7 = ssd.linear_rainbow(lr)
lzip(name7, test7)

#================================================
#================================================
#====Serial correlation (or) Autocorrelation test
#======================Durbin_watson:
#Durbin-Watson test for no autocorrelation of residuals
#printed with summary()
from statsmodels.stats.stattools import durbin_watson
print("Durbin-Watson: ", durbin_watson(lr.resid))
# Example 6
def harveyCollier(results):
    # Harvey-Collier t-test for the null of a correct linear specification.
    name = ['t value', 'p value']
    test = sms.linear_harvey_collier(results)
    return lzip(name, test)
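
# A minimal usage sketch (the data and model below are assumptions):
# results = sm.OLS(y, sm.add_constant(X)).fit()
# print(harveyCollier(results))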
# Example 7
# (a method from a larger model-validation class; the `self.check_*` helpers
# and imports such as `json`, `np`, `sms`, and `lzip` are defined elsewhere)
    def diagnostic_plots(self, linear_model):
        """
        :param linear_model: Linear Model Fit on the Data
        :return: None
        This method validates the assumptions of Linear Model
        """
        diagnostic_result = {}

        summary = linear_model.summary()
        #diagnostic_result['summary'] = str(summary)

        # fitted values
        fitted_y = linear_model.fittedvalues
        # model residuals
        residuals = linear_model.resid

        # normalized residuals
        residuals_normalized = linear_model.get_influence().resid_studentized_internal

        # absolute squared normalized residuals
        model_norm_residuals_abs_sqrt = np.sqrt(np.abs(residuals_normalized))

        # leverage, from statsmodels internals
        leverage = linear_model.get_influence().hat_matrix_diag

        # cook's distance, from statsmodels internals
        cooks = linear_model.get_influence().cooks_distance[0]

        self.check_linearity_assumption(fitted_y, residuals)

        self.check_residual_normality(residuals_normalized)

        self.check_homoscedacticity(fitted_y, model_norm_residuals_abs_sqrt)

        self.check_influcence(leverage, cooks, residuals_normalized)

        # 1. Non-Linearity Test
        try:
            name = ['t value', 'p value']
            test = sms.linear_harvey_collier(linear_model)
            linear_test_result = lzip(name, test)
        except Exception as e:
            linear_test_result = str(e)
        diagnostic_result['Non_Linearity_Test'] = linear_test_result

        # 2. Heteroskedasticity Test
        name = ['Lagrange multiplier statistic', 'p-value',
                'f-value', 'f p-value']
        test = sms.het_breuschpagan(linear_model.resid, linear_model.model.exog)
        test_val = lzip(name, test)
        diagnostic_result['Heteroskedasticity_Test'] = test_val

        # 3. Normality of Residuals
        name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
        test = sms.jarque_bera(linear_model.resid)
        test_val = lzip(name, test)
        diagnostic_result['Residual_Normality_Test'] = test_val

        # 4. Multicollinearity Test
        test = np.linalg.cond(linear_model.model.exog)
        test_val = [('condition no', test)]
        diagnostic_result['Multicollinearity_Test'] = test_val

        # 5. Residuals Auto-Correlation Test
        # durbin_watson returns the test statistic, not a p-value.
        test = sms.durbin_watson(linear_model.resid)
        test_val = [('statistic', test)]
        diagnostic_result['Residual_AutoCorrelation_Test'] = test_val

        json_result = json.dumps(diagnostic_result)
        return summary, json_result
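
# A hedged usage sketch; the `validator` instance and model below are
# assumptions based on the method signature above:
# model = smf.ols("y ~ x1 + x2", data=df).fit()
# summary, json_result = validator.diagnostic_plots(model)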