def goldfeld_quandt(dataframe, target, model, ax=None):
    """Plot residuals against ``target`` and run a Goldfeld-Quandt test.

    Observations are ordered by ``target``. Those whose target value lies
    between the 45th and 55th percentile (inclusive) are left out; the
    remaining observations are plotted and handed to
    ``sms.het_goldfeldquandt``.

    Args:
        dataframe: pandas DataFrame containing the ``target`` column.
            Assumed to be row-aligned with ``model.resid`` and
            ``model.model.exog`` (row i of each describes the same
            observation) -- TODO confirm against callers.
        target: name of the column to order the observations by.
        model: fitted statsmodels results object exposing ``resid``
            (pandas Series) and ``model.exog`` (array).
        ax: matplotlib Axes to draw on. When omitted, a new 6x6 inch figure
            is created and shown.

    Returns:
        One-row DataFrame indexed by 'Goldfeld-Quandt' with the columns
        'F_statistic' and 'p_value'.
    """
    text_color = plt.rcParams.get('ytick.color')

    # Row positions in ascending order of the target. The same permutation is
    # applied to the target, the residuals and the exog matrix so that the
    # three stay aligned after sorting (mergesort keeps ties stable).
    order = dataframe[target].values.argsort(kind='mergesort')
    sorted_target = dataframe[target].iloc[order].reset_index(drop=True)

    lwr_thresh = sorted_target.quantile(q=.45)
    upr_thresh = sorted_target.quantile(q=.55)
    in_middle = (sorted_target >= lwr_thresh) & (sorted_target <= upr_thresh)
    keep = ~in_middle.values

    # Positions (in the original row order) of the observations that survive,
    # listed in ascending order of the target.
    kept_rows = order[keep]
    kept_target = sorted_target[keep]
    kept_resid = model.resid.iloc[kept_rows]
    kept_exog = model.model.exog[kept_rows]

    # Remember whether the figure is ours: once `ax` is assigned below it can
    # no longer tell us whether the caller supplied one.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(6, 6))

    ax.scatter(kept_target, kept_resid)
    ax.set_xlabel(target, color=text_color)
    ax.set_ylabel('Model Residuals', color=text_color)
    ax.set_title("Residuals versus {}".format(target), color=text_color)
    # Dotted guides mark the band of target values that was left out.
    ax.axvline(x=lwr_thresh, ls=':', linewidth=2, color='gray')
    ax.axvline(x=upr_thresh, ls=':', linewidth=2, color='gray')
    if created_figure:
        plt.show()

    # het_goldfeldquandt does not sort unless `idx` is given, so it must
    # receive rows that are already ordered by the variable of interest.
    test = sms.het_goldfeldquandt(kept_resid, kept_exog)

    results = pd.DataFrame(index=['Goldfeld-Quandt'],
                           columns=['F_statistic', 'p_value'])
    results.loc['Goldfeld-Quandt', 'F_statistic'] = test[0]
    results.loc['Goldfeld-Quandt', 'p_value'] = test[1]
    return results
def homoscedasticity_test(model):
    '''
    Diagnose heteroscedasticity in the residuals of a fitted linear model.

    Draws two panels side by side -- residuals vs. fitted values and a
    scale-location plot (square root of the absolute internally studentized
    residuals vs. fitted values), each with a red lowess line -- and then
    prints the Breusch-Pagan and Goldfeld-Quandt test tables.

    Args:
    * model - fitted OLS model from statsmodels
    '''
    predicted = model.predict()
    residuals = model.resid
    studentized = model.get_influence().resid_studentized_internal

    _, axes = plt.subplots(1, 2)

    def draw_panel(axis, y_values, title, y_label):
        # One diagnostic panel: scatter + lowess smoother, then labelling.
        sns.regplot(x=predicted, y=y_values, lowess=True, ax=axis,
                    line_kws={'color': 'red'})
        axis.set_title(title, fontsize=16)
        axis.set(xlabel='Fitted Values', ylabel=y_label)

    draw_panel(axes[0], residuals, 'Residuals vs Fitted', 'Residuals')
    draw_panel(axes[1], np.sqrt(np.abs(studentized)), 'Scale-Location',
               'sqrt(abs(Residuals))')

    bp_table = pd.DataFrame(
        sms.het_breuschpagan(residuals, model.model.exog),
        columns=['value'],
        index=['Lagrange multiplier statistic', 'p-value',
               'f-value', 'f p-value'],
    )
    # The last element returned by het_goldfeldquandt is dropped; only the
    # F statistic and its p-value are tabulated.
    gq_outcome = sms.het_goldfeldquandt(residuals, model.model.exog)
    gq_table = pd.DataFrame(
        gq_outcome[:-1],
        columns=['value'],
        index=['F statistic', 'p-value'],
    )

    reports = (
        ('\n Breusch-Pagan test ----', bp_table),
        ('\n Goldfeld-Quandt test ----', gq_table),
    )
    for heading, table in reports:
        print(heading)
        print(table)
    print('\n Residuals plots ----')
def compute(self, residual, significance, test_name, parameters, x=None):
    """Run the Goldfeld-Quandt constant-variance check and print the verdict.

    See statsmodels.stats.api.het_goldfeldquandt for details of the test.

    Args:
        residual: the residual data derived from your dataset
        significance: threshold the p-value is compared against
            (typically 0.05).
        test_name: label prepended to the printed message; it does not
            affect the computation.
        parameters: dict of extra keyword arguments forwarded to
            "het_goldfeldquandt"
        x: the X_test of your dataset

    Returns:
        None
    """
    outcome = het_goldfeldquandt(residual, x, **parameters)
    p_value = outcome[1]

    # A p-value above the significance level is reported as homoscedastic.
    if p_value > significance:
        verdict = ': Good. The residuals have constant variance. (homoscedastic) p value: '
    else:
        verdict = ': Bad. The residuals do not have constant variance. (heteroscedastic) p value: '

    print(test_name + verdict + str(p_value))
def goldfeld_quandt(dataframe, target, model, ax=None, alternative='two-sided'):
    """Plot residuals against predictions and run a Goldfeld-Quandt test.

    The test is run on the model's own endog/exog arrays, sorted together by
    the endog values, with ``split=0.45`` and ``drop=0.10`` (the middle 10%
    of the sorted observations is excluded).

    Args:
        dataframe: pandas DataFrame holding the feature columns (every
            column except ``target``); used only to compute the predictions
            that are plotted.
        target: name used for the endog column and for the axis labels.
        model: fitted statsmodels results object exposing ``predict``,
            ``resid``, ``model.exog`` and ``model.endog``.
        ax: matplotlib Axes to draw on. When omitted, a new 6x6 inch figure
            is created and shown.
        alternative: passed through to ``sms.het_goldfeldquandt``.

    Returns:
        One-row DataFrame indexed by 'Goldfeld-Quandt' with the columns
        'F_statistic' and 'p_value'.
    """
    text_color = plt.rcParams.get('ytick.color')

    # Put endog and exog side by side so that a single sort keeps each
    # observation's response and regressors together.
    exog = pd.DataFrame(model.model.exog)
    endog = pd.DataFrame(model.model.endog, columns=[target])
    temp = pd.concat([endog, exog], axis=1)
    temp = temp.sort_values(target).reset_index(drop=True)

    # Remember whether the figure is ours: once `ax` is assigned below it can
    # no longer tell us whether the caller supplied one.
    created_figure = ax is None
    if created_figure:
        _, ax = plt.subplots(figsize=(6, 6))

    features = [x for x in dataframe.columns if x != target]
    predictions = model.predict(dataframe[features])
    ax.scatter(predictions, model.resid)
    ax.set_xlabel(target + ' predictions', color=text_color)
    ax.set_ylabel('Model Residuals', color=text_color)
    ax.set_title("Residuals versus {} predictions".format(target),
                 color=text_color)
    ax.axhline(y=0, c='r')
    if created_figure:
        plt.show()

    exog_columns = [x for x in temp.columns if x != target]
    test = sms.het_goldfeldquandt(
        temp[target],
        temp[exog_columns],
        split=0.45,
        drop=0.10,
        alternative=alternative,
    )

    results = pd.DataFrame(index=['Goldfeld-Quandt'],
                           columns=['F_statistic', 'p_value'])
    results.loc['Goldfeld-Quandt', 'F_statistic'] = test[0]
    results.loc['Goldfeld-Quandt', 'p_value'] = test[1]
    return results
def goldfeld_quandt(
    model: RegressionResultsWrapper,
    split: float = 0.45,
    drop: float = 0.1,
    jobs: int = os.cpu_count(),
) -> pd.DataFrame:
    """Run a battery of GQ tests, sorting by each exog variable in `model`.

    Args:
        model (RegressionResultsWrapper): Statsmodels regression results.
        split (float, optional): Fraction of observations for split point.
            Defaults to 0.45.
        drop (float, optional): Fraction of observations to drop.
            Defaults to 0.1.
        jobs (int, optional): Number of threads to create. Values of 1 or
            less (or None) run the tests sequentially. Defaults to
            os.cpu_count().

    Returns:
        [pd.DataFrame]: One row per exog variable (index "sort_by") with the
            columns "f_val", "p_val" and "hypothesis", sorted by "p_val".
    """
    # os.cpu_count() returns None when the count cannot be determined; fall
    # back to the sequential path instead of failing on `None > 1`.
    if jobs is None:
        jobs = 1

    resid = model.resid
    exog = model.model.data.orig_exog
    resid, exog = resid.align(exog, axis=0)
    resid_values = resid.to_numpy()
    exog_values = exog.to_numpy()
    sort_cols = np.arange(exog.shape[1])

    def run_test(idx):
        # One two-sided GQ test with the observations sorted by column `idx`.
        return sms.het_goldfeldquandt(
            resid_values,
            exog_values,
            idx=idx,
            alternative="two-sided",
            split=split,
            drop=drop,
        )

    if jobs > 1:
        with ThreadPool(jobs) as pool:
            all_results = pool.map(run_test, sort_cols)
    else:
        all_results = [run_test(idx) for idx in sort_cols]

    all_results = pd.DataFrame(
        all_results,
        columns=["f_val", "p_val", "hypothesis"],
        index=sort_cols,
    )
    # Replace the positional index by the corresponding exog column names.
    all_results.index = all_results.index.map(lambda x: exog.columns.values[x])
    all_results.index.name = "sort_by"
    return all_results.sort_values("p_val")
def process_heteroscedasticity(x, y, metrics_dict, suffix):
    """Fit y ~ const + x by OLS and record heteroscedasticity diagnostics.

    Runs the Breusch-Pagan, White and Goldfeld-Quandt tests on the fitted
    model, and compares the standard deviation of ``y`` over the entries of
    ``x`` below its 33rd percentile with that over the entries above its
    67th percentile to label the spread 'increasing' or 'decreasing'.

    Args:
        x: regressor values; iterated element by element, so presumably
            one-dimensional -- verify against callers.
        y: response values, positionally aligned with ``x``.
        metrics_dict: mapping from metric name to list; one value is appended
            to each of the lists keyed by ``<metric name> + suffix``.
        suffix: string appended to every metric name.

    Returns:
        None. ``metrics_dict`` is updated in place.
    """
    design = sm.add_constant(x)
    results = sm.OLS(y, design).fit()

    bp_lm, bp_lm_pvalue, bp_fvalue, bp_f_pvalue = sms.het_breuschpagan(
        results.resid, results.model.exog)
    w_lm, w_lm_pvalue, w_fvalue, w_f_pvalue = sms.het_white(
        results.resid, results.model.exog)
    gq_fvalue, gq_f_pvalue, gq_type = sms.het_goldfeldquandt(
        results.resid, results.model.exog)

    # Positions of the observations in the lower and upper tails of x.
    lower_cut, upper_cut = np.percentile(x, [33, 67])
    low_positions = [pos for pos, value in enumerate(x) if value < lower_cut]
    high_positions = [pos for pos, value in enumerate(x) if value > upper_cut]

    low_spread = np.std(np.array(y)[np.array(low_positions)])
    high_spread = np.std(np.array(y)[np.array(high_positions)])
    trend = 'increasing' if high_spread > low_spread else 'decreasing'

    recorded = (
        ('type', trend),
        ('bp_lm', bp_lm),
        ('bp_lm_pvalue', bp_lm_pvalue),
        ('bp_fvalue', bp_fvalue),
        ('bp_f_pvalue', bp_f_pvalue),
        ('w_lm', w_lm),
        ('w_lm_pvalue', w_lm_pvalue),
        ('w_fvalue', w_fvalue),
        ('w_f_pvalue', w_f_pvalue),
        ('gq_fvalue', gq_fvalue),
        ('gq_f_pvalue', gq_f_pvalue),
        ('gq_type', gq_type),
    )
    for metric, value in recorded:
        metrics_dict[metric + suffix].append(value)
# ## <a id="h**o">3. Check for Homoscedasticity</a>

# %% [code]
# Residuals against predictions, with a horizontal reference line at zero.
# x/y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and keywords also work on older releases.
p = sns.scatterplot(x=y_pred, y=residuals)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
p = sns.lineplot(x=[0, 26], y=[0, 0], color='blue')
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

# %% [code]
# Goldfeld-Quandt test. lzip pairs the two labels with the first two
# returned values (F statistic and p-value).
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# %% [code]
# Bartlett's test for equal variances.
# NOTE(review): the feature matrix and the residuals are passed as the two
# samples being compared -- confirm this is the intended comparison.
from scipy.stats import bartlett
test = bartlett(X_train, residuals)
print(test)

# %% [markdown]
# ## <a id="normal">4. Check for Normality of error terms/residuals</a>

# %% [code]
# Distribution of the residuals (histogram with a KDE curve).
# NOTE(review): seaborn marks distplot as deprecated in favour of
# histplot/displot; left unchanged here to keep the same figure.
p = sns.distplot(residuals, kde=True)
p = plt.title('Normality of error terms/residuals')

# %% [markdown]
# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:

np.linalg.cond(results.model.exog)

# ## Heteroskedasticity tests
#
# Breusch-Pagan test:
# (`het_breuschpagan` is the correctly spelled name; the old misspelled
# alias `het_breushpagan` is no longer provided by statsmodels.)

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)

# Goldfeld-Quandt test
# (lzip pairs the two labels with the first two returned values.)

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)

# ## Linearity
#
# Harvey-Collier multiplier test for Null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
# Condition number:

np.linalg.cond(results.model.exog)

# ## Heteroskedasticity tests
#
# Breusch-Pagan test:
# (`het_breuschpagan` is the correctly spelled name; the old misspelled
# alias `het_breushpagan` is no longer provided by statsmodels.)

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)

# Goldfeld-Quandt test
# (lzip pairs the two labels with the first two returned values.)

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)

# ## Linearity
#
# Harvey-Collier multiplier test for Null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
name2 = ['Chi^2', 'Two-tail probability'] test2 = sms.omni_normtest(lr.resid) lzip(name2, test2) ''' #================================================ #================================================ #=========================Heteroskedasticity test #======================Breush-Pagan test: name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value'] test3 = sms.het_breuschpagan(lr.resid, lr.model.exog) lzip(name3, test3) #======================Goldfeld-Quandt test: name5 = ['F statistic', 'p-value'] test5 = sms.het_goldfeldquandt(lr.resid, lr.model.exog) lzip(name5, test5) #================================================ #================================================ #==================================Linearity test #======================Harvey-Collier: name6 = ['t value', 'p value'] test6 = sms.linear_harvey_collier(lr) lzip(name6, test6) import statsmodels.stats.diagnostic as ssd name6 = ['t value', 'p value'] test6 = ssd.acorr_linear_rainbow(lr) lzip(name6, test6)
def goldfeldQuandtTest(residuals, exogVars):
    """Run the Goldfeld-Quandt heteroskedasticity test and label the result.

    Args:
        residuals: first argument passed to ``sms.het_goldfeldquandt``.
        exogVars: second argument passed to ``sms.het_goldfeldquandt``
            (the exogenous variables).

    Returns:
        List of ``(label, value)`` pairs for 'F statistic' and 'p-value'.
        Only the first two values returned by the test are labelled; any
        further value is dropped by the zip.
    """
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(residuals, exogVars)
    # Return the labelled result; without this the function computed the
    # pairs and then threw them away.
    return lzip(name, test)
# Refit the model's formula on standardized data (all columns but the last
# one are scaled).
data_new = pd.DataFrame(pp.scale(data.values[:,:-1]), columns=['Beds','Healing_days','Income','Salary','Costs'])
model_new1 = sm.OLS.from_formula(formula=model.model.formula, data=data_new).fit()
print(model_new1.summary())

# Build the design matrices from the same formula for scikit-learn.
import patsy
y, X = patsy.dmatrices(model.model.formula, data, return_type='dataframe')
model_n = lm.LinearRegression()

# Cross-validation
k_fold = KFold(n_splits=10)
scores = cross_val_score(model_n, X, y, cv=k_fold, scoring='r2')
predicted = cross_val_predict(model_n,X,y,cv=k_fold)
# Squared correlation between observed values and out-of-fold predictions.
slope, intercept, r_value, p_value, std_err = st.linregress(y.values[:,0],predicted[:,0])
print(r_value*r_value)

# Homoscedasticity (Breusch-Pagan, Goldfeld-Quandt)
# `het_breuschpagan` is the correctly spelled name; the old misspelled alias
# `het_breushpagan` is no longer provided by statsmodels.
test = sms.het_breuschpagan(res11.resid, res11.model.exog)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print(lzip(name, test))
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(res11.resid, res11.model.exog)
print(lzip(name, test))

# Q-Q plots
st.probplot(res4.resid,plot=plt)
sm.qqplot(res11.resid, line='s')
plt.show()

# Durbin-Watson
dw = sms.stattools.durbin_watson(res11.resid)
print(dw)
def check_error_term_constant_variance(self) -> bool:
    """
    Checks if the error term has constant variance (there is no
    heteroscedasticity) using two statistical tests:
        - Breusch-Pagan,
        - Goldfeld-Quandt.

    Each test's p-value is passed to test_hypothesis together with
    self.alpha; the results are summed and divided by 2 (the number of
    tests) to obtain the fulfilment ratio, which check_fulfill_ratio
    compares against self.min_fulfill_ratio.

    If:
        - silent_mode = True, method returns:
            a) True (which means that the assumption is fulfilled) if the
            percentage of statistical tests for which the assumption is
            fulfilled is higher than or equal to the set min_fulfill_ratio
            b) False (which means that the assumption is not fulfilled) if
            the percentage of statistical tests for which the assumption is
            fulfilled is lower than the set min_fulfill_ratio
        - silent_mode = False, method returns True/False as above and also
        prints both test tables with explanations and shows the plot
        produced by plot_standarized_residuals_vs_fitted.
    """
    # Breusch-Pagan: keep only the first two returned values.
    bp_test = pd.DataFrame(
        sms.het_breuschpagan(self.residuals, self.results.model.exog)[:2],
        columns=["value"],
        index=["Lagrange multiplier statistic", "p-value"])
    # Goldfeld-Quandt: drop the last returned value, keeping the F statistic
    # and its p-value.
    gq_test = pd.DataFrame(sms.het_goldfeldquandt(
        self.residuals, self.results.model.exog)[:-1],
                           columns=["value"],
                           index=["F statistic", "p-value"])
    heteroscedascity_tests = [bp_test, gq_test]
    true_counts = 0
    for test in heteroscedascity_tests:
        # Row 1 of each table holds the p-value. test_hypothesis presumably
        # returns a truthy value when the null is not rejected -- verify
        # against its definition.
        true_counts = true_counts + test_hypothesis(
            significance_level=self.alpha,
            p_value=test.iloc[1].value,
            print_outcome=False)
    # 2 == number of tests in heteroscedascity_tests.
    true_ratio = true_counts / 2
    if not self.silent_mode:
        print(
            Color.BOLD +
            "Assumption 5. The error term has a constant variance." +
            Color.END, "\n")
        print("This assumption affects on: \n", "- prediction \n",
              "- interpretation \n")
        print(
            "Heteroscedasticity does not cause bias in the coefficient estimates, it does "
            "make them less precise. Heteroscedasticity also tends to produce p-values that "
            "are smaller than they should be. If you notice this problem in your model, "
            "you can try one of this solutions to fix it: redefine independent variable to "
            "focus on rates/per capita, try using weighted least squares, experiment with "
            "data transformations (f.g. Box-Cox's/Johnson's transformation).\n"
        )
        print(Color.BOLD + "Breusch-Pagan " + Color.END + "Lagrange Multiplier "
              "statistical test: \n")
        print(bp_test, "\n")
        # Same check as in the loop above, this time with its default
        # printing behaviour.
        test_hypothesis(
            significance_level=self.alpha,
            p_value=bp_test.iloc[1].value,
            null_hypothesis="error term's variance is constant.")
        print(Color.BOLD + "Goldfeld-Quandt " + Color.END + "test that examines whether the "
              "residual variance is the same in "
              "two subsamples: \n")
        print(gq_test, "\n")
        test_hypothesis(
            significance_level=self.alpha,
            p_value=gq_test.iloc[1].value,
            null_hypothesis="error term's variance is constant.")
        # Called here for its printed outcome; the return value is taken from
        # the non-printing call at the end of the method.
        check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                            min_fulfill_ratio=self.min_fulfill_ratio)
        print(
            "HINT: If you see randomly scattered points => there is no heteroscedascity. \n",
            "If you see fan or cone pattern => probably there exists heteroscedascity. \n"
        )
        plot_standarized_residuals_vs_fitted(fitted_model=self.results)
        plt.show()
    return check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                               min_fulfill_ratio=self.min_fulfill_ratio,
                               print_outcome=False)
# RMSE of residuals np.sqrt(np.mean(model.resid**2)) # 244 # checking normality of residuals stats.anderson(model.resid) # residuals are normal # checking auto-correlation of residuals from statsmodels.stats import diagnostic as diag diag.acorr_ljungbox(model.resid, lags=1) # pvalue is <0.05, so autocorrelation is present # checking heteroscedasticity import statsmodels.stats.api as sms from statsmodels.compat import lzip name = ['F-statistic','p-value'] gold_test = sms.het_goldfeldquandt(model.resid,model.model.exog) lzip(name,gold_test) # ('F-stat', 0.6722696289421596), ('p-value', 0.9999999999999999)], # go with null, residuals are homoscedastic: constant variance pred_price = model.predict(computer2.drop(['price'],axis=1)) pred_price[0:4] computer2.price[0:4] model.resid[0:4] 1499-1787 # except for autocorrelation all assumptions are satisfied # splitting data from sklearn.model_selection import train_test_split train,test = train_test_split(computer2,test_size=0.30, random_state=100)
#* so we will remove them from our data frame rm = ["season", "weathersit", "yr", "mnth"] reg2 = reg.drop(columns=rm) testp = model2.predict(test.iloc[:, 0:12]) MAPE(test.iloc[:, 12], testp) r2_score(test.iloc[:, 12], testp), math.sqrt(mean_squared_error(test.iloc[:, 12], testp)) ### 4. Detecting Hetroscedaticity #* Using goldfeld quandt test gq_test = pd.DataFrame(sms.het_goldfeldquandt(model2.resid, model2.model.exog)[:-1], columns=['value'], index=['F statistic', 'p-value']) gq_test ##### Since p-value is greater than the alpha = 0.05, we can asume that data is homoscedatic. ### Building a final model after satisfying all the assumption. train, test = train_test_split(reg2, test_size=0.2) model3 = sm.OLS(train.iloc[:, 8], train.iloc[:, 0:8]).fit() model3.summary() testp = model3.predict(test.iloc[:, 0:8]) MAPE(test.iloc[:, 8], testp), r2_score(test.iloc[:, 8], testp), math.sqrt(
def main(processed_path = "data/processed", models_path = "models"):
    """Fit, validate and export an elastic net regression on selected_df.

    Steps:
      1. Load ``selected_df.pkl`` from ``processed_path``. Column 9 is the
         dependent variable and columns 10 onwards are the regressors.
      2. Standardize every regressor except 'core', 'visualization',
         'machine_learning' and 'deep_learning'.
      3. Tune alpha and l1_ratio with ``ElasticNetCV`` (20-fold inner CV).
      4. Run a 20-fold outer cross-validation around the tuned estimator
         (nested CV) to check the robustness of R^2.
      5. Refit with statsmodels ``OLS.fit_regularized`` using the tuned
         alpha and l1_ratio to obtain a summary, and log residual
         normality, multicollinearity and heteroskedasticity diagnostics.
      6. Write both fitted models (pickle) and the summary text to
         ``models_path``.

    Args:
        processed_path: directory containing ``selected_df.pkl``.
        models_path: directory the pickles and the summary are written to.

    Returns:
        None. Results are logged and written to disk.
    """
    # logging
    logger = logging.getLogger(__name__)

    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))

    # load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))

    #%% split df into dependent and independent variables
    # (columns 0-8 are not used by this function)
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    X_index = X.index

    #%% standardize
    scaler = StandardScaler()
    not_standardize = ['core',
                       'visualization',
                       'machine_learning',
                       'deep_learning']
    standardized_columns = X.columns.drop(not_standardize)
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index = X_index,
                                  columns = standardized_columns)
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    # The concat moves the non-standardized columns to the end, so every
    # label attached to X.values below must come from the frame as it is
    # now, not from the column order of the loaded data.
    feature_names = X.columns
    logger.debug("After Standardization:\n{}"
                 .format(X.describe().to_string()))

    #%% define hyperparameter
    start = time()
    L1_RATIOS = [1.0, .95, .7, .5, .3, .1]
    EPS = 0.001
    N_ALPHAS = 100
    ALPHAS = None
    # normalize data
    # If True, the regressors X will be normalized before regression by
    # subtracting the mean (column-wise) and dividing by the l2-norm in
    # order for each feature to have norm = 1.
    # NOTE(review): recent scikit-learn releases no longer accept the
    # `normalize` argument on ElasticNet/ElasticNetCV -- confirm the pinned
    # version.
    NORMALIZE = False
    MAX_ITER = 10000
    TOL = 0.0001
    CV = 20
    N_JOBS = 1
    RS = 1
    SELECTION = 'cyclic'

    logger.info("l1_ratio={}, eps={}, n_alphas={}, alphas={}, normalize={}"
                .format(L1_RATIOS, EPS, N_ALPHAS, ALPHAS, NORMALIZE))
    logger.info("max_iter={}, tol={}, cv={}, n_jobs={}, rs={}, selection={}"
                .format(MAX_ITER, TOL, CV, N_JOBS, RS, SELECTION))
    logger.debug("Try following L1-ratios: {}".format(L1_RATIOS))

    # print R^2 values for bounding alphas 0 and 1 to make sense of alphas
    logger.info("Bounding score: R^2 for alpha=0 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=0,
                                   l1_ratio=.5,
                                   normalize=NORMALIZE,
                                   random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    logger.info("Bounding score: R^2 for alpha=1 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=1,
                                   l1_ratio=.5,
                                   normalize=NORMALIZE,
                                   random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))

    #%% train model
    # The same settings are used for the tuned model and for the estimator
    # inside the nested cross-validation.
    cv_params = dict(l1_ratio = L1_RATIOS,
                     eps = EPS,
                     n_alphas = N_ALPHAS,
                     alphas = ALPHAS,
                     normalize = NORMALIZE,
                     max_iter = MAX_ITER,
                     tol = TOL,
                     cv = CV,
                     n_jobs = N_JOBS,
                     random_state = RS,
                     selection = SELECTION)
    mod = ElasticNetCV(**cv_params).fit(X.values, y.values)

    # log some statistics
    best_r2 = mod.score(X.values, y.values)
    logger.info("best R^2 score: {:.2f}%".format(best_r2*100))
    best_l1_ratio = mod.l1_ratio_
    logger.info("best l1_ratio: {}".format(best_l1_ratio))
    best_alpha = mod.alpha_
    logger.info("best alpha: {:.3f}".format(best_alpha))
    alphas = mod.alphas_
    logger.debug("tested alphas:\n{}".format(alphas))
    coef = pd.Series(data=mod.coef_, index=feature_names)
    logger.debug("best coefficients:\n{}".format(coef))

    #%% Nested Cross-Validation to test robustness of R^2
    cv_results = cross_validate(ElasticNetCV(**cv_params),
                                X.values,
                                y.values,
                                cv=CV,
                                return_train_score=True,
                                n_jobs=N_JOBS)
    logger.info("95% confidence intervall: {:.2f} +/- {:.2f} (mean +/- 2*std)"
                .format(cv_results['test_score'].mean(),
                        cv_results['test_score'].std()*2))
    logger.debug("Nested cross-validation results:\n{}"
                 .format(pd.DataFrame(data=cv_results)))

    #%% Elastic Net regression with statsmodels for summary
    design = sm.add_constant(pd.DataFrame(data=X.values,
                                          columns=feature_names,
                                          index=X_index))
    mod_sm = sm.OLS(y.values, design)\
        .fit_regularized(method='elastic_net',
                         alpha=best_alpha,
                         L1_wt=best_l1_ratio,
                         refit=True)
    res = mod_sm.summary().as_text()
    logger.info("ElasticNet regression of selected_df regarding ranking_log")
    logger.info("with alpha={:.5f} and L1_wt={}:\n{}"
                .format(best_alpha, best_l1_ratio, res))

    # Normality of residuals
    # Jarque-Bera test:
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(mod_sm.resid)
    logger.info("Jarque-Bera test: {}".format(lzip(name, test)))
    # Omni test:
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(mod_sm.resid)
    logger.info("Omnibus test: {}".format(lzip(name, test)))

    # Multicollinearity
    # Condition number:
    logger.info("Conditional Number: {}"
                .format(np.linalg.cond(mod_sm.model.exog)))

    # Heteroskedasticity tests
    # Breusch-Pagan test:
    name = ['Lagrange multiplier statistic', 'p-value',
            'f-value', 'f p-value']
    test = sms.het_breuschpagan(mod_sm.resid, mod_sm.model.exog)
    logger.info("Breush-Pagan test: {}".format(lzip(name, test)))
    # Goldfeld-Quandt test
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(mod_sm.resid, mod_sm.model.exog)
    logger.info("Goldfeld-Quandt test: {}".format(lzip(name, test)))

    #%% export results as pickle file to models folder
    # pickle mod
    sklearn_pickle = os.path.join(models_path, 'sklearn_ElasticNetCV.pkl')
    with open(sklearn_pickle, 'wb') as handle:
        pickle.dump(mod, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of sklearn to {}."
                .format(sklearn_pickle))
    # pickle mod_sm
    sm_pickle = os.path.join(models_path, 'sm_OLS_fit_regularized.pkl')
    with open(sm_pickle, 'wb') as handle:
        pickle.dump(mod_sm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of statsmodels to {}."
                .format(sm_pickle))
    # save res as .txt; the context manager closes the file even if the
    # write fails
    summary_path = os.path.join(models_path,
                                'sm_OLS_fit_regularized_summary.txt')
    with open(summary_path, "w+") as handle:
        handle.write(res)

    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end-start).round(freq='s')
    logger.info("Time needed to train Elastic Net Model: {}"
                .format(time_passed))