Esempio n. 1
0
def goldfeld_quandt(dataframe, target, model, ax=None):
    """Plot residuals against ``target`` and run a Goldfeld-Quandt test.

    Observations are ordered by ``target``. Rows whose target value lies
    between the 45th and 55th percentiles (inclusive) are left out, and the
    remaining residuals and regressors are passed to
    ``sms.het_goldfeldquandt``.

    Args:
        dataframe: table holding the ``target`` column. Its rows are assumed
            to be in the same order as the observations behind ``model``
            (TODO confirm against callers).
        target: name of the column used for sorting and for the x-axis.
        model: fitted results object exposing ``resid`` (indexable with
            ``.iloc``) and ``model.exog``.
        ax: Matplotlib axes to draw on. When omitted, a new 6x6 figure is
            created and shown.

    Returns:
        One-row DataFrame indexed by 'Goldfeld-Quandt' with the columns
        'F_statistic' and 'p_value'.
    """
    text_color = plt.rcParams.get('ytick.color')

    # Positional permutation that orders the observations by target value, so
    # that target, residuals and regressors are all reordered consistently.
    target_values = dataframe[target].reset_index(drop=True)
    order = list(target_values.sort_values().index)
    sorted_target = target_values.iloc[order].reset_index(drop=True)
    sorted_resid = model.resid.iloc[order]
    sorted_exog = model.model.exog[order]

    lwr_thresh = sorted_target.quantile(q=.45)
    upr_thresh = sorted_target.quantile(q=.55)
    in_middle = (sorted_target >= lwr_thresh) & (sorted_target <= upr_thresh)
    # Positions (in sorted order) of every observation outside the middle band.
    indices = [pos for pos, dropped in enumerate(in_middle) if not dropped]

    # Remember whether the figure is ours before `ax` gets rebound.
    own_figure = ax is None
    if own_figure:
        _, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(sorted_target.iloc[indices], sorted_resid.iloc[indices])
    ax.set_xlabel(target, color=text_color)
    ax.set_ylabel('Model Residuals', color=text_color)
    ax.set_title("Residuals versus {}".format(target), color=text_color)
    ax.axvline(x=lwr_thresh, ls=':', linewidth=2, color='gray')
    ax.axvline(x=upr_thresh, ls=':', linewidth=2, color='gray')
    if own_figure:
        plt.show()

    test = sms.het_goldfeldquandt(sorted_resid.iloc[indices],
                                  sorted_exog[indices])
    results = pd.DataFrame(index=['Goldfeld-Quandt'],
                           columns=['F_statistic', 'p_value'])
    results.loc['Goldfeld-Quandt', 'F_statistic'] = test[0]
    results.loc['Goldfeld-Quandt', 'p_value'] = test[1]
    return results
def homoscedasticity_test(model):
    """Visual and statistical homoscedasticity checks for a fitted model.

    Draws two side-by-side panels (residuals vs. fitted values, and the
    square root of the absolute studentized residuals vs. fitted values),
    then prints the Breusch-Pagan and Goldfeld-Quandt test results.

    Args:
    * model - fitted OLS model from statsmodels
    """
    predicted = model.predict()
    residuals = model.resid
    studentized = model.get_influence().resid_studentized_internal

    _, axes = plt.subplots(1, 2)
    left_ax, right_ax = axes

    # Left panel: raw residuals with a lowess trend line.
    sns.regplot(x=predicted, y=residuals, lowess=True, ax=left_ax,
                line_kws={'color': 'red'})
    left_ax.set_title('Residuals vs Fitted', fontsize=16)
    left_ax.set(xlabel='Fitted Values', ylabel='Residuals')

    # Right panel: scale-location view of the studentized residuals.
    scale = np.sqrt(np.abs(studentized))
    sns.regplot(x=predicted, y=scale, lowess=True, ax=right_ax,
                line_kws={'color': 'red'})
    right_ax.set_title('Scale-Location', fontsize=16)
    right_ax.set(xlabel='Fitted Values', ylabel='sqrt(abs(Residuals))')

    design = model.model.exog

    bp_values = sms.het_breuschpagan(residuals, design)
    bp_labels = ['Lagrange multiplier statistic', 'p-value', 'f-value',
                 'f p-value']
    bp_test = pd.DataFrame(bp_values, columns=['value'], index=bp_labels)

    # The last element of the Goldfeld-Quandt result is dropped before tabulating.
    gq_values = sms.het_goldfeldquandt(residuals, design)[:-1]
    gq_test = pd.DataFrame(gq_values, columns=['value'],
                           index=['F statistic', 'p-value'])

    print('\n Breusch-Pagan test ----')
    print(bp_test)
    print('\n Goldfeld-Quandt test ----')
    print(gq_test)
    print('\n Residuals plots ----')
    def compute(self, residual, significance, test_name, parameters, x=None):
        """Run the Goldfeld-Quandt constant-variance check and print a verdict.

        See statsmodels.stats.api.het_goldfeldquandt for more information.

        Args:
            residual: the residual data derived from your dataset
            significance: threshold the p value is compared against
                (0.05 in general).
            test_name: label printed in front of the verdict; it does not
                alter the computation.
            parameters: extra keyword arguments forwarded to
                "het_goldfeldquandt"
            x: the X_test of your dataset

        Returns:
            None
        """
        # Element 1 of the test result is used as the p value.
        p_value = het_goldfeldquandt(residual, x, **parameters)[1]

        if p_value > significance:
            verdict = ': Good. The residuals have constant variance. (homoscedastic) p value: '
        else:
            verdict = ': Bad. The residuals do not have constant variance. (heteroscedastic) p value: '

        print(test_name + verdict + str(p_value))
Esempio n. 4
0
def goldfeld_quandt(dataframe, target, model, ax=None, alternative='two-sided'):
    """Plot residuals against predictions and run a Goldfeld-Quandt test.

    The model's own ``endog``/``exog`` arrays are combined, ordered by the
    response and passed to ``sms.het_goldfeldquandt`` with ``split=0.45`` and
    ``drop=0.10``.

    Args:
        dataframe: table with the ``target`` column and the feature columns;
            every column other than ``target`` is fed to ``model.predict``.
        target: name of the response column.
        model: fitted results object exposing ``predict``, ``resid``,
            ``model.exog`` and ``model.endog``.
        ax: Matplotlib axes to draw on. When omitted, a new 6x6 figure is
            created and shown.
        alternative: forwarded unchanged to ``sms.het_goldfeldquandt``.

    Returns:
        One-row DataFrame indexed by 'Goldfeld-Quandt' with the columns
        'F_statistic' and 'p_value'.
    """
    text_color = plt.rcParams.get('ytick.color')

    # Rebuild response + design from the fitted model, ordered by the response.
    exog = pd.DataFrame(model.model.exog)
    endog = pd.DataFrame(model.model.endog, columns=[target])
    temp = pd.concat([endog, exog], axis=1)
    temp = temp.sort_values(target).reset_index(drop=True)

    # Remember whether the figure is ours before `ax` gets rebound.
    own_figure = ax is None
    if own_figure:
        _, ax = plt.subplots(figsize=(6, 6))

    features = [col for col in dataframe.columns if col != target]
    predictions = model.predict(dataframe[features])
    ax.scatter(predictions, model.resid)
    ax.set_xlabel(target + ' predictions', color=text_color)
    ax.set_ylabel('Model Residuals', color=text_color)
    ax.set_title("Residuals versus {} predictions".format(target),
                 color=text_color)
    ax.axhline(y=0, c='r')
    if own_figure:
        plt.show()

    regressors = [col for col in temp.columns if col != target]
    test = sms.het_goldfeldquandt(temp[target],
                                  temp[regressors],
                                  split=0.45,
                                  drop=0.10,
                                  alternative=alternative)

    results = pd.DataFrame(index=['Goldfeld-Quandt'],
                           columns=['F_statistic', 'p_value'])
    results.loc['Goldfeld-Quandt', 'F_statistic'] = test[0]
    results.loc['Goldfeld-Quandt', 'p_value'] = test[1]
    return results
def goldfeld_quandt(
        model: RegressionResultsWrapper,
        split: float = 0.45,
        drop: float = 0.1,
        jobs: int = os.cpu_count(),
) -> pd.DataFrame:
    """Run a battery of GQ tests, sorting by each exog variable in `model`.

    Args:
        model (RegressionResultsWrapper): Statsmodels regression results.
        split (float, optional): Fraction of observations for split point. Defaults to 0.45.
        drop (float, optional): Fraction of observations to drop. Defaults to 0.1.
        jobs (int, optional): Number of threads to create. Defaults to os.cpu_count().
            ``None`` (which ``os.cpu_count()`` may return) is treated as 1.

    Returns:
        [pd.DataFrame]: DataFrame of results for each exog variable, indexed
        by the exog column name ("sort_by") and sorted by "p_val".
    """
    resid = model.resid
    exog = model.model.data.orig_exog
    resid, exog = resid.align(exog, axis=0)
    sort_cols = np.arange(exog.shape[1])

    # Convert once; every test run reuses the same arrays.
    resid_values = resid.to_numpy()
    exog_values = exog.to_numpy()

    def run_test(idx):
        """Run one two-sided GQ test with observations sorted by column `idx`."""
        return sms.het_goldfeldquandt(
            resid_values,
            exog_values,
            idx=idx,
            alternative="two-sided",
            split=split,
            drop=drop,
        )

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single worker instead of failing on `None > 1`.
    # Never start more threads than there are columns to test.
    workers = min(jobs or 1, len(sort_cols))
    if workers > 1:
        with ThreadPool(workers) as pool:
            all_results = pool.map(run_test, sort_cols)
    else:
        all_results = [run_test(idx) for idx in sort_cols]

    all_results = pd.DataFrame(all_results,
                               columns=["f_val", "p_val", "hypothesis"],
                               index=sort_cols)
    # Replace positional column numbers with the exog column labels.
    all_results.index = all_results.index.map(lambda x: exog.columns.values[x])
    all_results.index.name = "sort_by"
    return all_results.sort_values("p_val")
Esempio n. 6
0
def process_heteroscedasticity(x, y, metrics_dict, suffix):
    """Fit OLS of ``y`` on ``x`` and record heteroscedasticity metrics.

    Runs the Breusch-Pagan, White and Goldfeld-Quandt tests on the fitted
    model, labels the spread of ``y`` as 'increasing' or 'decreasing' by
    comparing its standard deviation above the 67th and below the 33rd
    percentile of ``x``, and appends every value to the list stored in
    ``metrics_dict`` under ``<metric name> + suffix``.

    Args:
        x: regressor values; iterated element-wise, so expected to be 1-D.
        y: response values, positionally aligned with ``x``.
        metrics_dict: mapping from metric key to a list that is appended to.
        suffix: string added to every metric key.

    Returns:
        None; ``metrics_dict`` is updated in place.
    """
    fit = sm.OLS(y, sm.add_constant(x)).fit()

    bp_lm, bp_lm_p, bp_f, bp_f_p = sms.het_breuschpagan(
        fit.resid, fit.model.exog)
    white_lm, white_lm_p, white_f, white_f_p = sms.het_white(
        fit.resid, fit.model.exog)
    gq_f, gq_f_p, gq_ordering = sms.het_goldfeldquandt(
        fit.resid, fit.model.exog)

    # Positions of the observations in the lower and upper tails of x.
    lower_cut, upper_cut = np.percentile(x, [33, 67])
    low_ids = [pos for pos, value in enumerate(x) if value < lower_cut]
    high_ids = [pos for pos, value in enumerate(x) if value > upper_cut]

    y_values = np.array(y)
    low_spread = np.std(y_values[np.array(low_ids)])
    high_spread = np.std(y_values[np.array(high_ids)])
    trend = 'increasing' if high_spread > low_spread else 'decreasing'

    recorded = (
        ('type', trend),
        ('bp_lm', bp_lm),
        ('bp_lm_pvalue', bp_lm_p),
        ('bp_fvalue', bp_f),
        ('bp_f_pvalue', bp_f_p),
        ('w_lm', white_lm),
        ('w_lm_pvalue', white_lm_p),
        ('w_fvalue', white_f),
        ('w_f_pvalue', white_f_p),
        ('gq_fvalue', gq_f),
        ('gq_f_pvalue', gq_f_p),
        ('gq_type', gq_ordering),
    )
    for key, value in recorded:
        metrics_dict[key + suffix].append(value)
# ## <a id="h**o">3. Check for Homoscedasticity</a>

# %% [code]
# Residuals against predicted values. x/y are passed by keyword because
# seaborn 0.12+ no longer accepts them positionally.
p = sns.scatterplot(x=y_pred, y=residuals)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
# Horizontal reference line at zero across the plotted x-range.
p = sns.lineplot(x=[0, 26], y=[0, 0], color='blue')
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

# %% [code]
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
# Goldfeld-Quandt test on the residuals against the training regressors;
# the final expression pairs each label with the matching test output.
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

# %% [code]
from scipy.stats import bartlett
# Bartlett's test with the training regressors and the residuals as samples.
# NOTE(review): confirm that comparing X_train against the residuals is the
# intended check, and that X_train has a shape bartlett accepts.
test = bartlett(X_train, residuals)
print(test)

# %% [markdown]
# ## <a id="normal">4. Check for Normality of error terms/residuals</a>

# %% [code]
# Distribution plot of the residuals with a KDE overlay (visual normality check).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# consider migrating to histplot/displot when upgrading.
p = sns.distplot(residuals, kde=True)
p = plt.title('Normality of error terms/residuals')

# %% [markdown]
# Other plotting options can be found on the [Graphics page.](http://www.statsmodels.org/stable/graphics.html)

# ## Multicollinearity
#
# Condition number:

np.linalg.cond(results.model.exog)

# ## Heteroskedasticity tests
#
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
# `het_breuschpagan` is the correctly spelled name; the old misspelled alias
# `het_breushpagan` is not available in current statsmodels.
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)

# Goldfeld-Quandt test

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)

# ## Linearity
#
# Harvey-Collier multiplier test for Null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)
# Condition number:

np.linalg.cond(results.model.exog)


# ## Heteroskedasticity tests
#
# Breusch-Pagan test:

name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
# `het_breuschpagan` is the correctly spelled name; the old misspelled alias
# `het_breushpagan` is not available in current statsmodels.
test = sms.het_breuschpagan(results.resid, results.model.exog)
lzip(name, test)


# Goldfeld-Quandt test

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(results.resid, results.model.exog)
lzip(name, test)


# ## Linearity
#
# Harvey-Collier multiplier test for Null hypothesis that the linear specification is correct:

name = ['t value', 'p value']
test = sms.linear_harvey_collier(results)
lzip(name, test)

Esempio n. 10
0
name2 = ['Chi^2', 'Two-tail probability']
test2 = sms.omni_normtest(lr.resid)
lzip(name2, test2)
'''

#================================================
#================================================
#=========================Heteroskedasticity test
#======================Breusch-Pagan test:
# Labels are paired positionally with the values returned by the test.
name3 = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test3 = sms.het_breuschpagan(lr.resid, lr.model.exog)
lzip(name3, test3)

#======================Goldfeld-Quandt test:
# Residuals and design matrix of the fitted model `lr` are passed to the test.
name5 = ['F statistic', 'p-value']
test5 = sms.het_goldfeldquandt(lr.resid, lr.model.exog)
lzip(name5, test5)


#================================================
#================================================
#==================================Linearity test
#======================Harvey-Collier:
# Harvey-Collier test on the fitted model.
name6 = ['t value', 'p value']
test6 = sms.linear_harvey_collier(lr)
lzip(name6, test6)

import statsmodels.stats.diagnostic as ssd
# Rainbow test for linearity. The statsmodels function is `linear_rainbow`
# and it reports an F statistic with its p-value.
name6 = ['F statistic', 'p value']
test6 = ssd.linear_rainbow(lr)
lzip(name6, test6)
Esempio n. 11
0
def goldfeldQuandtTest(residuals, exogVars):
    """Run the Goldfeld-Quandt test and return its labelled result.

    Args:
        residuals: first argument passed to ``sms.het_goldfeldquandt``.
        exogVars: second argument passed to ``sms.het_goldfeldquandt``.

    Returns:
        The label/value pairs built by ``lzip`` from
        ``['F statistic', 'p-value']`` and the test output.
    """
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(residuals, exogVars)
    # The pairs used to be built and then dropped; hand them to the caller.
    return lzip(name, test)
# Scale every column of `data` except the last one and refit the same formula
# on the scaled copy.
data_new = pd.DataFrame(pp.scale(data.values[:,:-1]), columns=['Beds','Healing_days','Income','Salary','Costs'])
model_new1 = sm.OLS.from_formula(formula=model.model.formula, data=data_new).fit()
print(model_new1.summary())
import  patsy
# Build the response and design matrices from the model formula on the
# original (unscaled) data.
y, X = patsy.dmatrices(model.model.formula, data, return_type='dataframe')
model_n = lm.LinearRegression()
# Cross-validation
k_fold = KFold(n_splits=10)
scores = cross_val_score(model_n, X, y, cv=k_fold, scoring='r2')
predicted = cross_val_predict(model_n,X,y,cv=k_fold)

# Squared correlation between the observed response and the cross-validated
# predictions (first column of each).
slope, intercept, r_value, p_value, std_err = st.linregress(y.values[:,0],predicted[:,0])
print(r_value*r_value)  
# Homoscedasticity (Breusch-Pagan, Goldfeld-Quandt)
# `het_breuschpagan` is the correctly spelled name; the old misspelled alias
# `het_breushpagan` is not available in current statsmodels.
test = sms.het_breuschpagan(res11.resid, res11.model.exog)
name = ['Lagrange multiplier statistic', 'p-value',
        'f-value', 'f p-value']
print(lzip(name, test))

name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(res11.resid, res11.model.exog)
print(lzip(name, test))

# Q-Q plots of the residuals
# NOTE(review): the first plot uses res4 while the rest of this section uses
# res11 — confirm this is intended.
st.probplot(res4.resid,plot=plt)
sm.qqplot(res11.resid, line='s')
plt.show()
# Durbin-Watson statistic of the residuals
dw = sms.stattools.durbin_watson(res11.resid)
print(dw)
    def check_error_term_constant_variance(self) -> bool:
        """
        Check whether the error term has constant variance (no
        heteroscedasticity) using two statistical tests:
        - Breusch-Pagan,
        - Goldfeld-Quandt.

        The share of tests whose hypothesis check succeeds is compared with
        min_fulfill_ratio by check_fulfill_ratio, and that outcome is
        returned.

        When silent_mode is False, the method additionally prints a
        description of the assumption, both test tables with their hypothesis
        outcomes, the fulfilment summary and a hint, and shows the plot of
        standardized residuals versus fitted values.
        """
        design = self.results.model.exog

        # Keep only the LM statistic and its p-value from Breusch-Pagan.
        bp_outcome = sms.het_breuschpagan(self.residuals, design)[:2]
        bp_test = pd.DataFrame(
            bp_outcome,
            columns=["value"],
            index=["Lagrange multiplier statistic", "p-value"])

        # Drop the trailing element of the Goldfeld-Quandt result.
        gq_outcome = sms.het_goldfeldquandt(self.residuals, design)[:-1]
        gq_test = pd.DataFrame(
            gq_outcome,
            columns=["value"],
            index=["F statistic", "p-value"])

        heteroscedascity_tests = [bp_test, gq_test]

        # Row 1 of each table holds the p-value.
        true_counts = sum(
            test_hypothesis(significance_level=self.alpha,
                            p_value=table.iloc[1].value,
                            print_outcome=False)
            for table in heteroscedascity_tests)
        true_ratio = true_counts / len(heteroscedascity_tests)

        if self.silent_mode:
            return check_fulfill_ratio(
                true_fulfill_ratio=true_ratio,
                min_fulfill_ratio=self.min_fulfill_ratio,
                print_outcome=False)

        null_text = "error term's variance is constant."

        print(
            Color.BOLD +
            "Assumption 5. The error term has a constant variance." +
            Color.END, "\n")

        print("This assumption affects on: \n", "- prediction \n",
              "- interpretation \n")

        print(
            "Heteroscedasticity does not cause bias in the coefficient estimates, it does "
            "make them less precise. Heteroscedasticity also tends to produce p-values that "
            "are smaller than they should be. If you notice this problem in your model, "
            "you can try one of this solutions to fix it: redefine independent variable to "
            "focus on rates/per capita, try using weighted least squares, experiment with "
            "data transformations (f.g. Box-Cox's/Johnson's transformation).\n"
        )

        print(Color.BOLD + "Breusch-Pagan " + Color.END +
              "Lagrange Multiplier "
              "statistical test: \n")
        print(bp_test, "\n")
        test_hypothesis(significance_level=self.alpha,
                        p_value=bp_test.iloc[1].value,
                        null_hypothesis=null_text)

        print(Color.BOLD + "Goldfeld-Quandt " + Color.END +
              "test that examines whether the "
              "residual variance is the same in "
              "two subsamples: \n")
        print(gq_test, "\n")
        test_hypothesis(significance_level=self.alpha,
                        p_value=gq_test.iloc[1].value,
                        null_hypothesis=null_text)

        # Printed summary of the fulfilment ratio (default print behaviour).
        check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                            min_fulfill_ratio=self.min_fulfill_ratio)

        print(
            "HINT: If you see randomly scattered points => there is no heteroscedascity. \n",
            "If you see fan or cone pattern => probably there exists heteroscedascity. \n"
        )

        plot_standarized_residuals_vs_fitted(fitted_model=self.results)
        plt.show()

        return check_fulfill_ratio(true_fulfill_ratio=true_ratio,
                                   min_fulfill_ratio=self.min_fulfill_ratio,
                                   print_outcome=False)
# RMSE of residuals
np.sqrt(np.mean(model.resid**2)) # 244 (value recorded by the original author)

# checking normality of residuals
stats.anderson(model.resid) # author's reading: residuals are normal

# checking auto-correlation of residuals (Ljung-Box at lag 1)
from statsmodels.stats import diagnostic as diag
diag.acorr_ljungbox(model.resid, lags=1)
# author's reading: p-value is < 0.05, so autocorrelation is present

# checking heteroscedasticity (Goldfeld-Quandt test)
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F-statistic','p-value']
gold_test = sms.het_goldfeldquandt(model.resid,model.model.exog)
lzip(name,gold_test)
# recorded output: [('F-stat', 0.6722696289421596), ('p-value', 0.9999999999999999)]
# author's reading: keep the null hypothesis, residuals are homoscedastic (constant variance)

# compare the first few predictions with the observed prices and the residuals
pred_price = model.predict(computer2.drop(['price'],axis=1))
pred_price[0:4]
computer2.price[0:4]
model.resid[0:4]
# NOTE(review): bare arithmetic with no effect outside an interactive session;
# presumably a manual check of one residual.
1499-1787

# except for autocorrelation all assumptions are satisfied

# splitting data (70/30, fixed seed)
from sklearn.model_selection import train_test_split
train,test = train_test_split(computer2,test_size=0.30, random_state=100)
#* so we will remove them from our data frame

# Columns removed from the data frame before refitting.
rm = ["season", "weathersit", "yr", "mnth"]
reg2 = reg.drop(columns=rm)

# Predict from the first 12 columns of the test set; column 12 is used as the
# observed response in the metrics below.
testp = model2.predict(test.iloc[:, 0:12])

MAPE(test.iloc[:, 12], testp)

r2_score(test.iloc[:, 12],
         testp), math.sqrt(mean_squared_error(test.iloc[:, 12], testp))

### 4. Detecting heteroscedasticity
#* Using the Goldfeld-Quandt test

# The last element of the test result is dropped before tabulating.
gq_test = pd.DataFrame(sms.het_goldfeldquandt(model2.resid,
                                              model2.model.exog)[:-1],
                       columns=['value'],
                       index=['F statistic', 'p-value'])
gq_test

##### Since the p-value is greater than alpha = 0.05, we can assume that the data is homoscedastic.

### Building a final model after satisfying all the assumptions.

# Refit on the reduced frame: columns 0-7 are the regressors, column 8 the response.
train, test = train_test_split(reg2, test_size=0.2)
model3 = sm.OLS(train.iloc[:, 8], train.iloc[:, 0:8]).fit()
model3.summary()

testp = model3.predict(test.iloc[:, 0:8])

MAPE(test.iloc[:, 8], testp), r2_score(test.iloc[:, 8], testp), math.sqrt(
Esempio n. 16
0
def main(processed_path = "data/processed",
         models_path = "models"):
    
    """Fit, validate, diagnose and persist an elastic net regression.

    Steps:
      1. Load ``selected_df.pkl`` from ``processed_path`` and split it into
         the response (column 9) and the regressors (columns 10 onwards).
      2. Standardize all regressors except a fixed list of columns.
      3. Tune ``ElasticNetCV`` (inner CV) and check the robustness of R^2
         with an outer ``cross_validate`` run (nested CV).
      4. Refit with statsmodels ``fit_regularized`` using the tuned
         hyperparameters and log residual diagnostics.
      5. Pickle both models and write the statsmodels summary as text into
         ``models_path``.

    Args:
        processed_path: directory containing ``selected_df.pkl``.
        models_path: directory the pickles and the summary are written to.

    Returns:
        None
    """
    
    # logging
    logger = logging.getLogger(__name__)
    
    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    
    # load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))
    
    #%% split df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    X_columns = X.columns
    X_index = X.index
    
    #%% standardize
    
    scaler = StandardScaler()
    not_standardize = ['core',
                       'visualization',
                       'machine_learning',
                       'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index = X_index,
                                  columns = X_columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    # NOTE: this concat moves the non-standardized columns to the end, so the
    # column order of X may differ from X_columns from here on. Anything that
    # labels X.values must use X.columns.
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After Standardization:\n{}"
                 .format(X.describe().to_string()))
    
    #%% define hyperparameter
    
    start = time()

    L1_RATIOS = [1.0, .95, .7, .5, .3, .1]
    EPS = 0.001
    N_ALPHAS = 100
    ALPHAS = None
    # normalize data
    # If True, the regressors X will be normalized before regression by
    # subtracting the mean (column-wise) and dividing by the l2-norm in
    # order for each feature to have norm = 1.
    # NOTE(review): the `normalize` argument was removed from scikit-learn's
    # ElasticNet/ElasticNetCV in version 1.2; drop it when upgrading.
    NORMALIZE = False
    MAX_ITER = 10000
    TOL = 0.0001
    CV = 20
    N_JOBS = 1
    RS = 1
    SELECTION = 'cyclic'
    
    logger.info("l1_ratio={}, eps={}, n_alphas={}, alphas={}, normalize={}"
                 .format(L1_RATIOS, EPS, N_ALPHAS, ALPHAS, NORMALIZE))
    logger.info("max_iter={}, tol={}, cv={}, n_jobs={}, rs={}, selection={}"
                 .format(MAX_ITER, TOL, CV, N_JOBS, RS, SELECTION))
    logger.debug("Try following L1-ratios: {}".format(L1_RATIOS))
    
    # print R^2 values for bounding alphas 0 and 1 to make sense of alphas
    logger.info("Bounding score: R^2 for alpha=0 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=0, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    logger.info("Bounding score: R^2 for alpha=1 and l1_ratio=0.5: {}"
                .format(ElasticNet(alpha=1, l1_ratio=.5,
                                   normalize=NORMALIZE, random_state=RS)
                        .fit(X.values, y.values)
                        .score(X.values, y.values)))
    
    #%% train model
    
    mod = ElasticNetCV(l1_ratio = L1_RATIOS,
                       eps = EPS,
                       n_alphas = N_ALPHAS,
                       alphas = ALPHAS,
                       normalize = NORMALIZE,
                       max_iter = MAX_ITER,
                       tol = TOL,
                       cv = CV,
                       n_jobs = N_JOBS,
                       random_state = RS,
                       selection = SELECTION)\
          .fit(X.values, y.values)
    
    # log some statistics
    best_r2 = mod.score(X.values, y.values)
    logger.info("best R^2 score: {:.2f}%".format(best_r2*100))
    best_l1_ratio = mod.l1_ratio_
    logger.info("best l1_ratio: {}".format(best_l1_ratio))
    best_alpha = mod.alpha_
    logger.info("best alpha: {:.3f}".format(best_alpha))
    alphas = mod.alphas_
    logger.debug("tested alphas:\n{}".format(alphas))
    # The model was fitted on X.values, so coefficients follow X.columns.
    coef = pd.Series(data=mod.coef_, index=X.columns)
    logger.debug("best coefficients:\n{}".format(coef))
#    mse_path = mod.mse_path_
    
    #%% Nested Cross-Validation to test robustness of R^2
    
    cv_results = cross_validate(ElasticNetCV(l1_ratio = L1_RATIOS,
                                             eps = EPS,
                                             n_alphas = N_ALPHAS,
                                             alphas = ALPHAS,
                                             normalize = NORMALIZE,
                                             max_iter = MAX_ITER,
                                             tol = TOL,
                                             cv = CV,
                                             n_jobs = N_JOBS,
                                             random_state = RS,
                                             selection = SELECTION),
                                X.values, y.values, cv=CV,
                                return_train_score=True, n_jobs=N_JOBS)
    logger.info("95% confidence intervall: {:.2f} +/- {:.2f} (mean +/- 2*std)"
                .format(cv_results['test_score'].mean(),
                        cv_results['test_score'].std()*2))
    logger.debug("Nested cross-validation results:\n{}"
                .format(pd.DataFrame(data=cv_results)))
    
    #%% Elastic Net regression with statsmodels for summary
    
    # Label the design with X.columns so every regressor keeps its own name.
    mod_sm = sm.OLS(y.values, sm.add_constant(pd.DataFrame(data=X.values,
                                                    columns=X.columns,
                                                    index=X_index)))\
          .fit_regularized(method='elastic_net',
                           alpha=best_alpha,
                           L1_wt=best_l1_ratio,
                           refit=True)
    res = mod_sm.summary().as_text()
    logger.info("ElasticNet regression of selected_df regarding ranking_log")
    logger.info("with alpha={:.5f} and L1_wt={}:\n{}"
                .format(best_alpha, best_l1_ratio, res))
    
    # Normality of residuals
    # Jarque-Bera test:
    name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    test = sms.jarque_bera(mod_sm.resid)
    logger.info("Jarque-Bera test: {}".format(lzip(name, test)))
    # Omni test:
    name = ['Chi^2', 'Two-tail probability']
    test = sms.omni_normtest(mod_sm.resid)
    logger.info("Omnibus test: {}".format(lzip(name, test)))
    
    # Multicollinearity
    # Condition number:
    logger.info("Conditional Number: {}"
                .format(np.linalg.cond(mod_sm.model.exog)))
    
    # Heteroskedasticity tests
    # Breusch-Pagan test:
    name = ['Lagrange multiplier statistic', 'p-value', 
        'f-value', 'f p-value']
    test = sms.het_breuschpagan(mod_sm.resid, mod_sm.model.exog)
    logger.info("Breush-Pagan test: {}".format(lzip(name, test)))
    # Goldfeld-Quandt test
    name = ['F statistic', 'p-value']
    test = sms.het_goldfeldquandt(mod_sm.resid, mod_sm.model.exog)
    logger.info("Goldfeld-Quandt test: {}".format(lzip(name, test)))
    
    #%% export results as pickle file to models folder
    
    # pickle mod
    with open(os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'),
              'wb') as handle:
        pickle.dump(mod, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of sklearn to {}."
                .format(os.path.join(models_path,
                                     'sklearn_ElasticNetCV.pkl')))
    
    # pickle mod_sm
    with open(os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'),
              'wb') as handle:
        pickle.dump(mod_sm, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info("Saved elastic net model of statsmodels to {}."
                .format(os.path.join(models_path,
                                     'sm_OLS_fit_regularized.pkl')))
    
    # save res as .txt; the context manager closes the file even if the
    # write fails
    with open(os.path.join(models_path,
                           'sm_OLS_fit_regularized_summary.txt'),
              "w+") as f:
        f.write(res)
    
    
    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end-start).round(freq='s')
    logger.info("Time needed to train Elastic Net Model: {}"
                .format(time_passed))