def ridgeCV(y_ts, df_norm, keys=None, kwrgs_model=None): ''' X contains all precursor data, incl train and test X_train, y_train are split up by TrainIsTrue Preciction is made for whole timeseries ''' #%% if keys is None: no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask'] keys = df_norm.columns keys = [k for k in keys if k not in no_data_col] import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) # warnings.filterwarnings("ignore", category=FutureWarning) if kwrgs_model == None: # use Bram settings kwrgs_model = {'fit_intercept': True, 'alphas': (.01, .1, 1.0, 10.0)} # find parameters for gridsearch optimization kwrgs_gridsearch = { k: i for k, i in kwrgs_model.items() if type(i) == list } # only the constant parameters are kept kwrgs = kwrgs_model.copy() [kwrgs.pop(k) for k in kwrgs_gridsearch.keys()] if 'feat_sel' in kwrgs: feat_sel = kwrgs.pop('feat_sel') else: feat_sel = None # Get training years x_fit_mask, y_fit_mask, x_pred_mask, y_pred_mask = utils.get_masks(df_norm) X = df_norm[keys] X = X.dropna(axis='columns') # drop only nan columns # X = add_constant(X) X_train = X[x_fit_mask.values] X_pred = X[x_pred_mask.values] RV_fit = y_ts['ts'].loc[y_fit_mask.index] # y_fit may be shortened # because X_test was used to predict y_train due to lag, hence train-test # leakage. # y_ts dates may no longer align with x_fit y_fit masks y_fit_mask = df_norm['TrainIsTrue'].loc[y_fit_mask.index].values y_train = RV_fit[y_fit_mask].squeeze() # if y_pred_mask is not None: # y_dates = RV_fit[y_pred_mask.values].index # else: # y_dates = RV_fit.index X = X_train # # Create stratified random shuffle which keeps together years as blocks. kwrgs_cv = ['kfold', 'seed'] kwrgs_cv = {k: i for k, i in kwrgs.items() if k in kwrgs_cv} [kwrgs.pop(k) for k in kwrgs_cv.keys()] if len(kwrgs_cv) >= 1: cv = utils.get_cv_accounting_for_years(y_train, **kwrgs_cv) kwrgs['store_cv_values'] = False else: cv = None kwrgs['store_cv_values'] = True model = RidgeCV(cv=cv, **kwrgs) if feat_sel is not None: if feat_sel['model'] is None: feat_sel['model'] = model model, new_features, rfecv = utils.feature_selection( X_train, y_train.values, **feat_sel) X_pred = X_pred[new_features] else: model.fit(X_train, y_train) y_pred = model.predict(X_pred) prediction = pd.DataFrame(y_pred, index=y_pred_mask.index, columns=[0]) model.X_pred = X_pred model.name = 'Ridge Regression' #%% return prediction, model