# ===== simple blending: average all model prediction columns =====
# Every column except the id/target columns is treated as one model's
# out-of-fold predictions (train rows) / test predictions (test rows).
raw['oof'] = raw.drop(columns=['card_id', 'target']).mean(axis=1)
train = raw[:len_train]  # first len_train rows are the training set

# RMSE of the averaged OOF predictions against the true target
cv_score = mean_squared_error(train['oof'], train['target'])**0.5
print("CV score: {:<8.5f}".format(cv_score))

# Persist the averaged predictions under a folder tagged with CV score + timestamp.
# NOTE(review): top_folder / now are defined elsewhere in the file — verify.
sub_folder = path.join(top_folder, 'average-CV-' + str(np.round(cv_score, 5)) + '_' + now)
makedirs(sub_folder, exist_ok=True)
raw[['card_id', 'oof']].to_csv(path.join(sub_folder, 'oof_average.csv'), index=False)
del raw['oof']  # drop the helper column so the stacking step below sees only model columns

# ========= stacking
train = raw[:len_train]
test = raw[len_train:]
# NOTE(review): uni_distribution is defined elsewhere — presumably re-samples
# the training set w.r.t. the target distribution; confirm its contract.
train = uni_distribution(train, 'target')
x_train = train.drop(columns=['card_id', 'target'])
y_train = train['target']
x_test = test.drop(columns=['card_id', 'target'])

# Deterministic 6-fold split (shuffle=False, so random_state has no effect)
folds = KFold(n_splits=6, shuffle=False, random_state=None)
oof = np.zeros(len(train))          # per-row out-of-fold predictions, filled per fold
predictions = np.zeros(len(test))   # accumulated test predictions

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_data, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    # NOTE(review): loop body continues beyond this chunk
    # (model fit / oof / predictions updates are not visible here).
train = raw[:len_train]   # first len_train rows are the training set
test = raw[len_train:]
del raw  # release the concatenated frame; train/test slices taken above

# === remove imbalance feature
# Two-sample Kolmogorov–Smirnov test per feature: a small p-value means the
# train and test distributions of that feature differ significantly, so the
# feature risks encoding the train/test split rather than the target.
list_p_value = []
for i in tqdm(feats):
    list_p_value.append(ks_2samp(test[i], train[i])[1])  # [1] = p-value
Se = pd.Series(list_p_value, index=feats).sort_values()
list_discarded = list(Se[Se < .1].index)  # p < 0.1 → distributions differ → drop
print(list_discarded)
for col in tqdm(list_discarded):
    feats.remove(col)  # mutate feats in place so other references see the filtered list

# === uniform distribution
# NOTE(review): uni_distribution is defined elsewhere — presumably re-samples
# the training set w.r.t. the target distribution; confirm its contract.
train = uni_distribution(df=train, key='target')
y_train = train['target']
train = train[feats]
test = test[feats]
gc.collect()

model = ridge.Ridge(alpha=1)  # L2-regularised linear model used as the stacker
# Deterministic 6-fold split (shuffle=False, so random_state has no effect)
folds = KFold(n_splits=6, shuffle=False, random_state=None)

col_all = [col for col in feats]  # snapshot of the full filtered feature list
# Feature subset actually fed to the stacker: the first model's OOF column
# plus two externally defined feature lists.
# NOTE(review): feats_old / feats_ds2 are defined elsewhere — verify contents.
col_use = [
    'oof_0',
]
col_use.extend(feats_old)
col_use.extend(feats_ds2)