def ieee_cv(logger, df_train, Y, df_test, COLUMN_GROUP, use_cols,
            params=None, cols_categorical=None, is_adv=False, is_valid=False):
    """Run K-fold cross-validation training and prediction.

    Trains one model per fold via ``Classifier``, accumulates out-of-fold
    predictions, averages per-fold test predictions, aggregates feature
    importances, and writes prediction/importance/submission artifacts to
    disk under ``../output`` and ``../submit``.

    Args:
        logger: logger used for per-fold score reporting.
        df_train: training DataFrame (must contain COLUMN_GROUP and COLUMN_ID).
        Y: target Series aligned with ``df_train``.
        df_test: test DataFrame (may be empty).
        COLUMN_GROUP: column name used for GroupKFold grouping / fold reporting.
        use_cols: feature column names fed to the model.
        params: model config dict. Must contain 'seed', 'model_type',
            'n_splits', 'fold' and 'early_stopping_rounds'; the remaining
            entries are passed through to ``Classifier``. The caller's dict
            is NOT modified (a copy is taken).
        cols_categorical: categorical feature names for ``Classifier``.
        is_adv: adversarial-validation mode — return early after OOF preds.
        is_valid: validation-only mode — skip building the submission file.

    Returns:
        (best_iteration, cv_score, df_feim, pred_result, score_list, test_preds).
        In adversarial mode: (0, cv_score, df_feim, pred_result(Series), [], []).
    """
    start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

    # Copy the config so popping control keys does not mutate the caller's
    # dict (the original `del params[...]` made a second call raise KeyError).
    params = dict(params) if params else {}
    cols_categorical = list(cols_categorical) if cols_categorical else []

    seed = params.pop('seed')
    model_type = params.pop('model_type')
    n_splits = params.pop('n_splits')
    validation = params.pop('fold')
    early_stopping_rounds = params['early_stopping_rounds']

    if validation == "stratified":
        # NOTE(review): random_state has no effect (and newer sklearn raises)
        # unless shuffle=True is also passed — confirm intended fold scheme
        # before enabling shuffle, as it changes fold assignment.
        kfold = list(StratifiedKFold(n_splits=n_splits, random_state=seed).split(df_train, Y))
    elif validation == 'group':
        kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))

    score_list = []
    feim_list = []
    best_iteration = 0
    y_pred = np.zeros(len(df_train))
    test_preds = []

    # NOTE(review): when df_test is empty, x_test is a plain list and
    # x_test[use_cols] below would fail — presumably df_test is always
    # non-empty in real runs; verify against callers.
    if len(df_test):
        x_test = df_test
    else:
        x_test = []

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        x_train = df_train.iloc[trn_idx][use_cols]
        y_train = Y.iloc[trn_idx]
        x_valid = df_train.iloc[val_idx][use_cols]
        y_valid = Y.iloc[val_idx]

        # Most frequent group value in this validation fold, for reporting.
        val_gr = df_train.iloc[val_idx][COLUMN_GROUP].value_counts()
        dtm = val_gr.index.tolist()[0]

        print("=" * 20)
        with timer(f" * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"):
            score, oof_pred, test_pred, feim, best_iter, _ = Classifier(
                model_type=model_type,
                x_train=x_train,
                y_train=y_train,
                x_valid=x_valid,
                y_valid=y_valid,
                x_test=x_test[use_cols],
                params=params,
                early_stopping_rounds=early_stopping_rounds,
                cols_categorical=cols_categorical,
            )

            pb, pv, al = bear_validation(test_pred)
            logger.info(f" * Fold{n_fold} {dtm}: {score} | Bear's...PB:{pb} PV:{pv} All:{al}")
            print("=" * 20)

        score_list.append(score)
        best_iteration += best_iter / n_splits
        y_pred[val_idx] = oof_pred
        test_preds.append(test_pred)

        if len(feim):
            feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
            feim.set_index('feature', inplace=True)
            feim_list.append(feim)

    cv_score = np.mean(score_list)
    cvs = str(cv_score).replace('.', '-')

    # Aggregate importances across folds. Fixed: guard on the collected list,
    # not on the last fold's frame.
    if len(feim_list):
        df_feim = pd.concat(feim_list, axis=1)
        df_feim['imp_avg'] = df_feim.mean(axis=1)
        df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)
    else:
        df_feim = []

    ## Save
    # Each Fold Test Pred
    to_pkl_gzip(obj=test_preds, path=f'../output/fold_test_pred/{start_time}_Each_Fold__CV{cvs}__feature{len(use_cols)}')
    # Feature Importance
    if len(feim_list):
        to_pkl_gzip(obj=df_feim, path=f"../output/feature_importances/{start_time}__CV{cvs}__feature{len(use_cols)}")

    #========================================================================
    # Adversarial Validation also uses this function, so in the adversarial
    # case we stop here (only OOF predictions are needed).
    #========================================================================
    if is_adv:
        pred_result = pd.Series(y_pred, index=df_train[COLUMN_ID].values, name='adv_pred_' + start_time)
        return 0, cv_score, df_feim, pred_result, [], []

    with timer(" * Make Prediction Result File."):
        if is_valid:
            # Validation-only run: no submission artifact is produced.
            pred_result = []
        else:
            test_pred_avg = np.mean(test_preds, axis=0)
            all_pred = np.append(y_pred, test_pred_avg)
            all_ids = np.append(df_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
            pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T
            pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')

            #========================================================================
            # Save — guarded under the non-valid branch: these lines require a
            # DataFrame and would crash on the empty-list pred_result above.
            #========================================================================
            # Prediction
            to_pkl_gzip(obj=pred_result, path=f"../output/pred_result/{start_time}__CV{cvs}__all_preds")
            # Submit File (test rows only)
            pred_result.columns = [COLUMN_ID, COLUMN_TARGET]
            pred_result.iloc[len(df_train):].to_csv(f"../submit/tmp/{start_time}__CV{cvs}__feature{len(use_cols)}.csv", index=False)

    return best_iteration, cv_score, df_feim, pred_result, score_list, test_preds
if is_base or len(valid_path)==0: tmp_train = df_train.copy() feature_name = 'base' else: df_feat_train = parallel_load_data(valid_path) tmp_train = df_train.join(df_feat_train) feature_name = get_filename(valid_path[0]) use_cols = [col for col in tmp_train.columns if col not in COLUMNS_IGNORE] cnt = 0 cv = 0 for fold in range(3): with timer(' * Make Dataset'): if fold==0: train = tmp_train[ (tmp_train[COLUMN_GROUP] == '2017-12') | (tmp_train[COLUMN_GROUP] == '2018-1') | (tmp_train[COLUMN_GROUP] == '2018-2') | (tmp_train[COLUMN_GROUP] == '2018-3') | (tmp_train[COLUMN_GROUP] == '2018-4') ] test = tmp_train[tmp_train[COLUMN_GROUP] == '2018-5'] elif fold==1: train = tmp_train[ (tmp_train[COLUMN_GROUP] == '2017-12') | (tmp_train[COLUMN_GROUP] == '2018-1') | (tmp_train[COLUMN_GROUP] == '2018-2') | (tmp_train[COLUMN_GROUP] == '2018-3') |
y_valid = Y.iloc[val_idx] x_train = x_train[~x_train[COLUMN_TARGET].isnull()][use_cols] x_trn_idx = x_train.index x_valid = x_valid[~x_valid[COLUMN_TARGET].isnull()][use_cols] x_val_idx = x_valid.index y_train = y_train.loc[x_trn_idx] y_valid = y_valid.loc[x_val_idx] base_valid = tmp_train.iloc[val_idx][use_cols] val_gr = tmp_train.iloc[val_idx][COLUMN_GROUP].value_counts() dtm = val_gr.index.tolist()[0] print("=" * 20) with timer( f" * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}" ): score, oof_pred, test_pred, feim, _ = Regressor( base_valid, model_type=model_type, x_train=x_train, y_train=y_train, x_valid=x_valid, y_valid=y_valid, x_test=x_test, params=params, early_stopping_rounds=early_stopping_rounds, ) score_list.append(score) y_pred[val_idx] = oof_pred