def save_feature(df_feat, prefix, dir_save, is_train, auto_type=True, list_ignore=None, is_check=False, is_viz=True):
    """Persist each column of `df_feat` as a gzip-pickled array under ../feature/<dir_save>.

    Each column is saved to '<prefix>__<col>_train' or '..._test' (gzip pickle);
    columns whose file already exists are skipped.

    Args:
        df_feat: DataFrame whose columns are individual features.
        prefix: filename prefix for every saved feature.
        dir_save: sub-directory under ../feature to write into.
        is_train: True -> '_train' suffix, False -> '_test' suffix.
        auto_type: cast values to float32 before saving (default True).
        list_ignore: column names to skip entirely (default: none).
        is_check: if True, only report NaN / inf columns, then exit the process.
        is_viz: print shape/name of each feature as it is saved.
    """
    # Avoid the shared-mutable-default pitfall (original signature used list_ignore=[]).
    if list_ignore is None:
        list_ignore = []

    DIR_FEATURE = Path('../feature') / dir_save
    length = len(df_feat)

    if is_check:
        # Diagnostic mode: report columns containing NaN or +/-inf, then stop.
        for col in df_feat.columns:
            if col in list_ignore:
                continue
            # Any NaN present? (row count minus non-null count)
            null_len = df_feat[col].dropna().shape[0]
            if length - null_len > 0:
                print(f"{col} | null shape: {length - null_len}")
            # Any +/-inf present?
            max_val = df_feat[col].max()
            min_val = df_feat[col].min()
            if max_val == np.inf or min_val == -np.inf:
                print(f"{col} | max: {max_val} | min: {min_val}")
        print(" * Finish Feature Check.")
        # NOTE(review): terminates the whole process by design — check mode never saves.
        sys.exit()

    for col in df_feat.columns:
        if col in list_ignore:
            continue
        if auto_type:
            feature = df_feat[col].values.astype('float32')
        else:
            feature = df_feat[col].values

        # Single path construction instead of duplicated train/test branches.
        suffix = 'train' if is_train else 'test'
        feat_path = DIR_FEATURE / f'{prefix}__{col}_{suffix}'

        # Skip features that were already saved (to_pkl_gzip appends '.gz').
        if os.path.exists(str(feat_path) + '.gz'):
            continue
        if is_viz:
            print(f"{feature.shape} | {col}")
        utils.to_pkl_gzip(path=str(feat_path), obj=feature)
def ieee_cv(logger, df_train, Y, df_test, COLUMN_GROUP, use_cols, params=None, cols_categorical=None, is_adv=False, is_valid=False):
    """Run K-fold CV with `Classifier`, saving OOF/test predictions and importances.

    Args:
        logger: logger used for per-fold score lines.
        df_train: training DataFrame (must contain COLUMN_GROUP and COLUMN_ID).
        Y: target Series aligned with df_train.
        df_test: test DataFrame (may be empty).
        COLUMN_GROUP: column used for GroupKFold grouping / fold reporting.
        use_cols: feature columns fed to the model.
        params: model params; must contain 'seed', 'model_type', 'n_splits',
            'fold' and 'early_stopping_rounds'. The caller's dict is NOT mutated.
        cols_categorical: categorical feature names passed through to Classifier.
        is_adv: adversarial-validation mode — return OOF predictions early.
        is_valid: validation-only mode — skip building the prediction frame.

    Returns:
        (best_iteration, cv_score, df_feim, pred_result, score_list, test_preds)
        For is_adv=True: (0, cv_score, df_feim, Series of OOF preds, [], []).
    """
    start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())[:13]

    # Work on a copy: the original `del params[...]` clobbered the caller's dict
    # (and would KeyError on the old `params={}` mutable default).
    params = {} if params is None else dict(params)
    if cols_categorical is None:
        cols_categorical = []

    seed = params.pop('seed')
    model_type = params.pop('model_type')
    n_splits = params.pop('n_splits')
    validation = params.pop('fold')
    # NOTE(review): as in the original, 'early_stopping_rounds' is read but left
    # inside params (which is also passed to Classifier) — confirm that is intended.
    early_stopping_rounds = params['early_stopping_rounds']

    if validation == "stratified":
        # NOTE(review): random_state without shuffle=True is rejected by newer
        # sklearn releases — confirm the pinned sklearn version before enabling shuffle.
        kfold = list(StratifiedKFold(n_splits=n_splits, random_state=seed).split(df_train, Y))
    elif validation == 'group':
        kfold = list(GroupKFold(n_splits=n_splits).split(df_train, Y, df_train[COLUMN_GROUP]))
    else:
        # Original fell through with `kfold` undefined (NameError); fail loudly instead.
        raise ValueError(f"Unknown fold type: {validation}")

    score_list = []
    feim_list = []
    best_iteration = 0
    y_pred = np.zeros(len(df_train))
    test_preds = []
    x_test = df_test if len(df_test) else []

    for n_fold, (trn_idx, val_idx) in enumerate(kfold):
        x_train = df_train.iloc[trn_idx][use_cols]
        y_train = Y.iloc[trn_idx]
        x_valid = df_train.iloc[val_idx][use_cols]
        y_valid = Y.iloc[val_idx]

        # Most frequent group value in this fold's validation set, for reporting.
        val_gr = df_train.iloc[val_idx][COLUMN_GROUP].value_counts()
        dtm = val_gr.index.tolist()[0]
        print("="*20)
        with timer(f" * Fold{n_fold} Validation-{COLUMN_GROUP} {dtm}: {val_gr.values[0]}"):
            score, oof_pred, test_pred, feim, best_iter, _ = Classifier(
                model_type=model_type,
                x_train=x_train,
                y_train=y_train,
                x_valid=x_valid,
                y_valid=y_valid,
                x_test=x_test[use_cols],
                params=params,
                early_stopping_rounds=early_stopping_rounds,
                cols_categorical=cols_categorical,
            )
            pb, pv, al = bear_validation(test_pred)
            logger.info(f" * Fold{n_fold} {dtm}: {score} | Bear's...PB:{pb} PV:{pv} All:{al}")
            print("="*20)

        score_list.append(score)
        best_iteration += best_iter / n_splits
        y_pred[val_idx] = oof_pred
        test_preds.append(test_pred)
        if len(feim):
            feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True)
            feim.set_index('feature', inplace=True)
            feim_list.append(feim)

    cv_score = np.mean(score_list)
    cvs = str(cv_score).replace('.', '-')

    # Original tested `len(feim)` (the *last* fold's loop variable); test the
    # accumulated list instead so one empty final fold cannot discard the rest.
    if feim_list:
        df_feim = pd.concat(feim_list, axis=1)
        df_feim['imp_avg'] = df_feim.mean(axis=1)
        df_feim.sort_values(by='imp_avg', ascending=False, inplace=True)
    else:
        df_feim = []

    ## Save
    # Each fold's test predictions
    to_pkl_gzip(obj=test_preds, path=f'../output/fold_test_pred/{start_time}_Each_Fold__CV{cvs}__feature{len(use_cols)}')
    # Feature importance
    if feim_list:
        to_pkl_gzip(obj=df_feim, path=f"../output/feature_importances/{start_time}__CV{cvs}__feature{len(use_cols)}")

    #========================================================================
    # Adversarial validation reuses this function; in that case we stop here.
    #========================================================================
    if is_adv:
        pred_result = pd.Series(y_pred, index=df_train[COLUMN_ID].values, name='adv_pred_' + start_time)
        return 0, cv_score, df_feim, pred_result, [], []

    with timer(" * Make Prediction Result File."):
        if is_valid:
            pred_result = []
        else:
            test_pred_avg = np.mean(test_preds, axis=0)
            all_pred = np.append(y_pred, test_pred_avg)
            all_ids = np.append(df_train[COLUMN_ID].values, df_test[COLUMN_ID].values)
            pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T
            pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int')

    #========================================================================
    # Save
    #========================================================================
    # Prediction
    to_pkl_gzip(obj=pred_result, path=f"../output/pred_result/{start_time}__CV{cvs}__all_preds")
    # Submit file — only when a real DataFrame was built; the original would
    # raise AttributeError here on the is_valid path (pred_result is a list).
    if not is_valid:
        pred_result.columns = [COLUMN_ID, COLUMN_TARGET]
        pred_result.iloc[len(df_train):].to_csv(f"../submit/tmp/{start_time}__CV{cvs}__feature{len(use_cols)}.csv", index=False)

    return best_iteration, cv_score, df_feim, pred_result, score_list, test_preds
valid_sets=lgb_valid, early_stopping_rounds=early_stopping_rounds, num_boost_round=num_boost_round, verbose_eval=200) best_iter = estimator.best_iteration oof_pred = estimator.predict(x_valid) score = roc_auc_score(y_valid, oof_pred) cvs = str(score).replace('.', '-') feim = get_tree_importance(estimator=estimator, use_cols=x_train.columns) feim.sort_values(by='importance', ascending=False, inplace=True) feim['is_valid'] = feim['feature'].map(valid_map) #======================================================================== # PostProcess #======================================================================== with timer(" * PostProcess"): to_pkl_gzip( obj=feim, path= f"../output/selection_feature/{start_time}__CV{cvs}__feature{len(use_cols)}" ) for path in valid_paths_train: try: shutil.move(path, to_dir) shutil.move(path.replace('train', 'test'), to_dir) except FileNotFoundError: print(feature_name)
# # 三行もたないfeatureは各foldをクリアできなかった # if score < base_fold_score[fold]: # break # else: # cnt +=1 # cv += score/3 df_feim = pd.concat(feim_list, axis=1) df_feim['imp_avg'] = df_feim.mean(axis=1) df_feim.sort_values(by='imp_avg', ascending=False, inplace=True) avg_score = str(np.mean(score_list))[:9].replace('.', '-') to_pkl_gzip( obj=df_feim, path= f"../output/feature_importances/{start_time}__bear_valid__CV{avg_score}__feature{len(use_cols)}" ) if cnt == 3: with open(check_score_path, 'a') as f: line = f'{feature_name},{cv}\n' f.write(line) df_score = pd.read_csv(check_score_path, header=None) if len(df_score) > 2: from_dir = 'valid' to_dir = 'sub_use' df_score.columns = ['feature', 'score'] df_score.sort_values(by='score', ascending=False, inplace=True) best_feature = df_score['feature'].values[0]
feim.rename(columns={'importance': f'imp_fold{n_fold+1}'}, inplace=True) feim.set_index('feature', inplace=True) feim_list.append(feim) cv_score = np.mean(score_list) cvs = str(cv_score).replace('.', '-') df_feim = pd.concat(feim_list, axis=1) df_feim['imp_avg'] = df_feim.mean(axis=1) df_feim.sort_values(by='imp_avg', ascending=False, inplace=True) ## Save # Feature Importance to_pkl_gzip( obj=df_feim, path= f"../output/feature_importances/{start_time}__CV{cvs}__{COLUMN_TARGET}__feature{len(use_cols)}" ) with timer(" * Make Prediction Result File."): test_pred_avg = np.mean(test_preds, axis=0) all_pred = np.append(y_pred, test_pred_avg) all_ids = np.append(tmp_train[COLUMN_ID].values, df_test[COLUMN_ID].values) pred_result = pd.DataFrame([all_ids, all_pred], index=[COLUMN_ID, 'pred_' + start_time]).T pred_result[COLUMN_ID] = pred_result[COLUMN_ID].astype('int') #======================================================================== # Save #======================================================================== # Prediction