def get_oof_feature(oof_path='../oof_feature/*.gz', key='', pred_col='prediction'):
    """Gather out-of-fold prediction files into one wide DataFrame.

    Every file matching ``oof_path`` is loaded with ``utils.read_pkl_gzip``,
    indexed by ``key``, and reduced to its ``pred_col`` column. Each resulting
    Series is named ``"oof_"`` plus the name of the file's second column, and
    all Series are concatenated column-wise.

    Args:
        oof_path: Glob pattern selecting the OOF files to load.
        key: Name of the column to use as the index of every loaded frame.
            The default ``''`` only works if the files really contain a
            column with that name, so callers normally pass it explicitly.
        pred_col: Name of the column holding the predictions.

    Returns:
        A DataFrame with one ``oof_*`` column per matched file, in sorted
        path order, aligned on ``key``.

    Raises:
        ValueError: If no file matches ``oof_path``.
    """
    # Imported locally on purpose: elsewhere in this file ``glob`` is called
    # directly as a function, so the module-level name ``glob`` cannot be
    # relied on to be the module (``glob.glob`` would then fail).
    from glob import glob as _glob_paths

    # glob returns paths in arbitrary order; sort so the column order of the
    # result is reproducible across runs and machines.
    feat_path_list = sorted(_glob_paths(oof_path))
    if not feat_path_list:
        # pd.concat([]) would raise ValueError anyway; keep the exception
        # type but say which pattern matched nothing.
        raise ValueError(
            "No OOF feature files match pattern: {!r}".format(oof_path)
        )

    oof_list = []
    for path in feat_path_list:
        oof = utils.read_pkl_gzip(path)
        # The second column's name is used as the identifier of this OOF file.
        # NOTE(review): presumably column 0 is the key and column 1 is named
        # after the model/feature - confirm against the files' layout.
        oof_name = oof.columns.tolist()[1]
        oof = oof.set_index(key)[pred_col]
        oof.name = "oof_" + oof_name
        oof_list.append(oof)

    return pd.concat(oof_list, axis=1)
# Column names used throughout the training script.
COLUMN_ID = 'TransactionID'
COLUMN_DT = 'TransactionDT'
COLUMN_TARGET = 'isFraud'
COLUMN_GROUP = 'DT-M'
# NOTE(review): presumably the columns that must not be used as model
# features - confirm against where COLUMNS_IGNORE is consumed.
COLUMNS_IGNORE = [COLUMN_ID, COLUMN_DT, COLUMN_TARGET, COLUMN_GROUP, 'is_train', 'date']

# Collect the training feature files. glob returns paths in arbitrary order,
# so every group is sorted to make the load order reproducible.
paths_train = sorted(glob('../feature/raw_use/*_train.gz'))
paths_train += sorted(glob('../feature/org_use/*_train.gz'))
# Optional feature sets, currently disabled:
# paths_train += sorted(glob('../feature/sub_use/*_train.gz'))
# paths_train += sorted(glob('../feature/valid_use/*_train.gz'))

df_train = parallel_load_data(paths_train)

# Attach the precomputed group labels as the COLUMN_GROUP column.
# NOTE(review): if `group` is a pandas object this assignment aligns on the
# index of df_train - confirm that the indexes match.
group_kfold_path = '../input/0908_ieee__DT-M_GroupKFold.gz'
group = read_pkl_gzip(group_kfold_path)
df_train[COLUMN_GROUP] = group

#========================================================================
# Negative Down Sampling
#========================================================================
# Keep every positive (target == 1) row and a random `frac` share of the
# remaining rows.
frac = 0.2
# `sample` below is called without random_state, so it draws from NumPy's
# global RNG; seeding it here makes the down-sampling reproducible.
np.random.seed(seed)

df_pos = df_train[df_train[COLUMN_TARGET] == 1]
df_neg = df_train[df_train[COLUMN_TARGET] != 1]

# Release the full frame before building the down-sampled one to keep peak
# memory down.
del df_train
gc.collect()

df_neg = df_neg.sample(int(df_neg.shape[0] * frac))
# Positives come first, followed by the sampled negatives; the original index
# values are kept.
df_train = pd.concat([df_pos, df_neg], axis=0)