def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """Generate onehot/label encoded features summed per SK_ID_CURR.

    One SK_ID_CURR maps to many SK_ID_PREV entries, so after encoding each
    categorical column the encoded indicator columns are aggregated (sum)
    by SK_ID_CURR.

    :param fn: Input file path for the raw CSV (must contain SK_ID_CURR,
        SK_ID_PREV and the columns in ``usecols``)
    :param output_fn: Output file path for the gzip-compressed features CSV
    :param output_features_dir: Directory for intermediate encoder files
    :param usecols: Categorical column names to encode
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # make sure the columns are strings (object type)
    for col in label_cols:
        df[col] = df[col].astype(str)

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df,
                                                feat_code=FEAT_CODE,
                                                colnames=usecols,
                                                idxcol="SK_ID_CURR")

    # Vectorized "<curr>_<prev>" key; equivalent to the row-wise
    # apply/format but runs at C speed.
    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_PREV"].astype(str))

    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Everything except the id columns is an encoded feature to aggregate.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'
    }
    agg_columns = sorted(list(agg_columns))
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the ('col', 'sum') MultiIndex produced by agg(['sum']).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """Generate feature CSV with category count features.

    One SK_ID_CURR maps to many SK_ID_PREV rows, so encoded category
    columns are summed per SK_ID_CURR.

    :param fn: Input file path for raw data
    :param output_fn: Output file path to write features CSV
    :param output_features_dir: Output directory for intermediate files, if needed.
    :param usecols: categorical column names
    :return: None
    """
    # NOTE(review): this variant does not cast ``usecols`` to str before
    # encoding (the sibling variant does) — presumably the inputs are
    # already strings; confirm against the caller.
    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df,
                                                feat_code=FEAT_CODE,
                                                colnames=usecols,
                                                idxcol="SK_ID_CURR")

    # Vectorized "<curr>_<prev>" key instead of a row-wise apply —
    # identical output, much faster on large frames.
    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_PREV"].astype(str))

    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Every non-id column is an encoded feature to be summed.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'
    }
    agg_columns = sorted(list(agg_columns))
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the ('col', 'sum') MultiIndex from agg(['sum']).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
def do_numeric_col_stats(fn, output_fn, usecols):
    """Compute basic statistics for numeric columns and write them as CSV.

    :param fn: Input file path for the raw CSV (must contain SK_ID_CURR
        and the columns in ``usecols``)
    :param output_fn: Output file path for the gzip-compressed features CSV
    :param usecols: Numeric column names to summarize
    :return: None
    """
    # Calc basic statistics for numerical fields.
    frame = pd.read_csv(fn)
    print("shape", frame.shape)
    print("columns", frame.columns)

    wanted = ["SK_ID_CURR"] + usecols
    frame = frame[wanted].copy()

    # handle duplicate colnames across raw datasets.
    renamed_cols = rename_cols_with_feat_name_prefix(df=frame,
                                                     feat_code=FEAT_CODE,
                                                     colnames=usecols,
                                                     idxcol="SK_ID_CURR")

    stats_frame = add_numeric_stats_cols(df=frame, column_names=renamed_cols)
    stats_frame.to_csv(output_fn, compression="gzip", index=False)
    print("Wrote to", output_fn)
def do_numeric_col_stats(fn_balance='./data/bureau_balance.csv',
                         fn='./data/bureau.csv',
                         output_features_dir="./output/bureau_features"):
    """Generate ['min', 'max', 'mean', 'median'] for numerical columns.

    Joins bureau_balance onto bureau via SK_ID_BUREAU, then computes
    per-SK_ID_CURR statistics for the numeric columns.

    :param fn_balance: File path to bureau_balance.csv
    :param fn: File path to bureau.csv
    :param output_features_dir: Directory path to write the features
    :return: None
    """
    df_bbalance = pd.read_csv(fn_balance)
    # (removed) the original re-assigned MONTHS_BALANCE to itself — a no-op.
    df = pd.read_csv(fn)
    # Left join keeps every bureau row even without balance history.
    df = df.merge(df_bbalance, on="SK_ID_BUREAU", how="left")
    print("shape", df.shape)
    print("columns", df.columns)

    numeric_cols = [
        'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'CNT_CREDIT_PROLONG',
        'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
        'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'MONTHS_BALANCE'
    ]
    df = df[["SK_ID_CURR"] + numeric_cols].copy()

    # handle duplicate colnames across raw datasets.
    numeric_cols = rename_cols_with_feat_name_prefix(df=df,
                                                     feat_code=FEAT_CODE,
                                                     colnames=numeric_cols,
                                                     idxcol="SK_ID_CURR")

    temp_df = add_numeric_stats_cols(df=df, column_names=numeric_cols)
    output_fn = os.path.join(output_features_dir,
                             "bureau_numeric_features.csv.gz")
    temp_df.to_csv(output_fn, compression="gzip", index=False)
    print("Wrote to", output_fn)
def do_label_onehot_sums(fn='./data/bureau.csv',
                         label_cols=('CREDIT_ACTIVE', 'CREDIT_CURRENCY',
                                     'CREDIT_TYPE'),
                         output_features_dir="./output/bureau_features"):
    """Generate onehot encoded features per categorical column.

    (1) SK_ID_CURR maps to many SK_ID_BUREAU entries, so aggregate (sum)
    each category by SK_ID_CURR.

    :param fn: File path to bureau.csv
    :param label_cols: Categorical columns in the dataset
    :param output_features_dir: Directory path to write the features
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(label_cols)
    df = df[["SK_ID_CURR", "SK_ID_BUREAU"] + label_cols].copy()

    # handle duplicate colnames across raw datasets.
    label_cols = rename_cols_with_feat_name_prefix(df=df,
                                                   feat_code=FEAT_CODE,
                                                   colnames=label_cols,
                                                   idxcol="SK_ID_CURR")
    print("columns", df.columns)

    # Vectorized "<curr>_<bureau>" key; same output as the previous
    # row-wise apply/format, but far faster on the full dataset.
    combined_idxcol = "SK_ID_CURR__SK_ID_BUREAU"
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_BUREAU"].astype(str))
    print("columns", df.columns)

    object_cols = dict(df[label_cols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Every non-id column is an encoded feature to be summed.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_CURR__SK_ID_BUREAU'
    }
    agg_columns = sorted(list(agg_columns))
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the ('col', 'sum') MultiIndex from agg(['sum']).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns
    output_fn = os.path.join(output_features_dir,
                             "bureau_features_label_features.csv.gz")
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)