def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """
    Generate one-hot encoded count features per categorical column.

    SK_ID_CURR maps to many SK_ID_PREV entries, so the one-hot indicators
    for each category are aggregated (summed) by SK_ID_CURR.

    :param fn: Input file path for the raw data
    :param output_fn: Output file path for the features CSV
    :param output_features_dir: Directory for intermediate feature files
    :param usecols: Categorical column names to encode
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # Make sure the categorical columns are strings (object dtype).
    for col in label_cols:
        df[col] = df[col].astype(str)

    # Handle duplicate column names across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df,
                                                feat_code=FEAT_CODE,
                                                colnames=usecols,
                                                idxcol="SK_ID_CURR")

    # Build a unique row key, since SK_ID_CURR repeats across SK_ID_PREV.
    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"
    df[combined_idxcol] = df.apply(
        lambda x: "{}_{}".format(x['SK_ID_CURR'], x["SK_ID_PREV"]), axis=1)

    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)

    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Sum the encoded columns per SK_ID_CURR to get per-category counts.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'
    }
    agg_columns = sorted(agg_columns)
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the (column, 'sum') MultiIndex produced by agg.
    grp_df.columns = ["SK_ID_CURR"] + agg_columns
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
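
# Note: the row-wise df.apply(..., axis=1) above is correct but slow on the
# multi-million-row Home Credit tables. A vectorized equivalent (a sketch;
# it assumes both ID columns are integer-typed, as in the raw CSVs) would be:
#
#     df[combined_idxcol] = (
#         df["SK_ID_CURR"].astype(str) + "_" + df["SK_ID_PREV"].astype(str)
#     )
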
def generate_train_test_feature_encodings(cleandir, output_features_dir):
    # TODO: put train and test in separate directories
    # Read the processed training data.
    input_clean_csv = os.path.join(cleandir, 'application_train.clean.csv')
    print("encoding", input_clean_csv)
    df = pd.read_csv(input_clean_csv)

    # Fit the one-hot and label encoders on the training data only.
    one_hot_encoders_di, label_encoders_di = generate_encoders(df=df)
    df = add_onehot_col(df=df,
                        one_hot_encoders_di=one_hot_encoders_di,
                        idxcol="SK_ID_CURR",
                        output_feat_dir=output_features_dir,
                        drop=True,
                        filename_prefix="train_")
    df = add_label_col(df=df,
                       label_encoders_di=label_encoders_di,
                       idxcol="SK_ID_CURR",
                       output_feat_dir=output_features_dir,
                       drop=True,
                       filename_prefix="train_")
    fn = os.path.join(output_features_dir, 'application_train.csv.gz')
    df.to_csv(fn, compression='gzip', index=False)
    print("Wrote to", fn)

    # Read the processed test data and reuse the encoders fit on train,
    # so train and test get identical column layouts.
    input_clean_csv = os.path.join(cleandir, 'application_test.clean.csv')
    print("encoding", input_clean_csv)
    df = pd.read_csv(input_clean_csv)
    df = add_onehot_col(df=df,
                        one_hot_encoders_di=one_hot_encoders_di,
                        idxcol="SK_ID_CURR",
                        output_feat_dir=output_features_dir,
                        drop=True,
                        filename_prefix="test_")
    df = add_label_col(df=df,
                       label_encoders_di=label_encoders_di,
                       idxcol="SK_ID_CURR",
                       output_feat_dir=output_features_dir,
                       drop=True,
                       filename_prefix="test_")
    fn = os.path.join(output_features_dir, 'application_test.csv.gz')
    df.to_csv(fn, compression='gzip', index=False)
    print("Wrote to", fn)
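
# Example usage (a sketch; the paths are illustrative, not taken from the
# repo's configuration):
#
#     generate_train_test_feature_encodings(
#         cleandir="./data/clean",
#         output_features_dir="./output/application_features")
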
def do_label_onehot_sums(fn='./data/bureau.csv',
                         label_cols=('CREDIT_ACTIVE', 'CREDIT_CURRENCY',
                                     'CREDIT_TYPE'),
                         output_features_dir="./output/bureau_features"):
    """
    Generate one-hot encoded count features per categorical column.

    SK_ID_CURR maps to many SK_ID_BUREAU entries, so the one-hot indicators
    for each category are aggregated (summed) by SK_ID_CURR.

    :param fn: File path to bureau.csv
    :param label_cols: Categorical columns in the dataset
    :param output_features_dir: Directory path to write the features
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(label_cols)
    df = df[["SK_ID_CURR", "SK_ID_BUREAU"] + label_cols].copy()

    # Handle duplicate column names across raw datasets.
    label_cols = rename_cols_with_feat_name_prefix(df=df,
                                                   feat_code=FEAT_CODE,
                                                   colnames=label_cols,
                                                   idxcol="SK_ID_CURR")
    print("columns", df.columns)

    # Build a unique row key, since SK_ID_CURR repeats across SK_ID_BUREAU.
    combined_idxcol = "SK_ID_CURR__SK_ID_BUREAU"
    df[combined_idxcol] = df.apply(
        lambda x: "{}_{}".format(x['SK_ID_CURR'], x["SK_ID_BUREAU"]), axis=1)
    print("columns", df.columns)

    object_cols = dict(df[label_cols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)

    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Sum the encoded columns per SK_ID_CURR to get per-category counts.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_CURR__SK_ID_BUREAU'
    }
    agg_columns = sorted(agg_columns)
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the (column, 'sum') MultiIndex produced by agg.
    grp_df.columns = ["SK_ID_CURR"] + agg_columns

    output_fn = os.path.join(output_features_dir,
                             "bureau_features_label_features.csv.gz")
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
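
# The groupby(...).agg(['sum']) above turns the 0/1 one-hot indicators into
# per-category counts for each SK_ID_CURR. A self-contained toy illustration
# (not part of the pipeline; the data and column values are made up):
def _demo_onehot_sum():
    toy = pd.DataFrame({
        "SK_ID_CURR": [1, 1, 2],
        "CREDIT_ACTIVE": ["Active", "Closed", "Active"],
    })
    onehot = pd.get_dummies(toy, columns=["CREDIT_ACTIVE"])
    # Summing the indicators within each group yields counts per category:
    #             CREDIT_ACTIVE_Active  CREDIT_ACTIVE_Closed
    # SK_ID_CURR
    # 1                              1                     1
    # 2                              1                     0
    print(onehot.groupby("SK_ID_CURR").sum())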