コード例 #1
0
def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """
    Generate one-hot / label encoded features per categorical column and sum
    them per SK_ID_CURR.

    One SK_ID_CURR maps to many SK_ID_PREV entries, so rows are encoded at the
    (SK_ID_CURR, SK_ID_PREV) level and then aggregated (sum) by SK_ID_CURR.

    :param fn: File path to the input CSV; it must contain SK_ID_CURR,
        SK_ID_PREV and every column listed in ``usecols``
    :param output_fn: File path of the gzip-compressed features CSV to write
    :param output_features_dir: Directory handed to the encoding helpers as
        their ``output_feat_dir``
    :param usecols: Names of the categorical columns to encode
    :return: None
    """

    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # make sure the columns are strings (object type)
    for col in label_cols:
        df[col] = df[col].astype(str)

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df,
                                                feat_code=FEAT_CODE,
                                                colnames=usecols,
                                                idxcol="SK_ID_CURR")

    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"

    # Build the "<SK_ID_CURR>_<SK_ID_PREV>" row key with vectorized string
    # concatenation instead of a row-wise apply, which calls Python once per
    # row and is very slow on large inputs.
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_PREV"].astype(str))

    # Number of distinct values per (renamed) categorical column; this is the
    # input the encoder factory uses to build its encoders.
    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)

    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Everything except the id columns is an encoded feature to aggregate.
    agg_columns = sorted(
        set(temp_df.columns) -
        {'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'})
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # agg(['sum']) produces two-level column labels; flatten them back to the
    # plain feature names (column order follows agg_columns).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns

    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
コード例 #2
0
def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """
    Generate feature CSV with category count features.

    Rows are encoded at the (SK_ID_CURR, SK_ID_PREV) level and the encoded
    columns are then summed per SK_ID_CURR.

    :param fn: Input file path for raw data; the CSV must contain SK_ID_CURR,
        SK_ID_PREV and every column listed in ``usecols``
    :param output_fn: Output file path to write features CSV (gzip-compressed)
    :param output_features_dir: Output directory for intermediate files, if needed.
    :param usecols: categorical column names
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df, feat_code=FEAT_CODE,
                                                colnames=usecols, idxcol="SK_ID_CURR")

    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"

    # Build the "<SK_ID_CURR>_<SK_ID_PREV>" row key with vectorized string
    # concatenation instead of a row-wise apply, which calls Python once per
    # row and is very slow on large inputs.
    df[combined_idxcol] = df["SK_ID_CURR"].astype(str) + "_" + df["SK_ID_PREV"].astype(str)

    # Number of distinct values per (renamed) categorical column; this is the
    # input the encoder factory uses to build its encoders.
    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df, one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol, output_feat_dir=output_features_dir, drop=True,
                             filename_prefix="test_", force=True)

    temp_df = add_label_col(df=temp_df, label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol, output_feat_dir=output_features_dir, drop=True,
                            filename_prefix="test_", force=True)

    # Everything except the id columns is an encoded feature to aggregate.
    agg_columns = sorted(set(temp_df.columns) - {'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'})
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # agg(['sum']) produces two-level column labels; flatten them back to the
    # plain feature names (column order follows agg_columns).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns

    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
コード例 #3
0
def do_numeric_col_stats(fn, output_fn, usecols):
    """
    Generate features CSV with numeric features.

    :param fn: File path to the input CSV; it must contain SK_ID_CURR and
        every column listed in ``usecols``
    :param output_fn: File path of the gzip-compressed features CSV to write
    :param usecols: Names of the numeric columns to compute statistics for
        (any sequence: list or tuple)
    :return: None
    """
    # Calc basic statistics for numerical fields.

    df = pd.read_csv(fn)
    print("shape", df.shape)
    print("columns", df.columns)

    # list(...) so a tuple of column names works too; list + tuple would
    # raise TypeError.
    df = df[["SK_ID_CURR"] + list(usecols)].copy()
    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df, feat_code=FEAT_CODE,
                                                colnames=usecols, idxcol="SK_ID_CURR")

    temp_df = add_numeric_stats_cols(df=df, column_names=usecols)
    temp_df.to_csv(output_fn, compression="gzip", index=False)
    print("Wrote to", output_fn)
コード例 #4
0
def do_numeric_col_stats(fn_balance='./data/bureau_balance.csv',
                         fn='./data/bureau.csv',
                         output_features_dir="./output/bureau_features"):
    """
    Generate numeric statistics features for the bureau data and write them to
    ``bureau_numeric_features.csv.gz`` inside ``output_features_dir``.

    The statistics themselves come from ``add_numeric_stats_cols``; the
    original author describes them as ['min', 'max', 'mean', 'median'] for
    numerical columns (NOTE(review): confirm against that helper).

    :param fn_balance: File path to bureau_balance.csv
    :param fn: File path to bureau.csv
    :param output_features_dir: Directory path to write the features; it is
        created if it does not exist
    :return: None
    """
    df_bbalance = pd.read_csv(fn_balance)

    df = pd.read_csv(fn)
    # Left join keeps every bureau.csv row, with or without balance history.
    df = df.merge(df_bbalance, on="SK_ID_BUREAU", how="left")
    print("shape", df.shape)
    print("columns", df.columns)

    numeric_cols = [
        'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'CNT_CREDIT_PROLONG',
        'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
        'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'MONTHS_BALANCE'
    ]
    df = df[["SK_ID_CURR"] + numeric_cols].copy()

    # handle duplicate colnames across raw datasets.
    numeric_cols = rename_cols_with_feat_name_prefix(df=df,
                                                     feat_code=FEAT_CODE,
                                                     colnames=numeric_cols,
                                                     idxcol="SK_ID_CURR")

    temp_df = add_numeric_stats_cols(df=df, column_names=numeric_cols)

    # Make sure the target directory exists so the write cannot fail on a
    # missing folder after all the computation has been done.
    os.makedirs(output_features_dir, exist_ok=True)
    output_fn = os.path.join(output_features_dir,
                             "bureau_numeric_features.csv.gz")
    temp_df.to_csv(output_fn, compression="gzip", index=False)
    print("Wrote to", output_fn)
コード例 #5
0
def do_label_onehot_sums(fn='./data/bureau.csv',
                         label_cols=('CREDIT_ACTIVE', 'CREDIT_CURRENCY',
                                     'CREDIT_TYPE'),
                         output_features_dir="./output/bureau_features"):
    """
    Generate onehot encoded features per categorical column. (1) SK_ID_CURR maps to many
    SK_ID_BUREAU entries, so aggregate (sum) each category by SK_ID_CURR.

    The result is written to ``bureau_features_label_features.csv.gz`` inside
    ``output_features_dir``.

    :param fn: File path to bureau.csv
    :param label_cols: Categorical columns in the dataset
    :param output_features_dir: Directory path to write the features; it is
        also handed to the encoding helpers as their ``output_feat_dir``
    :return: None
    """
    df = pd.read_csv(fn)
    label_cols = list(label_cols)
    df = df[["SK_ID_CURR", "SK_ID_BUREAU"] + label_cols].copy()
    # handle duplicate colnames across raw datasets.
    label_cols = rename_cols_with_feat_name_prefix(df=df,
                                                   feat_code=FEAT_CODE,
                                                   colnames=label_cols,
                                                   idxcol="SK_ID_CURR")
    print("columns", df.columns)

    combined_idxcol = "SK_ID_CURR__SK_ID_BUREAU"

    # Build the "<SK_ID_CURR>_<SK_ID_BUREAU>" row key with vectorized string
    # concatenation instead of a row-wise apply, which calls Python once per
    # row and is very slow on large inputs.
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_BUREAU"].astype(str))
    print("columns", df.columns)

    # Number of distinct values per (renamed) categorical column; this is the
    # input the encoder factory uses to build its encoders.
    object_cols = dict(df[label_cols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)

    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Everything except the id columns is an encoded feature to aggregate.
    agg_columns = sorted(
        set(temp_df.columns) -
        {'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_CURR__SK_ID_BUREAU'})
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # agg(['sum']) produces two-level column labels; flatten them back to the
    # plain feature names (column order follows agg_columns).
    grp_df.columns = ["SK_ID_CURR"] + agg_columns

    # Make sure the target directory exists so the final write cannot fail on
    # a missing folder after all the computation has been done.
    os.makedirs(output_features_dir, exist_ok=True)
    output_fn = os.path.join(output_features_dir,
                             "bureau_features_label_features.csv.gz")
    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)