Ejemplo n.º 1
0
def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """
    Generate one-hot and label-encoded features per categorical column.
    SK_ID_CURR maps to many SK_ID_PREV entries, so each encoded category
    is aggregated (summed) by SK_ID_CURR before writing the output.

    :param fn: Input CSV file path for the raw data.
    :param output_fn: Output file path for the gzipped features CSV.
    :param output_features_dir: Directory where intermediate feature files are written.
    :param usecols: Categorical column names to encode.
    :return: None
    """

    df = pd.read_csv(fn)
    label_cols = list(usecols)
    df = df[["SK_ID_CURR", "SK_ID_PREV"] + label_cols].copy()

    # make sure the columns are strings (object type)
    for col in label_cols:
        df[col] = df[col].astype(str)

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=df,
                                                feat_code=FEAT_CODE,
                                                colnames=usecols,
                                                idxcol="SK_ID_CURR")

    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"

    # Vectorized string concatenation; equivalent to a row-wise
    # "{}_{}".format(...) apply but runs at C speed instead of a
    # Python-level loop per row.
    df[combined_idxcol] = (df["SK_ID_CURR"].astype(str) + "_" +
                           df["SK_ID_PREV"].astype(str))

    object_cols = dict(df[usecols].nunique())
    one_hot_encoders_di, label_encoders_di = generate_encoders(
        df, object_cols=object_cols)
    temp_df = add_onehot_col(df=df,
                             one_hot_encoders_di=one_hot_encoders_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)

    temp_df = add_label_col(df=temp_df,
                            label_encoders_di=label_encoders_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # Everything except the ID columns gets summed per SK_ID_CURR.
    agg_columns = set(temp_df.columns) - {
        'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'
    }
    agg_columns = sorted(agg_columns)
    print("agg_columns", agg_columns)
    grp_df = temp_df.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    grp_df.reset_index(inplace=True)
    # Flatten the ('col', 'sum') MultiIndex back to plain column names.
    grp_df.columns = ["SK_ID_CURR"] + agg_columns

    grp_df.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
def _encode_split_and_write(df, one_hot_encoders_di, label_encoders_di,
                            output_features_dir, filename_prefix, out_name):
    """One-hot + label encode a single train/test split and write it out gzipped."""
    df = add_onehot_col(df=df,
                        one_hot_encoders_di=one_hot_encoders_di,
                        idxcol="SK_ID_CURR",
                        output_feat_dir=output_features_dir,
                        drop=True,
                        filename_prefix=filename_prefix)

    df = add_label_col(df=df,
                       label_encoders_di=label_encoders_di,
                       idxcol="SK_ID_CURR",
                       output_feat_dir=output_features_dir,
                       drop=True,
                       filename_prefix=filename_prefix)

    fn = os.path.join(output_features_dir, out_name)
    df.to_csv(fn, compression='gzip', index=False)
    print("Wrote to", fn)


def generate_train_test_feature_encodings(cleandir, output_features_dir):
    """
    Encode the cleaned application train/test CSVs and write feature files.

    Encoders are fit on the training split only and reused for the test
    split so both share identical categorical vocabularies.

    :param cleandir: Directory containing application_{train,test}.clean.csv.
    :param output_features_dir: Directory to write the encoded feature CSVs.
    :return: None
    """
    # TODO: put train and test in separate directories

    # read the processed datums
    input_clean_csv = os.path.join(cleandir, 'application_train.clean.csv')
    print("encoding", input_clean_csv)
    df = pd.read_csv(input_clean_csv)

    # generate one-hot encoders (fit on train only)
    one_hot_encoders_di, label_encoders_di = generate_encoders(df=df)

    _encode_split_and_write(df, one_hot_encoders_di, label_encoders_di,
                            output_features_dir, "train_",
                            'application_train.csv.gz')

    # read the processed datums
    input_clean_csv = os.path.join(cleandir, 'application_test.clean.csv')
    print("encoding", input_clean_csv)
    df = pd.read_csv(input_clean_csv)

    _encode_split_and_write(df, one_hot_encoders_di, label_encoders_di,
                            output_features_dir, "test_",
                            'application_test.csv.gz')
Ejemplo n.º 3
0
def do_label_onehot_sums(fn, output_fn, output_features_dir, usecols):
    """
    Build a category-count feature CSV, one row per SK_ID_CURR.

    :param fn: Input file path for raw data
    :param output_fn: Output file path to write features CSV
    :param output_features_dir: Output directory for intermediate files, if needed.
    :param usecols: categorical column names
    :return: None
    """
    frame = pd.read_csv(fn)
    cat_cols = list(usecols)
    frame = frame[["SK_ID_CURR", "SK_ID_PREV"] + cat_cols].copy()

    # handle duplicate colnames across raw datasets.
    usecols = rename_cols_with_feat_name_prefix(df=frame, feat_code=FEAT_CODE,
                                                colnames=usecols, idxcol="SK_ID_CURR")

    combined_idxcol = "SK_ID_CURR__SK_ID_PREV"

    # composite key: one unique id per (SK_ID_CURR, SK_ID_PREV) pair
    frame[combined_idxcol] = frame.apply(
        lambda row: "{}_{}".format(row['SK_ID_CURR'], row["SK_ID_PREV"]), axis=1)

    cardinalities = dict(frame[usecols].nunique())
    onehot_di, label_di = generate_encoders(frame, object_cols=cardinalities)

    encoded = add_onehot_col(df=frame, one_hot_encoders_di=onehot_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True, filename_prefix="test_", force=True)
    encoded = add_label_col(df=encoded, label_encoders_di=label_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True, filename_prefix="test_", force=True)

    # everything but the id columns gets summed per applicant
    id_cols = {'SK_ID_CURR', 'SK_ID_PREV', 'SK_ID_CURR__SK_ID_PREV'}
    agg_columns = sorted(set(encoded.columns) - id_cols)
    print("agg_columns", agg_columns)

    summed = encoded.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    summed.reset_index(inplace=True)
    summed.columns = ["SK_ID_CURR"] + agg_columns

    summed.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)
Ejemplo n.º 4
0
def do_label_onehot_sums(fn='./data/bureau.csv',
                         label_cols=('CREDIT_ACTIVE', 'CREDIT_CURRENCY',
                                     'CREDIT_TYPE'),
                         output_features_dir="./output/bureau_features"):
    """
    Generate onehot encoded features per categorical column. (1) SK_ID_CURR maps to many
    SK_ID_BUREAU entries, so aggregate (sum) each category by SK_ID_CURR.

    :param fn: File path to bureau.csv
    :param label_cols: Categorical columns in the dataset
    :param output_features_dir: Directory path to write the features
    :return:
    """
    bureau = pd.read_csv(fn)
    label_cols = list(label_cols)
    bureau = bureau[["SK_ID_CURR", "SK_ID_BUREAU"] + label_cols].copy()

    # handle duplicate colnames across raw datasets.
    label_cols = rename_cols_with_feat_name_prefix(df=bureau,
                                                   feat_code=FEAT_CODE,
                                                   colnames=label_cols,
                                                   idxcol="SK_ID_CURR")
    print("columns", bureau.columns)

    combined_idxcol = "SK_ID_CURR__SK_ID_BUREAU"

    # composite key per (SK_ID_CURR, SK_ID_BUREAU) pair
    bureau[combined_idxcol] = bureau.apply(
        lambda row: "{}_{}".format(row['SK_ID_CURR'], row["SK_ID_BUREAU"]),
        axis=1)
    print("columns", bureau.columns)

    cardinalities = dict(bureau[label_cols].nunique())
    onehot_di, label_di = generate_encoders(bureau, object_cols=cardinalities)

    encoded = add_onehot_col(df=bureau,
                             one_hot_encoders_di=onehot_di,
                             idxcol=combined_idxcol,
                             output_feat_dir=output_features_dir,
                             drop=True,
                             filename_prefix="test_",
                             force=True)
    encoded = add_label_col(df=encoded,
                            label_encoders_di=label_di,
                            idxcol=combined_idxcol,
                            output_feat_dir=output_features_dir,
                            drop=True,
                            filename_prefix="test_",
                            force=True)

    # sum every encoded column per applicant, excluding the id columns
    id_cols = {'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_CURR__SK_ID_BUREAU'}
    agg_columns = sorted(set(encoded.columns) - id_cols)
    print("agg_columns", agg_columns)

    summed = encoded.groupby("SK_ID_CURR")[agg_columns].agg(['sum'])
    summed.reset_index(inplace=True)
    summed.columns = ["SK_ID_CURR"] + agg_columns

    output_fn = os.path.join(output_features_dir,
                             "bureau_features_label_features.csv.gz")
    summed.to_csv(output_fn, compression='gzip', index=False)
    print("Wrote to", output_fn)