Esempio n. 1
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Build per-``SK_ID_CURR`` means of target-encoded object columns.

     The Base frame is left-joined with the CreditCardBalance frame on
     ``SK_ID_CURR``. Every object-dtype column of the joined frame is
     passed through a ``TargetEncoder`` that is fitted only on rows whose
     ``TARGET`` is not null, and the encoded values are then averaged per
     ``SK_ID_CURR``.

     Args:
         conf: Configuration object forwarded unchanged to the ``get_df``
             loaders.

     Returns:
         A frame with the encoded columns, each renamed with a
         ``_target_encode`` suffix, followed by ``SK_ID_CURR``.
     """
     base = Base.get_df(conf)
     merged = base.merge(CreditCardBalance.get_df(conf), on="SK_ID_CURR", how="left")
     object_cols = [name for name in merged.columns if merged[name].dtype == 'object']

     # Fit on labelled rows only; rows without TARGET are transformed but
     # never take part in fitting.
     labelled = merged[merged['TARGET'].notnull()].copy()
     encoder = TargetEncoder(cols=object_cols).fit(labelled, labelled['TARGET'])
     encoded = encoder.transform(merged)

     per_id_mean = encoded.groupby(by=['SK_ID_CURR'], as_index=False).agg(
         dict.fromkeys(object_cols, 'mean')
     )
     suffixed = {name: f"{name}_target_encode" for name in object_cols}
     return per_id_mean[object_cols + ['SK_ID_CURR']].rename(columns=suffixed)
Esempio n. 2
0
def get_train_test(conf):
    """Split the Base frame into train and test parts by ``TARGET`` presence.

    When ``conf`` contains ``"stacking_features"``, the stacking feature
    frame is left-joined onto the Base frame on ``SK_ID_CURR`` first.

    Args:
        conf: Configuration object forwarded to the ``get_df`` loaders.

    Returns:
        ``(train_df, test_df)``: copies of the rows whose ``TARGET`` is
        non-null and null, respectively.
    """
    frame = Base.get_df(conf)  # pd.DataFrame

    if "stacking_features" in conf:
        StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features)
        stacked = StackingFeaturesWithPasses.get_df(conf)
        frame = frame.merge(stacked, how='left', on='SK_ID_CURR')

    # One mask serves both halves: labelled rows are train, the rest test.
    has_target = frame['TARGET'].notnull()
    train_df = frame[has_target].copy()
    test_df = frame[~has_target].copy()

    # Release the combined frame before handing the copies back.
    del frame, has_target
    gc.collect()
    return train_df, test_df
def get_train_test(conf):
    """Assemble the full feature frame and split it into train and test.

    Steps, in order:
      1. Load the Base frame.
      2. For every key in ``conf.features``, load the mapped feature frame
         and left-join it on ``SK_ID_CURR``. Depending on ``conf.options``,
         columns already present in the accumulated frame are dropped from
         the incoming frame and/or ``reduce_mem_usage`` is applied to it.
      3. If ``conf`` contains ``"stacking_features"``, left-join the
         stacking feature frame as well.
      4. If ``conf.options`` names a ``drop_features_list_file``, drop the
         columns listed in that file.
      5. If ``conf.options.clean_data`` is set, run ``clean_data``.
      6. Split by whether ``TARGET`` is null.

    Args:
        conf: Configuration object. ``conf.features`` and ``conf.options``
            are read here; the object is also forwarded to the loaders.

    Returns:
        ``(train_df, test_df)``: copies of the rows whose ``TARGET`` is
        non-null and null, respectively.

    Raises:
        ValueError, SyntaxError: If the drop-list file does not contain a
            plain Python literal (e.g. a list of column names).
    """
    import ast  # local import: only needed to parse the drop-list file

    df = Base.get_df(conf)  # pd.DataFrame

    feature_classes = [KEY_FEATURE_MAP[key] for key in conf.features]
    for feature in feature_classes:
        with timer(f"process {feature.__name__}"):
            f = feature.get_df(conf)
            if "drop_duplicate_column_on_merge" in conf.options and conf.options.drop_duplicate_column_on_merge:
                # Keep the join key; drop every other column that the
                # accumulated frame already has.
                cols_to_drop = [
                    c for c in f.columns
                    if (c in df.columns) and (c != 'SK_ID_CURR')
                ]
                if cols_to_drop:
                    print(f"drop columns: {cols_to_drop}")
                    f = f.drop(cols_to_drop, axis=1)
            if "reduce_mem_usage" in conf.options and conf.options.reduce_mem_usage:
                with timer("reduce_mem_usaga"):
                    f = reduce_mem_usage(f)
            df = df.merge(f, how='left', on='SK_ID_CURR')
            # Free the per-feature frame before loading the next one.
            del f
            gc.collect()

    if "stacking_features" in conf:
        StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features)
        f = StackingFeaturesWithPasses.get_df(conf)
        df = df.merge(f, how='left', on='SK_ID_CURR')

    if "drop_features_list_file" in conf.options:
        with open(conf.options.drop_features_list_file, "r") as fp:
            # SECURITY: this used to be eval(), which executes whatever
            # code the file contains. literal_eval accepts only Python
            # literals, so a tampered file can no longer run code.
            feature_to_drop = ast.literal_eval(fp.read().strip())
        print(f"drop columns in {conf.options.drop_features_list_file}")
        df = df.drop(feature_to_drop, axis=1)

    if "clean_data" in conf.options and conf.options.clean_data:
        with timer("clean_data"):
            df = clean_data(df)

    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    # Release the combined frame before handing the copies back.
    del df
    gc.collect()
    return train_df, test_df
Esempio n. 4
0
def get_train_test(conf):
    """Load all configured feature frames, join them, and split by ``TARGET``.

    The Base frame and one frame per key in ``conf.features`` are loaded
    first. They are then left-joined on ``SK_ID_CURR`` from left to right,
    starting from the Base frame.

    Args:
        conf: Configuration object. ``conf.features`` is read here; the
            object is also forwarded to the ``get_df`` loaders.

    Returns:
        ``(train_df, test_df)``: copies of the rows whose ``TARGET`` is
        non-null and null, respectively.
    """
    def _left_join(lhs, rhs):
        # Fold step: attach the next feature frame to the running result.
        return lhs.merge(rhs, how='left', on='SK_ID_CURR')

    frames = [Base.get_df(conf)]  # pd.DataFrame; Base is the leftmost frame

    selected = [KEY_FEATURE_MAP[key] for key in conf.features]
    for feature_cls in selected:
        with timer(f"load (or create) {feature_cls.__name__}"):
            frames.append(feature_cls.get_df(conf))

    with timer("join on SK_ID_CURR"):
        merged = reduce(_left_join, frames)
    # The individual frames are no longer needed once joined.
    del frames
    gc.collect()

    has_target = merged['TARGET'].notnull()
    train_df = merged[has_target].copy()
    test_df = merged[~has_target].copy()
    del merged, has_target
    gc.collect()
    return train_df, test_df
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Left-join BureauFeaturesAntonova onto Base and run ``clean_data``.

     Args:
         conf: Configuration object forwarded to the ``get_df`` loaders.

     Returns:
         The result of ``clean_data`` applied to the joined frame.
     """
     # Base is joined in because, per the original note, clean_data needs
     # the target information (presumably carried by the Base frame).
     joined = Base.get_df(conf).merge(
         BureauFeaturesAntonova.get_df(conf), on='SK_ID_CURR', how='left'
     )
     return clean_data(joined)
Esempio n. 6
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Left-join CreditCardBalanceFeaturesAntonova onto Base and clean it.

     Args:
         conf: Configuration object forwarded to the ``get_df`` loaders.

     Returns:
         The result of ``clean_data`` applied to the joined frame.
     """
     base = Base.get_df(conf)
     card_features = CreditCardBalanceFeaturesAntonova.get_df(conf)
     joined = base.merge(card_features, on="SK_ID_CURR", how="left")
     return clean_data(joined)
Esempio n. 7
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Left-join PreviousApplicationFeaturesAntonova onto Base and clean it.

     Args:
         conf: Configuration object forwarded to the ``get_df`` loaders.

     Returns:
         The result of ``clean_data`` applied to the joined frame.
     """
     base = Base.get_df(conf)
     previous_features = PreviousApplicationFeaturesAntonova.get_df(conf)
     joined = base.merge(previous_features, on="SK_ID_CURR", how="left")
     return clean_data(joined)
Esempio n. 8
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Left-join InstallmentsPaymentsFeaturesAntonova onto Base and clean it.

     Args:
         conf: Configuration object forwarded to the ``get_df`` loaders.

     Returns:
         The result of ``clean_data`` applied to the joined frame.
     """
     base = Base.get_df(conf)
     installment_features = InstallmentsPaymentsFeaturesAntonova.get_df(conf)
     joined = base.merge(installment_features, on="SK_ID_CURR", how="left")
     return clean_data(joined)