def _create_feature(cls, conf) -> pd.DataFrame:
    df = Base.get_df(conf)
    df = df.merge(CreditCardBalance.get_df(conf), on="SK_ID_CURR", how="left")
    # fit with train data and transform with both train and test data
    train_df = df[df['TARGET'].notnull()].copy()
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = TargetEncoder(cols=categorical_columns).fit(train_df, train_df['TARGET']).transform(df)
    df = df.groupby(by=['SK_ID_CURR'], as_index=False).agg({col: 'mean' for col in categorical_columns})
    return df[categorical_columns + ['SK_ID_CURR']].rename(
        columns={col: f"{col}_target_encode" for col in categorical_columns}
    )
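# For reference, a minimal standalone sketch of the fit-on-train / transform-on-all
# pattern above, using category_encoders.TargetEncoder on a toy frame
# (column names and values are illustrative, not taken from the dataset):
import pandas as pd
from category_encoders import TargetEncoder

toy = pd.DataFrame({
    "SK_ID_CURR": [1, 1, 2, 2, 3],
    "NAME_CONTRACT_STATUS": ["Active", "Signed", "Active", "Demand", "Active"],
    "TARGET": [0, 0, 1, 1, None],  # NaN TARGET marks test rows
})
train = toy[toy["TARGET"].notnull()]
enc = TargetEncoder(cols=["NAME_CONTRACT_STATUS"]).fit(train, train["TARGET"])  # learn encodings on train rows only
toy["NAME_CONTRACT_STATUS"] = enc.transform(toy)["NAME_CONTRACT_STATUS"]        # apply to train + test rows
encoded = toy.groupby("SK_ID_CURR", as_index=False)["NAME_CONTRACT_STATUS"].mean()  # one row per SK_ID_CURR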
def get_train_test(conf): df = Base.get_df(conf) # pd.DataFrame if "stacking_features" in conf: StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features) f = StackingFeaturesWithPasses.get_df(conf) df = df.merge(f, how='left', on='SK_ID_CURR') train_df = df[df['TARGET'].notnull()].copy() test_df = df[df['TARGET'].isnull()].copy() del df gc.collect() return train_df, test_df
def get_train_test(conf):
    df = Base.get_df(conf)  # pd.DataFrame
    feature_classes = [KEY_FEATURE_MAP[key] for key in conf.features]
    for feature in feature_classes:
        with timer(f"process {feature.__name__}"):
            f = feature.get_df(conf)
            if "drop_duplicate_column_on_merge" in conf.options and conf.options.drop_duplicate_column_on_merge:
                cols_to_drop = [
                    c for c in f.columns if (c in df.columns) and (c != 'SK_ID_CURR')
                ]
                if cols_to_drop:
                    print(f"drop columns: {cols_to_drop}")
                    f = f.drop(cols_to_drop, axis=1)
            if "reduce_mem_usage" in conf.options and conf.options.reduce_mem_usage:
                with timer("reduce_mem_usage"):
                    f = reduce_mem_usage(f)
            df = df.merge(f, how='left', on='SK_ID_CURR')
            del f
            gc.collect()

    if "stacking_features" in conf:
        StackingFeaturesWithPasses.set_result_dirs(conf.stacking_features)
        f = StackingFeaturesWithPasses.get_df(conf)
        df = df.merge(f, how='left', on='SK_ID_CURR')

    if "drop_features_list_file" in conf.options:
        with open(conf.options.drop_features_list_file, "r") as fp:
            line = fp.read()
            feature_to_drop = eval(line)  # the file stores a Python list literal of column names
        print(f"drop columns in {conf.options.drop_features_list_file}")
        df = df.drop(feature_to_drop, axis=1)

    if "clean_data" in conf.options and conf.options.clean_data:
        with timer("clean_data"):
            df = clean_data(df)

    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    del df
    gc.collect()
    return train_df, test_df
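# The conf accesses above (conf.features, conf.options.<flag>, optional
# "stacking_features") suggest an attribute-style config object. A hypothetical
# example, assuming OmegaConf; the actual config library and key names may differ:
from omegaconf import OmegaConf

conf = OmegaConf.create({
    "features": ["bureau", "credit_card_balance"],  # illustrative keys into KEY_FEATURE_MAP
    "stacking_features": ["lgbm_0", "lgbm_1"],      # optional: result directories of out-of-fold predictions
    "options": {
        "drop_duplicate_column_on_merge": True,
        "reduce_mem_usage": True,
        "drop_features_list_file": "config/features_to_drop.txt",  # file containing a Python list literal
        "clean_data": True,
    },
})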
def get_train_test(conf):
    df = Base.get_df(conf)  # pd.DataFrame
    feature_classes = [KEY_FEATURE_MAP[key] for key in conf.features]
    features = [df]
    for feature in feature_classes:
        with timer(f"load (or create) {feature.__name__}"):
            f = feature.get_df(conf)
            features.append(f)

    with timer("join on SK_ID_CURR"):
        df = reduce(
            lambda lhs, rhs: lhs.merge(rhs, how='left', on='SK_ID_CURR'),
            features)
        del features
        gc.collect()

    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    del df
    gc.collect()
    return train_df, test_df
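# Standalone sketch of the reduce-based left join above, with toy frames
# (frame and column names are illustrative):
from functools import reduce
import pandas as pd

base = pd.DataFrame({"SK_ID_CURR": [1, 2, 3], "TARGET": [0, 1, None]})
feat_a = pd.DataFrame({"SK_ID_CURR": [1, 2], "A_MEAN": [0.1, 0.2]})
feat_b = pd.DataFrame({"SK_ID_CURR": [2, 3], "B_SUM": [5, 7]})

# Fold the list into one wide frame, keeping every row of `base`
joined = reduce(lambda lhs, rhs: lhs.merge(rhs, how='left', on='SK_ID_CURR'),
                [base, feat_a, feat_b])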
def _create_feature(cls, conf) -> pd.DataFrame:
    base = Base.get_df(conf)
    df = BureauFeaturesAntonova.get_df(conf)
    # clean_data needs target info, so merge onto the base frame first
    return clean_data(base.merge(df, on='SK_ID_CURR', how='left'))
def _create_feature(cls, conf) -> pd.DataFrame:
    df = Base.get_df(conf)
    df = df.merge(CreditCardBalanceFeaturesAntonova.get_df(conf), on="SK_ID_CURR", how="left")
    return clean_data(df)
def _create_feature(cls, conf) -> pd.DataFrame:
    df = Base.get_df(conf)
    df = df.merge(PreviousApplicationFeaturesAntonova.get_df(conf), on="SK_ID_CURR", how="left")
    return clean_data(df)
def _create_feature(cls, conf) -> pd.DataFrame:
    df = Base.get_df(conf)
    df = df.merge(InstallmentsPaymentsFeaturesAntonova.get_df(conf), on="SK_ID_CURR", how="left")
    return clean_data(df)
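# The get_df/_create_feature pairs above imply a shared feature base class that
# caches each feature frame and only calls _create_feature on a cache miss.
# A minimal sketch of that pattern, assuming a pickle cache keyed by class name
# (the Feature name, conf.dataset.cache_directory, and the file layout are assumptions):
import os
import pandas as pd

class Feature:
    @classmethod
    def get_df(cls, conf) -> pd.DataFrame:
        cache_path = os.path.join(conf.dataset.cache_directory, f"{cls.__name__}.pkl")
        if os.path.exists(cache_path):
            return pd.read_pickle(cache_path)  # reuse the cached feature
        df = cls._create_feature(conf)         # build it once, then cache
        df.to_pickle(cache_path)
        return df

    @classmethod
    def _create_feature(cls, conf) -> pd.DataFrame:
        raise NotImplementedError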