def _create_feature(cls, conf) -> pd.DataFrame:
        """Derive ratio, aggregate and threshold features from the application table.

        Args:
            cls: Owning feature class (not used by this method).
            conf: Configuration object forwarded to ``Application.get_df``.

        Returns:
            A frame holding the unique ``SK_ID_CURR`` values followed by the
            derived columns, in the order they are assigned below.

        Note:
            Series assignments are aligned on the index, whereas the
            ``nanmedian`` aggregate is a plain array and is assigned by
            position. Both only line up when the application frame has a
            default 0..n-1 index and no repeated ``SK_ID_CURR`` -- presumably
            guaranteed by ``Application.get_df``; TODO confirm.
        """
        app = Application.get_df(conf)
        features = pd.DataFrame({'SK_ID_CURR': app['SK_ID_CURR'].unique()})

        # Short aliases for the source columns used several times below.
        annuity = app['AMT_ANNUITY']
        credit = app['AMT_CREDIT']
        income = app['AMT_INCOME_TOTAL']
        birth = app['DAYS_BIRTH']
        employed = app['DAYS_EMPLOYED']
        car_age = app['OWN_CAR_AGE']
        children = app['CNT_CHILDREN']
        family = app['CNT_FAM_MEMBERS']
        phone_change = app['DAYS_LAST_PHONE_CHANGE']

        features['annuity_income_percentage'] = annuity / income
        features['car_to_birth_ratio'] = car_age / birth
        features['car_to_employ_ratio'] = car_age / employed
        features['children_ratio'] = children / family
        features['credit_to_annuity_ratio'] = credit / annuity
        features['credit_to_goods_ratio'] = credit / app['AMT_GOODS_PRICE']
        features['credit_to_income_ratio'] = credit / income
        features['days_employed_percentage'] = employed / birth
        features['income_credit_percentage'] = income / credit
        # "+ 1" keeps the denominator non-zero for applicants without children.
        features['income_per_child'] = income / (1 + children)
        features['income_per_person'] = income / family
        features['payment_rate'] = annuity / credit
        features['phone_to_birth_ratio'] = phone_change / birth
        features['phone_to_employ_ratio'] = phone_change / employed
        features['external_sources_weighted'] = (
            app.EXT_SOURCE_1 * 2 + app.EXT_SOURCE_2 * 3 + app.EXT_SOURCE_3 * 4)

        features['cnt_non_child'] = family - children
        features['child_to_non_child_ratio'] = (
            children / features['cnt_non_child'])
        features['income_per_non_child'] = income / features['cnt_non_child']
        features['credit_per_person'] = credit / family
        features['credit_per_child'] = credit / (1 + children)
        features['credit_per_non_child'] = credit / features['cnt_non_child']

        # Row-wise aggregates over the three external scores. The numpy
        # function is looked up by name with getattr instead of eval, so no
        # string is ever evaluated as code.
        ext_sources = app[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
        for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
            aggregate = getattr(np, function_name)
            features['external_sources_{}'.format(function_name)] = aggregate(
                ext_sources, axis=1)

        # NOTE(review): the flag is 1 when DAYS_EMPLOYED is below -2000; the
        # name suggests the opposite direction -- confirm the intended sign.
        features['short_employment'] = (employed < -2000).astype(int)
        features['young_age'] = (birth < -14000).astype(int)

        return features
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Target-encode every object-typed column of the application table.

     The encoder is fitted on the rows whose ``TARGET`` is not null and is
     then applied to all rows.

     Args:
         cls: Owning feature class (not used by this method).
         conf: Configuration object forwarded to ``Application.get_df``.

     Returns:
         The encoded object-typed columns, each renamed to
         ``<column>_target_encode``, together with ``SK_ID_CURR``.
     """
     app = Application.get_df(conf)
     object_cols = [
         name for name in app.columns if app[name].dtype == 'object'
     ]

     # Only rows with a known target take part in fitting.
     labelled = app[app['TARGET'].notnull()].copy()
     encoder = TargetEncoder(cols=object_cols)
     encoder.fit(labelled, labelled['TARGET'])

     encoded = encoder.transform(app)
     renames = {name: f"{name}_target_encode" for name in object_cols}
     return encoded[object_cols + ['SK_ID_CURR']].rename(columns=renames)
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Attach, for each object-typed column, the row count of its value.

        Args:
            cls: Owning feature class (not used by this method).
            conf: Configuration object forwarded to ``Application.get_df``.

        Returns:
            ``SK_ID_CURR`` plus one ``app_<column>_value_count`` column per
            object-typed column; the object-typed columns themselves are
            dropped.
        """
        app = Application.get_df(conf)
        object_cols = [
            name for name in app.columns if app[name].dtype == 'object'
        ]
        result = app[object_cols + ['SK_ID_CURR']]

        for name in object_cols:
            # Count the SK_ID_CURR entries per distinct value of the column.
            counts = result[[name, 'SK_ID_CURR']].groupby(name)[[
                'SK_ID_CURR'
            ]].count()
            counts = counts.reset_index()
            counts = counts.rename(
                columns={'SK_ID_CURR': f'app_{name}_value_count'})
            # Left join puts the count back on every row with that value.
            result = result.merge(counts, how='left', on=name)

        return result.drop(columns=object_cols)
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Attach the row count of every pair of object-typed column values.

        Args:
            cls: Owning feature class (not used by this method).
            conf: Configuration object forwarded to ``Application.get_df``.

        Returns:
            ``SK_ID_CURR`` plus one ``app_<col1>_<col2>_value_count`` column
            per unordered pair of object-typed columns; the object-typed
            columns themselves are dropped.
        """
        app = Application.get_df(conf)
        object_cols = [
            name for name in app.columns if app[name].dtype == 'object'
        ]
        result = app[object_cols + ['SK_ID_CURR']]

        # Materialised so the progress bar knows how many pairs there are.
        pairs = list(itertools.combinations(object_cols, 2))
        for first, second in tqdm(pairs):
            keys = [first, second]
            counts = result[keys + ['SK_ID_CURR']].groupby(by=keys)[[
                'SK_ID_CURR'
            ]].count()
            counts = counts.reset_index()
            counts = counts.rename(
                columns={'SK_ID_CURR': f'app_{first}_{second}_value_count'})
            # Left join puts the pair count back on every matching row.
            result = result.merge(counts, how='left', on=keys)

        return result.drop(columns=object_cols)
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Build arithmetic combinations of application columns.

        Args:
            cls: Owning feature class (not used by this method).
            conf: Configuration object forwarded to ``Application.get_df``.

        Returns:
            A frame holding the unique ``SK_ID_CURR`` values followed by the
            derived ``app ...`` columns, in the order they are assigned below.
        """
        app = Application.get_df(conf)
        out = pd.DataFrame({'SK_ID_CURR': app['SK_ID_CURR'].unique()})

        ext_cols = ('EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3')
        ext_pairs = (
            ('EXT_SOURCE_1', 'EXT_SOURCE_2'),
            ('EXT_SOURCE_1', 'EXT_SOURCE_3'),
            ('EXT_SOURCE_2', 'EXT_SOURCE_3'),
        )

        # Pairwise products of the external scores.
        for left, right in ext_pairs:
            out[f'app {left} * {right}'] = app[left] * app[right]
        # Each external score combined with employment length and with age.
        for name in ext_cols:
            out[f'app {name} * DAYS_EMPLOYED'] = app[name] * app['DAYS_EMPLOYED']
        for name in ext_cols:
            out[f'app {name} / DAYS_BIRTH'] = app[name] / app['DAYS_BIRTH']

        credit = app['AMT_CREDIT']
        goods = app['AMT_GOODS_PRICE']
        income = app['AMT_INCOME_TOTAL']
        annuity = app['AMT_ANNUITY']

        out['app AMT_CREDIT - AMT_GOODS_PRICE'] = credit - goods
        out['app AMT_INCOME_TOTAL / 12 - AMT_ANNUITY'] = income / 12. - annuity
        out['app AMT_INCOME_TOTAL / AMT_ANNUITY'] = income / annuity
        out['app AMT_INCOME_TOTAL - AMT_GOODS_PRICE'] = income - goods

        # 0/1 flags marking goods prices that fall in two fixed value lists.
        as_flag = {True: 1, False: 0}
        most_popular_prices = [225000, 450000, 675000, 900000]
        popular_prices = [1125000, 1350000, 1575000, 1800000, 2250000]
        out['app most popular AMT_GOODS_PRICE'] = goods.isin(
            most_popular_prices).map(as_flag)
        out['app popular AMT_GOODS_PRICE'] = goods.isin(
            popular_prices).map(as_flag)

        out['app DAYS_EMPLOYED - DAYS_BIRTH'] = (
            app['DAYS_EMPLOYED'] - app['DAYS_BIRTH'])

        return out
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Join LDA latent vectors for a pair of columns onto the application rows.

        The two columns named by ``cls._col1`` and ``cls._col2`` are
        ordinal-encoded, passed to ``lda`` together with
        ``cls._n_components``, and the returned vectors are joined back on
        the encoded ``cls._col1`` value.

        Args:
            cls: Feature class providing ``_col1``, ``_col2`` and
                ``_n_components``.
            conf: Configuration object forwarded to ``Application.get_df``.

        Returns:
            ``SK_ID_CURR`` plus one ``<col1>_LDA_<col2>_dim<i>`` column per
            vector component.
        """
        first, second = cls._col1, cls._col2
        pair = Application.get_df(conf)[['SK_ID_CURR', 'TARGET', first, second]]
        pair = OrdinalEncoder(cols=[first, second]).fit_transform(pair)
        vectors = lda(cls._n_components, pair, first, second)

        # Transpose the vectors into one list per component (= one column).
        columns = {}
        for vector in vectors:
            for dim, score in enumerate(vector):
                columns.setdefault(f"{first}_LDA_{second}_dim{dim}",
                                   []).append(score)
        topics = pd.DataFrame(columns)

        # NOTE(review): the encoded col1 value is matched against the row
        # position of each vector; presumably `lda` orders its output so the
        # two agree (including the encoder's starting code) -- confirm.
        joined = pair.merge(topics,
                            how="left",
                            left_on=first,
                            right_index=True)
        return joined.drop(['TARGET', first, second], axis=1)
    def _create_feature(cls, conf) -> pd.DataFrame:
        """Compute out-of-fold target encodings of the object-typed columns.

        Rows with a non-null ``TARGET`` are split with ``StratifiedKFold``.
        For every fold an encoder is fitted on the training part and applied
        to the held-out part and to all rows with a null ``TARGET``. Rows
        that were encoded several times (the null-target rows, once per
        fold) are averaged per ``SK_ID_CURR``.

        Args:
            cls: Owning feature class (not used by this method).
            conf: Configuration object; ``conf.model.kfold_params`` supplies
                the ``StratifiedKFold`` arguments and is also forwarded to
                ``Application.get_df`` as part of ``conf``.

        Returns:
            A frame indexed by ``SK_ID_CURR`` with one
            ``<column>_target_encode`` column per object-typed column,
            columns in sorted order.
        """
        df = Application.get_df(conf)

        categorical_columns = [
            col for col in df.columns if df[col].dtype == 'object'
        ]
        # Loop-invariant helpers, built once instead of once per fold.
        encode_columns = categorical_columns + ['SK_ID_CURR']
        renames = {col: f"{col}_target_encode" for col in categorical_columns}

        # fit with train data and transform both data
        train_df = df[df['TARGET'].notnull()].copy()
        test_df = df[df['TARGET'].isnull()].copy()

        folds = StratifiedKFold(**conf.model.kfold_params)
        splits = folds.split(train_df[categorical_columns], train_df['TARGET'])

        parts = []
        for train_idx, valid_idx in tqdm(
                splits, total=conf.model.kfold_params.n_splits):
            fit_rows = train_df.iloc[train_idx]
            encoder = TargetEncoder(cols=categorical_columns).fit(
                fit_rows[encode_columns], fit_rows['TARGET'])
            valid_te = encoder.transform(
                train_df.iloc[valid_idx][encode_columns]).rename(
                    columns=renames)
            test_te = encoder.transform(
                test_df[encode_columns]).rename(columns=renames)
            parts.extend([valid_te, test_te])

        # DataFrame.append was removed in pandas 2.0; concatenate once
        # instead. The explicit column sort keeps the ordering that the
        # former append(..., sort=True) calls produced.
        feature = pd.concat(parts).sort_index(axis=1)

        # take mean of oof target mean for test data
        return feature.groupby('SK_ID_CURR').mean()
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Run the application table through the ``ApplicationFeatures`` pipeline.

     The steps are applied in order: ``_features_from_kernel``,
     ``_binarize_features``, ``one_hot_encoder`` and ``_filter_features``.

     Args:
         cls: Owning feature class (not used by this method).
         conf: Configuration object forwarded to ``Application.get_df``.

     Returns:
         Whatever ``ApplicationFeatures._filter_features`` returns for the
         processed frame.
     """
     raw = Application.get_df(conf)
     engineered = ApplicationFeatures._binarize_features(
         ApplicationFeatures._features_from_kernel(raw))
     # The second element returned by one_hot_encoder is not needed here.
     encoded, _ = one_hot_encoder(engineered, True)
     return ApplicationFeatures._filter_features(encoded)
Esempio n. 9
0
 def _create_feature(cls, conf) -> pd.DataFrame:
     """Return only the ``SK_ID_CURR`` and ``TARGET`` columns of the application table.

     Args:
         cls: Owning feature class (not used by this method).
         conf: Configuration object forwarded to ``Application.get_df``.
     """
     wanted = ["SK_ID_CURR", "TARGET"]
     app = Application.get_df(conf)
     return app[wanted]