Example 1
    def _bureau_and_balance(self, configs):
        current_index = self.data_index['bureau']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        # Read data and merge
        df = self.data_raw['bureau']
        bb = self.data_raw['bureau_balance']
        logger.info("Bureau: {}, Bureau Balance: {}".format(
            df.shape, bb.shape))

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            bb, cat_cols_bb, new_cols_bb = process_one_hot_encode(
                bb, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'bureau': new_cols + new_cols_bb})
        else:
            # Fall back to detected categorical columns, as in the other
            # tables, so cat_cols/cat_cols_bb are defined for the steps below
            cat_cols = IdentifyCategoricalColumn(df)
            cat_cols_bb = IdentifyCategoricalColumn(bb)

        agg_configs = self._split_configs(configs.copy(), 'bureau_balance')
        bb_agg = self._aggregate_pipeline(bb, cat_cols_bb,
                                          agg_configs)[current_index]
        df = df.set_index(current_index).join(bb_agg, how='left')
        bureau_cat_cols = cat_cols + [
            c for c in bb_agg if any(cc in c for cc in cat_cols_bb)
        ]
        # Conditional aggregation
        # Bureau: Active credits - using only numerical aggregations
        # Bureau: Closed credits - using only numerical aggregations
        agg_configs = self._split_configs(configs.copy(), 'bureau')
        bureau_agg = self._aggregate_pipeline(df, bureau_cat_cols,
                                              agg_configs)[major_index]
        return Cast64To32(bureau_agg)
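
process_one_hot_encode is not shown in these examples. Below is a minimal sketch of what it might look like, assuming it wraps pd.get_dummies; the three-value return (encoded frame, encoded source columns, newly created dummy columns) is inferred from the call sites above, not taken from the project.

import pandas as pd

def process_one_hot_encode(df, categorical_columns, nan_as_category=True):
    # Hypothetical sketch: one-hot encode the requested columns that are
    # present in df and report which dummy columns were created.
    original_columns = list(df.columns)
    columns = [c for c in categorical_columns if c in df.columns]
    df = pd.get_dummies(df, columns=columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, columns, new_columns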
Example 2
    def _application_train_test(self, configs):
        nan_as_category = configs.get('nan_as_category', False)

        # Read data and merge
        major_index = self.data_index['application_train']
        df = self.data_raw['application_train']
        test_df = self.data_raw['application_test']
        logger.info("Train samples: {}, test samples: {}".format(
            df.shape, test_df.shape))
        df = pd.concat([df, test_df], sort=False, ignore_index=True)

        df = process_drop_rows(df, process_configs=configs['filter_rows'])
        df = process_factorize(df,
                               process_configs=configs['factorize_columns'])

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'application': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        df = process_replace(df, process_configs=configs['replace_rows'])
        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])

        for c in configs.get('deep_interactions', []):
            df = process_deep_interactions(df, c)

        logger.info('prepare decomposition, application={}'.format(df.shape))
        df_ext = [
            process_decomposition(df, c) for c in configs['decomposition']
        ]
        df = pd.concat([df] + df_ext, axis=1, join='inner')
        logger.info('finished decompositions, application={}'.format(df.shape))
        df = Cast64To32(df)

        # Separate train/test: rows with a non-null target are the training
        # data, rows with a null target are the test data
        train_df = df.loc[df[
            self.target_column].notnull()].reset_index().set_index(major_index)
        test_df = df.loc[df[
            self.target_column].isnull()].reset_index().set_index(major_index)
        logger.info("Split into train samples: {}, test samples: {}".format(
            train_df.shape, test_df.shape))
        del df
        gc.collect()

        return train_df, test_df
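
Cast64To32 appears at the end of every helper. Here is a hedged sketch, assuming it simply downcasts 64-bit numeric columns to 32-bit to cut memory use; the project's actual implementation may be more selective.

import numpy as np

def Cast64To32(df):
    # Hypothetical sketch: downcast float64/int64 columns to 32-bit dtypes.
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype(np.float32)
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = df[col].astype(np.int32)
    return df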
Example 3
    def _pos_cash_balance(self, configs):
        current_index = self.data_index['pos_cash_balance']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['pos_cash_balance']
        logger.info("pos_cash: {}".format(df.shape))

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'pos_cash': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        pos_cash_agg = self._aggregate_pipeline(df, cat_cols,
                                                configs)[major_index]
        return Cast64To32(pos_cash_agg)
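
_aggregate_pipeline does the actual group-by work and is not shown here. A minimal sketch of the aggregation it presumably performs for a table like pos_cash_balance, assuming numeric columns get standard statistics, categorical/one-hot columns get means, and the resulting MultiIndex columns are flattened; the group key ('SK_ID_CURR' in the public Home Credit data) and the chosen statistics are illustrative.

def aggregate_table(df, cat_cols, group_key='SK_ID_CURR'):
    # Hypothetical sketch of one aggregation pass keyed on the application id.
    num_aggs = {c: ['min', 'max', 'mean', 'size']
                for c in df.select_dtypes('number').columns
                if c not in cat_cols and c != group_key}
    cat_aggs = {c: ['mean'] for c in cat_cols if c in df.columns}
    agg = df.groupby(group_key).agg({**num_aggs, **cat_aggs})
    # Flatten (column, statistic) MultiIndex names into single strings
    agg.columns = ['{}_{}'.format(col, stat).upper() for col, stat in agg.columns]
    return agg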
Example 4
    def _installments_payments(self, configs):
        current_index = self.data_index['installments_payments']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['installments_payments']
        logger.info("installments_payments: {}".format(df.shape))

        # installments_payments carries no raw categorical columns, so an
        # empty list is passed to the one-hot step instead of configs['onehot_columns']
        cat_cols = []
        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, cat_cols, nan_as_category)
            self.cols_one_hot.update({'installments_payments': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])
        installments_agg = self._aggregate_pipeline(df, cat_cols,
                                                    configs)[major_index]
        return Cast64To32(installments_agg)
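
process_interaction is driven by configs['interaction_columns']. Below is a hedged sketch of a config-driven interaction step that builds difference/ratio features such as the paid-versus-due comparisons typically made on this table; the config schema and feature names are assumptions, not the project's actual definitions.

def process_interaction(df, process_configs):
    # Hypothetical sketch: each entry maps a new feature name to
    # (left column, operator, right column).
    new_cols = []
    for name, (left, op, right) in process_configs.items():
        if op == 'sub':
            df[name] = df[left] - df[right]
        elif op == 'div':
            df[name] = df[left] / df[right]
        new_cols.append(name)
    return df, new_cols

# Illustrative config: compare the amount paid with the amount due.
interaction_configs = {
    'PAYMENT_DIFF': ('AMT_INSTALMENT', 'sub', 'AMT_PAYMENT'),
    'PAYMENT_PERC': ('AMT_PAYMENT', 'div', 'AMT_INSTALMENT'),
}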
Example 5
    def _previous_application(self, configs):
        current_index = self.data_index['previous_application']
        major_index = self.data_index['application_train']
        nan_as_category = configs.get('nan_as_category', False)

        df = self.data_raw['previous_application']
        logger.info("Previous application: {}".format(df.shape))

        if configs.get('onehot_encoding', False):
            df, cat_cols, new_cols = process_one_hot_encode(
                df, configs['onehot_columns'], nan_as_category)
            self.cols_one_hot.update({'previous_application': new_cols})
        else:
            cat_cols = IdentifyCategoricalColumn(df)

        df = process_replace(df, process_configs=configs['replace_rows'])
        df, interact_cols = process_interaction(
            df, process_configs=configs['interaction_columns'])
        # Previous applications categorical features
        # Previous Applications: Approved Applications - only numerical features
        # Previous Applications: Refused Applications - only numerical features
        prev_agg = self._aggregate_pipeline(df, cat_cols, configs)[major_index]

        return Cast64To32(prev_agg)
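
Each helper above returns a feature table indexed at the application level (the [major_index] selection). The following sketch shows how a caller might combine them with the train/test frames produced by _application_train_test; the function name and call pattern are assumptions, not part of the project.

def assemble_features(train_df, test_df, table_aggregates):
    # Hypothetical assembly: left-join every application-level aggregate
    # (bureau, previous_application, pos_cash, installments, ...) onto
    # the application-indexed train and test frames.
    for agg in table_aggregates:
        train_df = train_df.join(agg, how='left')
        test_df = test_df.join(agg, how='left')
    return train_df, test_df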