Example 1
    def fit_transform(self, F, label, X_old):
        X_cat = F['CAT'].iloc[:, :cat_feat_num]
        X_new = pd.DataFrame()
        categories_record = {}
        for col in X_cat.columns:
            categories = list(set(X_cat[col].values))
            cat_num = len(categories)
            if ((cat_num <= self.max_cat_num) and (cat_num > 1)):
                try:
                    #F['cat_origin'][col + '_origin'] = X_cat[col].astype(int)
                    X_new[col + '_origin'] = pd.Categorical(X_cat[col])
                    categories_record[col] = categories
                except Exception:
                    # skip columns that cannot be cast to Categorical
                    pass

        # select generated features by importance
        auc_score, selected_features = select_feature_by_importance(
            X_new,
            X_old,
            label,
            importance_type='gain',
            scale=1.,
            use_all=True)
        X_new = X_new[selected_features]

        print('selected features: ', selected_features)
        for feat in selected_features:
            col = feat.split('_origin')[0]
            self.col_categories[col] = categories_record[col]

        return X_new
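The method above casts low-cardinality categorical columns to the pandas Categorical dtype and records their category sets for later use. Below is a minimal, self-contained sketch of just that casting step, on made-up data and with an illustrative max_cat_num value; the importance-based selection (select_feature_by_importance) is omitted.

import pandas as pd

X_cat = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                      'city': ['rome', 'rome', 'milan', 'rome']})
max_cat_num = 10  # illustrative threshold

X_new = pd.DataFrame()
for col in X_cat.columns:
    cat_num = X_cat[col].nunique()
    if 1 < cat_num <= max_cat_num:
        # keep the column with a pandas Categorical dtype
        X_new[col + '_origin'] = pd.Categorical(X_cat[col])

print(X_new.dtypes)  # both columns end up with dtype 'category'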
Example 2
    def fit_transform(self, F, label, X_old):

        try:
            self.X_cat = F['CAT'].iloc[:, :cat_feat_num]
            self.X_num = F['numerical']

            # select categorical columns used for aggregation
            cat_count = self.X_cat.nunique().sort_values(ascending=False)
            self.cat_used = list(cat_count.index[((cat_count >= self.min_cat_num) & \
                            (cat_count <= self.X_cat.shape[0] // self.max_cat_ratio))])
            print('cat used: ', self.cat_used)

            # aggregate
            X_mean_all, X_subtract_all = [], []
            for cat in self.cat_used:
                X = pd.concat([self.X_cat[[cat]], self.X_num], axis=1)
                X_mean = X.groupby(cat).mean()
                X_mean = X_mean.rename(columns={
                    col: col + '_mean_groupby_' + cat
                    for col in X_mean.columns
                })
                X_mean = X[[cat]].merge(X_mean.reset_index(),
                                        how='left',
                                        on=cat).iloc[:, 1:]
                X_subtract = pd.DataFrame(
                    self.X_num.values - X_mean.values,
                    columns=[col + '_subtract' for col in X_mean.columns])
                X_mean_all.append(X_mean)
                X_subtract_all.append(X_subtract)
            X_mean_all = pd.concat(X_mean_all, axis=1)
            X_subtract_all = pd.concat(X_subtract_all, axis=1)
            X_new = pd.concat([X_mean_all, X_subtract_all], axis=1)

            # feature selection
            auc_score, selected_features = select_feature_by_importance(
                X_new, X_old, label, importance_type='gain', scale=1.)
            X_new = X_new[selected_features]

            print('selected features: ', selected_features)
            for col in selected_features:
                num_col, cat_col = col.split('_mean_groupby_')
                if '_subtract' in cat_col:
                    cat_col = cat_col.split('_subtract')[0]
                    self.subtract_columns.setdefault(cat_col, []).append(num_col)
                else:
                    self.mean_columns.setdefault(cat_col, []).append(num_col)

            return X_new

        except Exception:
            # fall back to no generated features if anything above fails
            return pd.DataFrame()
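Example 2 builds two feature families per selected categorical column: the per-group mean of every numerical column, and each row's deviation from that mean. A toy sketch of those two steps, with invented data and without the column-selection and feature-selection logic:

import pandas as pd

X_cat = pd.DataFrame({'shop': ['a', 'a', 'b', 'b']})
X_num = pd.DataFrame({'price': [1.0, 3.0, 10.0, 14.0]})

cat = 'shop'
X = pd.concat([X_cat[[cat]], X_num], axis=1)

# per-group mean, broadcast back to every row
X_mean = X.groupby(cat).mean().rename(columns=lambda c: c + '_mean_groupby_' + cat)
X_mean = X[[cat]].merge(X_mean.reset_index(), how='left', on=cat).iloc[:, 1:]

# deviation of each row from its group mean
X_subtract = pd.DataFrame(
    X_num.values - X_mean.values,
    columns=[c + '_subtract' for c in X_mean.columns])

print(pd.concat([X_mean, X_subtract], axis=1))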
Example 3
    def transform(self, F, y, is_train=False):

        time_spent = []

        # category count
        try:
            start_time = time.time()
            X_cat = F['CAT']
            F['cat_info'] = {}
            for col in X_cat.columns:
                cat_info = X_cat[col].value_counts(sort=False, dropna=False)
                F['cat_info'][col] = cat_info
            end_time = time.time()
            time_spent.append(('cat_count', end_time - start_time))
        except Exception:
            # skip category counting if it fails (e.g. no 'CAT' block)
            pass

        # high-order feature engineering
        feat_types = [
            'num_mean_groupby_cat', 'cat_nunique_groupby_cat',
            'time_delta_groupby_cat', 'cat_combine'
        ]
        if 'time' not in F:
            feat_types.remove('time_delta_groupby_cat')

        # set up the baseline for feature selection
        if is_train:
            # generate cat/mv freq features
            X_cat_freq = pd.DataFrame()
            for col in F['CAT'].columns:
                X_cat_freq[col + '_freq'] = F['CAT'][col].map(
                    F['cat_info'][col] / F['cat_info'][col].sum())
            # concat basic features
            X_baseline = pd.concat([F['numerical'], X_cat_freq], axis=1)
            if 'time' in F:
                X_baseline = pd.concat([X_baseline, F['time']], axis=1)
            # train and validate on the training set
            auc_baseline = select_feature_by_importance(
                pd.DataFrame(), X_baseline, y)
            print('validation auc with basic features: ', auc_baseline)

        for feat in feat_types:
            print('processing %s features' % (feat))
            transformer = self.transformer[feat]
            start_time = time.time()
            if is_train:
                F[feat] = transformer.fit_transform(F, y, X_baseline)
                X_baseline = pd.concat([X_baseline, F[feat]], axis=1)
            else:
                F[feat] = transformer.transform(F)
            end_time = time.time()
            time_spent.append((feat, end_time - start_time))

        for feat in time_spent:
            print('%s: %.2f seconds' % (feat[0], feat[1]))

        return F
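Before running the high-order transformers, this transform method builds a baseline feature matrix that includes frequency-encoded categorical columns derived from the cat_info value counts. A small sketch of that frequency-encoding step on toy data (the F dictionary layout mirrors the example, everything else is invented):

import pandas as pd

F = {'CAT': pd.DataFrame({'device': ['ios', 'android', 'ios', 'ios']})}

# count every category level (NaN included), as in the method above
F['cat_info'] = {col: F['CAT'][col].value_counts(sort=False, dropna=False)
                 for col in F['CAT'].columns}

# map each value to its relative frequency
X_cat_freq = pd.DataFrame()
for col in F['CAT'].columns:
    X_cat_freq[col + '_freq'] = F['CAT'][col].map(
        F['cat_info'][col] / F['cat_info'][col].sum())

print(X_cat_freq)  # ios -> 0.75, android -> 0.25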
Example 4
    def fit_transform(self, F, label, X_old):

        X_cat = F['CAT'].iloc[:, :cat_feat_num]
        X_time = F['time']

        # select categorical feature columns for feature generation
        for col in X_cat.columns:
            cat_num = X_cat[col].nunique()
            if ((cat_num > self.min_cat_num)
                    and (cat_num < (X_cat.shape[0] / self.max_cat_ratio))):
                self.cat_index.append(col)
        print(self.cat_index)

        # generate features
        X_new = []
        for time_col in X_time.columns:

            X = pd.concat([X_time[time_col], X_cat[self.cat_index]], axis=1)
            X_order_by_time = X.sort_values(time_col, ascending=True)

            # per-category gaps to the previous / next record in time order
            for cat in self.cat_index:
                X_group = X_order_by_time[[time_col, cat]].groupby(cat)
                X_order_by_time[time_col + '_diff_back_groupby_' +
                                cat] = X_group[time_col].diff(periods=1)
                X_order_by_time[time_col + '_diff_forward_groupby_' +
                                cat] = X_group[time_col].diff(periods=-1)

            X_new.append(X_order_by_time.iloc[:, X.shape[1]:].sort_index())
        X_new = pd.concat(X_new, axis=1)

        # select generated features by importance
        auc_score, selected_features = select_feature_by_importance(
            X_new, X_old, label, importance_type='gain', scale=1.)
        X_new = X_new[selected_features]

        print('selected features: ')
        for col in selected_features:
            print(col)
            time_col, remaining = col.split('_diff_')
            direction, cat = remaining.split('_groupby_')
            self.use_columns.setdefault(time_col, []).append((direction, cat))

        return X_new
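Example 4 measures, per categorical group, the time gap between consecutive records (both to the previous and to the next event). A minimal sketch of that groupby-diff trick on invented timestamps:

import pandas as pd

X = pd.DataFrame({'ts': [1.0, 2.0, 4.0, 7.0, 8.0],
                  'user': ['a', 'b', 'a', 'b', 'a']})

X_sorted = X.sort_values('ts')
grouped = X_sorted.groupby('user')['ts']

# gap to the previous / next event of the same user
X_sorted['ts_diff_back_groupby_user'] = grouped.diff(periods=1)
X_sorted['ts_diff_forward_groupby_user'] = grouped.diff(periods=-1)

print(X_sorted.sort_index())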
Example 5
    def fit_transform(self, F, label, X_old):

        try:
            X_cat = F['CAT'].iloc[:, :cat_feat_num]

            # select feature columns for feature generation
            for col in X_cat.columns:
                cat_num = X_cat[col].nunique()
                if ((cat_num > self.min_cat_num)
                        and (cat_num < (X_cat.shape[0] / self.max_cat_ratio))):
                    self.cat_index.append(col)
            print(self.cat_index)
            if (len(self.cat_index) == 0):
                return pd.DataFrame()

            # generate features
            X_new = []
            for col in self.cat_index:
                cols_grouped = [c for c in self.cat_index if c != col]
                cat_count = X_cat[cols_grouped].groupby(X_cat[col]).nunique()
                cat_count.columns = [
                    c + '_nunique_groupby_' + col for c in cat_count.columns
                ]
                cat_count = cat_count.reset_index()
                X_new.append(X_cat[[col]].merge(cat_count, on=col,
                                                how='left').iloc[:, 1:])
            X_new = pd.concat(X_new, axis=1)

            # select generated features by importance
            auc_score, selected_features = select_feature_by_importance(
                X_new, X_old, label, importance_type='gain', scale=1.)
            X_new = X_new[selected_features]

            print('selected features: ')
            for col in selected_features:
                print(col)
                cat_1, cat_2 = col.split('_nunique_groupby_')
                self.use_columns.setdefault(cat_2, []).append(cat_1)

            return X_new

        except Exception:
            # fall back to no generated features if anything above fails
            return pd.DataFrame()
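Example 5 counts, for each level of one categorical column, how many distinct values the other categorical columns take, then broadcasts the counts back to row level. A toy sketch for a single pair of columns, with made-up data:

import pandas as pd

X_cat = pd.DataFrame({'country': ['it', 'it', 'fr', 'fr', 'it'],
                      'city': ['rome', 'milan', 'paris', 'paris', 'rome']})

col = 'country'
cat_count = X_cat[['city']].groupby(X_cat[col]).nunique()
cat_count.columns = [c + '_nunique_groupby_' + col for c in cat_count.columns]

# broadcast the per-group counts back to row level
X_new = X_cat[[col]].merge(cat_count.reset_index(), on=col, how='left').iloc[:, 1:]
print(X_new)  # it -> 2 distinct cities, fr -> 1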
Example 6
    def fit_transform(self, F, label, X_old):

        try:
            X_cat = F['CAT'].iloc[:, :cat_feat_num]

            # select categorical feature columns for feature generation
            self.cat_index = []
            for col in X_cat.columns:
                cat_num = X_cat[col].nunique()
                if ((cat_num > self.min_cat_num)
                        and (cat_num < self.max_cat_num)):
                    self.cat_index.append(col)
            print(self.cat_index)

            # generate new features
            X_combine = pd.DataFrame()
            X_new = pd.DataFrame()
            for i, cat_1 in enumerate(self.cat_index[:-1]):
                for cat_2 in self.cat_index[i + 1:]:
                    col = cat_1 + '_' + cat_2 + '_combine'
                    X_combine[col] = list(zip(X_cat[cat_1], X_cat[cat_2]))
                    combine_freq = X_combine[col].value_counts(
                        normalize=True, sort=False).reset_index()
                    combine_freq.columns = [col, col + '_freq']
                    X_new[col + '_freq'] = X_combine[[col]].merge(
                        combine_freq, on=col, how='left')[col + '_freq']

            # select generated features by importance
            auc_score, selected_features = select_feature_by_importance(
                X_new, X_old, label, importance_type='gain', scale=1.)
            X_new = X_new[selected_features]

            print('selected features: ')
            for feat in selected_features:
                print(feat)
                # recover the original column pair from the generated name,
                # which has the form '<cat_1>_<cat_2>_combine_freq'
                base = feat[:-len('_combine_freq')]
                for cat_1 in self.cat_index:
                    if not base.startswith(cat_1 + '_'):
                        continue
                    cat_2 = base[len(cat_1) + 1:]
                    if cat_2 in self.cat_index:
                        self.selected_combination.append((cat_1, cat_2))
                        break

            return X_new

        except Exception:
            # fall back to no generated features if anything above fails
            return pd.DataFrame()
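Example 6 pairs up categorical columns and encodes each pair by the relative frequency of its combined value. The sketch below shows the same idea on toy data; for brevity it uses a groupby/transform to get the per-row frequency instead of the value_counts-plus-merge of the original.

import pandas as pd

X_cat = pd.DataFrame({'os': ['ios', 'ios', 'android', 'ios'],
                      'browser': ['safari', 'safari', 'chrome', 'chrome']})

cat_1, cat_2 = 'os', 'browser'
col = cat_1 + '_' + cat_2 + '_combine'

X_combine = pd.DataFrame()
X_combine[col] = list(zip(X_cat[cat_1], X_cat[cat_2]))

# relative frequency of each (os, browser) pair, broadcast to every row
X_new = pd.DataFrame()
X_new[col + '_freq'] = X_combine.groupby(col)[col].transform('size') / len(X_combine)
print(X_new)  # ('ios', 'safari') -> 0.5, the other two pairs -> 0.25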