    def _select_features(self,
                         problem,
                         percent_features_to_select,
                         algorithm,
                         features_to_keep=None):
        # Initialize FeatureSelector.
        fs = FeatureSelector(problem=problem,
                             algorithm=algorithm,
                             random_state=self._random_state)
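        # column_or_1d (from sklearn.utils) flattens the labels to 1-D.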
        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(percent_features_to_select *
                                     len(self._X_train.columns))

        # Parse features_to_keep.
        if features_to_keep is None:
            features_to_keep = []

        # Select features.
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        feature_ranks = fs.compute_ranks()
        for i, rank in enumerate(feature_ranks):
            if rank > num_features_to_select:
                # If in features_to_keep, pretend it wasn't eliminated.
                if self._X_train.columns[i] not in features_to_keep:
                    self._eliminated_features.append(self._X_train.columns[i])

        # Hack: rather than making FeatureSelector handle the concept of
        # kept features, copy each kept column here and merge it back into
        # the transformed matrices. Handle train and test individually so
        # that the merge can be skipped when the transformed matrix
        # already contains the feature.
        for feature in features_to_keep:
            kept_X_train_feature = self._X_train[[feature]].copy()
            log.debug('kept_X_train_feature.shape: %s' %
                      str(kept_X_train_feature.shape))
            self._X_train = fs.transform_matrix(self._X_train)
            if feature not in self._X_train:
                self._X_train = self._X_train.merge(kept_X_train_feature,
                                                    left_index=True,
                                                    right_index=True)

            kept_X_test_feature = self._X_test[[feature]].copy()
            log.debug('kept_X_test_feature.shape: %s' %
                      str(kept_X_test_feature.shape))
            self._X_test = fs.transform_matrix(self._X_test)
            if feature not in self._X_test:
                self._X_test = self._X_test.merge(kept_X_test_feature,
                                                  left_index=True,
                                                  right_index=True)

        if not features_to_keep:
            # Even if there are no features to keep, transform_matrix
            # still needs to run in order to drop the eliminated features.
            self._X_train = fs.transform_matrix(self._X_train)
            self._X_test = fs.transform_matrix(self._X_test)
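
A hypothetical call to the method above; the percentage and the kept
feature name are illustrative assumptions, not values from the source:

        # Hypothetical usage sketch (argument values are assumptions):
        # keep the top 5% of features, but never drop the 'age' column.
        self._select_features(problem=FeatureSelector.CLASSIFICATION,
                              percent_features_to_select=0.05,
                              algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                              features_to_keep=['age'])
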
Example #2
    def _select_features(self):
        # Use FeatureSelector to prune all but the top 1% of variables.
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                             problem=FeatureSelector.CLASSIFICATION)

        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(0.01 * len(self._X_train.columns))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        self._feature_ranks = fs.compute_ranks()
        for i, rank in enumerate(self._feature_ranks):
            if rank > num_features_to_select:
                self._eliminated_features.append(self._X_train.columns[i])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)
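
For context, here is a minimal self-contained sketch of the same
prune-then-transform pattern written directly against scikit-learn's
RFE. It illustrates the rank-based elimination idea, not the project's
actual FeatureSelector implementation; note that in scikit-learn every
selected feature receives rank 1, so eliminated columns are those with
ranking_ > 1 rather than > k:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Toy data standing in for self._X_train / self._y_train.
X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X_train = pd.DataFrame(X, columns=['f%d' % i for i in range(20)])

# Keep the top 25% of features via recursive feature elimination.
num_features_to_select = int(0.25 * len(X_train.columns))
rfe = RFE(LogisticRegression(max_iter=1000),
          n_features_to_select=num_features_to_select)
rfe.fit(X_train, y)

# Selected features all carry rank 1; anything ranked above 1 was
# eliminated during the recursive passes.
eliminated_features = [col for col, rank in zip(X_train.columns,
                                                rfe.ranking_)
                       if rank > 1]

# Analogue of transform_matrix: drop the eliminated columns.
X_train = X_train.loc[:, rfe.support_]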