def _select_features(self,
                         problem,
                         percent_features_to_select,
                         algorithm,
                         features_to_keep=None):
        # Initialize FeatureSelector.
        fs = FeatureSelector(problem=problem,
                             algorithm=algorithm,
                             random_state=self._random_state)
        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(percent_features_to_select *
                                     len(self._X_train.columns.values))

        # Parse features_to_keep.
        if features_to_keep is None:
            features_to_keep = []

        # Select features.
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        feature_ranks = fs.compute_ranks()
        for i in range(len(feature_ranks)):
            if feature_ranks[i] > num_features_to_select:
                # If in features_to_keep, pretend it wasn't eliminated.
                if self._X_train.columns[i] not in features_to_keep:
                    self._eliminated_features.append(self._X_train.columns[i])

        # Hack: rather than making FeatureSelector handle the concept of
        # kept features, copy the kept columns out before transforming and
        # merge them back into the transformed matrices afterwards.
        kept_X_train = self._X_train[features_to_keep].copy()
        kept_X_test = self._X_test[features_to_keep].copy()
        log.debug('kept_X_train.shape: %s' % str(kept_X_train.shape))
        log.debug('kept_X_test.shape: %s' % str(kept_X_test.shape))

        # transform_matrix drops the low-rank features, so run it
        # unconditionally, even when there are no features to keep.
        # (Transforming once, before the merge loop, also avoids re-running
        # transform_matrix on an already-transformed matrix and losing
        # kept columns that were eliminated by the selector.)
        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)

        # Merge back any kept feature that the transform dropped.
        for feature in features_to_keep:
            if feature not in self._X_train:
                self._X_train = self._X_train.merge(kept_X_train[[feature]],
                                                    left_index=True,
                                                    right_index=True)
            if feature not in self._X_test:
                self._X_test = self._X_test.merge(kept_X_test[[feature]],
                                                  left_index=True,
                                                  right_index=True)
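
This and the following snippets all program against the same FeatureSelector interface: set_input_matrix(X, y), select(k=..., percentile=...), compute_ranks() (where rank <= k means selected), transform_matrix(X), plus the CLASSIFICATION and RECURSIVE_ELIMINATION constants. The original class is not shown on this page; below is a minimal sketch of a compatible stand-in built on scikit-learn's RFE so the examples can be run. The estimator choice and the tie-breaking in compute_ranks are assumptions made to satisfy that rank convention, not the original implementation.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


class FeatureSelector:
    """Minimal stand-in exposing the interface the snippets rely on."""
    CLASSIFICATION = 'classification'
    RECURSIVE_ELIMINATION = 'recursive_elimination'

    def __init__(self, problem=CLASSIFICATION,
                 algorithm=RECURSIVE_ELIMINATION, random_state=None):
        self._problem = problem
        self._algorithm = algorithm
        self._random_state = random_state
        self._rfe = None
        self._columns = None

    def set_input_matrix(self, X, y):
        self._X, self._y = X, y
        self._columns = list(X.columns) if hasattr(X, 'columns') else None

    def select(self, k=None, percentile=None):
        n_features = self._X.shape[1]
        if k is None:
            # Interpret percentile as the percent of features to keep.
            k = max(1, int(n_features * percentile / 100.0))
        estimator = RandomForestClassifier(random_state=self._random_state)
        # step=1 eliminates one feature per round, so the elimination
        # order induces a strict ranking over the dropped features.
        self._rfe = RFE(estimator, n_features_to_select=k, step=1)
        self._rfe.fit(self._X, self._y)

    def compute_ranks(self):
        # RFE assigns rank 1 to every selected feature; break those ties
        # by final-model importance so ranks form a strict 1..n ordering
        # and `rank <= k` means "selected", as the snippets assume.
        ranking = self._rfe.ranking_
        selected = ranking == 1
        k = int(selected.sum())
        importances = self._rfe.estimator_.feature_importances_
        ranks = np.empty(ranking.shape[0], dtype=int)
        order = np.argsort(-importances)  # most important first
        ranks[np.flatnonzero(selected)[order]] = np.arange(1, k + 1)
        # Eliminated features keep their RFE elimination order, shifted
        # so they fall strictly after the selected features.
        ranks[~selected] = ranking[~selected] + (k - 1)
        return ranks

    def transform_matrix(self, X):
        # Keep only the columns selected on the training matrix.
        if self._columns is None:
            return X[:, self._rfe.support_]
        kept = [c for c, keep in zip(self._columns, self._rfe.support_)
                if keep]
        return X[kept].copy()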
Example No. 2
    def _get_test_feature_ranks(self, algorithm, problem, X, y, k=None,
                                percentile=None):
        # Set input features and values.
        fs = FeatureSelector(algorithm=algorithm, problem=problem,
                             random_state=12345)
        fs.set_input_matrix(X, y)

        # Select k best features.
        fs.select(k=k, percentile=percentile)
        feature_ranks = fs.compute_ranks()

        return feature_ranks
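
A quick, self-contained way to exercise the same flow outside the test class (the toy data from make_classification is purely illustrative):

import pandas as pd
from sklearn.datasets import make_classification

X_arr, y = make_classification(n_samples=200, n_features=10,
                               n_informative=4, random_state=12345)
X = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(10)])

fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                     problem=FeatureSelector.CLASSIFICATION,
                     random_state=12345)
fs.set_input_matrix(X, y)
fs.select(k=3)
feature_ranks = fs.compute_ranks()

# Under the rank convention above, exactly k features have rank <= k.
assert sum(rank <= 3 for rank in feature_ranks) == 3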
Example No. 3
from sklearn.base import TransformerMixin
from sklearn.utils.validation import column_or_1d


# FeatureSelector is the project-local class sketched above.
class Select_Features(TransformerMixin):
    def __init__(self, random_state=0, features_by_type=None):
        '''
        TODO: if features_by_type is None, assume all features are numeric.

        Args:
            random_state:
            features_by_type:
        '''
        self.fs = FeatureSelector(
            problem=FeatureSelector.CLASSIFICATION,
            algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
            random_state=random_state)
        self.features_by_type = features_by_type
        self.selected_features = []

    def fit(self, X, y=None, features_to_keep=None, select_percent=0.05):
        '''
        TODO: Should "select_percent" count the features that are pre-set
        to keep? features_to_keep covers both features explicitly kept and
        non-numeric features.

        Args:
            X:
            y:
            features_to_keep:
            select_percent:

        Returns:

        '''
        # Copy so the appends below don't mutate the caller's list.
        features_to_keep = list(features_to_keep) if features_to_keep else []

        X_numeric = X[X.columns[X.columns.isin(
            self.features_by_type['numeric_features'])]]

        self.fs.set_input_matrix(X_numeric.values, column_or_1d(y.values))

        num_features_to_select = int(
            round(select_percent * len(X_numeric.columns.values)))
        self.fs.select(k=num_features_to_select)

        feature_ranks = self.fs.compute_ranks()

        for i in range(len(feature_ranks)):
            if feature_ranks[i] <= num_features_to_select:
                # Rank within the cutoff means the feature was selected;
                # add it to the kept features.
                features_to_keep.append(X_numeric.columns[i])

        self.selected_features = features_to_keep[:]
        return self

    def transform(self, X):
        return X[self.selected_features]
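
Hypothetical usage of the transformer above; the column names, the 50% selection rate, and the synthetic data are made up for illustration:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame({
    'age': rng.normal(50, 10, 200),
    'weight': rng.normal(70, 15, 200),
    'heart_rate': rng.normal(80, 12, 200),
    'lab_value': rng.normal(1.0, 0.3, 200),
    'sex': rng.choice(['M', 'F'], 200),
})
y = pd.Series(rng.randint(0, 2, 200))

selector = Select_Features(features_by_type={
    'numeric_features': ['age', 'weight', 'heart_rate', 'lab_value']})
# 'sex' is non-numeric, so it never enters RFE; keep it explicitly.
selector.fit(X, y, features_to_keep=['sex'], select_percent=0.5)
X_selected = selector.transform(X)  # 'sex' plus the top 2 numeric features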
Example No. 4
    def _select_features(self):
        # Use FeatureSelector to prune all but the top 1% of features.
        fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                             problem=FeatureSelector.CLASSIFICATION)

        fs.set_input_matrix(self._X_train, column_or_1d(self._y_train))
        num_features_to_select = int(0.01 * len(self._X_train.columns.values))
        fs.select(k=num_features_to_select)

        # Enumerate eliminated features pre-transformation.
        self._feature_ranks = fs.compute_ranks()
        for i in range(len(self._feature_ranks)):
            if self._feature_ranks[i] > num_features_to_select:
                self._eliminated_features.append(self._X_train.columns[i])

        self._X_train = fs.transform_matrix(self._X_train)
        self._X_test = fs.transform_matrix(self._X_test)
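
A toy check of this method's flow, confirming that the fitted selector leaves train and test matrices with identical columns (the shapes and data are illustrative):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d

X_arr, y = make_classification(n_samples=300, n_features=100,
                               n_informative=10, random_state=0)
X = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(100)])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

fs = FeatureSelector(algorithm=FeatureSelector.RECURSIVE_ELIMINATION,
                     problem=FeatureSelector.CLASSIFICATION)
fs.set_input_matrix(X_train, column_or_1d(y_train))
fs.select(k=int(0.01 * len(X_train.columns)))  # top 1% = 1 of 100

X_train_t = fs.transform_matrix(X_train)
X_test_t = fs.transform_matrix(X_test)
assert list(X_train_t.columns) == list(X_test_t.columns)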
Example No. 5
def select_features(matrix, features, random_state=0):
    select_params = features['select_params']
    fs = FeatureSelector(problem=select_params['selection_problem'],
                         algorithm=select_params['selection_algorithm'],
                         random_state=random_state)

    X, y = split_Xy(matrix, features['ylabel'])
    fs.set_input_matrix(X, y)
    # Count only the feature columns; matrix still includes the y label.
    num_features_to_select = int(select_params['percent_features_to_select'] *
                                 len(X.columns.values))

    fs.select(k=num_features_to_select)

    feature_ranks = fs.compute_ranks()

    features_to_keep = []
    for i in range(len(feature_ranks)):
        if feature_ranks[i] <= num_features_to_select:
            # Rank within the cutoff means the feature was selected.
            features_to_keep.append(X.columns[i])

    return matrix[features_to_keep].copy()
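
split_Xy is a project helper that is not shown on this page; below is a plausible one-line stand-in plus hypothetical usage, with dict keys mirroring the reads above and made-up data:

import numpy as np
import pandas as pd


def split_Xy(matrix, ylabel):
    # Hypothetical stand-in: peel the label column off the feature matrix.
    return matrix.drop(columns=[ylabel]), matrix[ylabel]


rng = np.random.RandomState(0)
matrix = pd.DataFrame(rng.normal(size=(200, 20)),
                      columns=['f%d' % i for i in range(20)])
matrix['outcome'] = rng.randint(0, 2, 200)

features = {
    'ylabel': 'outcome',
    'select_params': {
        'selection_problem': FeatureSelector.CLASSIFICATION,
        'selection_algorithm': FeatureSelector.RECURSIVE_ELIMINATION,
        'percent_features_to_select': 0.25,
    },
}

selected = select_features(matrix, features)  # keeps the top 5 of 20 features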