Esempio n. 1
0
class SelectPercentile(FeatureSelectionAlgorithm):
    r"""Feature selection that keeps the best-scoring percentile of features.

    The work is delegated to a wrapped ``SelectPerc`` instance; this class
    only declares the tunable parameters and forwards calls to it.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select Percentile'

    def __init__(self, **kwargs):
        r"""Declare the tunable parameters and create the wrapped selector.

        Arguments:
            **kwargs: Accepted by the signature but not used in this method.
        """
        # Candidate score functions to choose from.
        score_definition = ParameterDefinition(
            [chi2, f_classif, mutual_info_classif])
        # Percentile is drawn from MinMax(10, 100) and typed as np.uint.
        percentile_definition = ParameterDefinition(MinMax(10, 100), np.uint)
        self._params = {
            'score_func': score_definition,
            'percentile': percentile_definition,
        }
        self.__select_percentile = SelectPerc()

    def set_parameters(self, **kwargs):
        r"""Forward the given keyword arguments to the wrapped selector.

        Arguments:
            **kwargs: Parameters passed on unchanged to ``set_params``.
        """
        selector = self.__select_percentile
        selector.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Fit the wrapped selector and report which features it keeps.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        selector = self.__select_percentile
        selector.fit(x, y)
        support_mask = selector.get_support()
        return support_mask

    def to_string(self):
        r"""Build a user friendly description of this algorithm.

        Returns:
            str: User friendly representation of the object.
        """
        # The base class supplies a template with ``name`` and ``args`` fields.
        template = FeatureSelectionAlgorithm.to_string(self)
        rendered_args = self._parameters_to_string(
            self.__select_percentile.get_params())
        return template.format(name=self.Name, args=rendered_args)
Esempio n. 2
0
class LinearRegressor:
    """Linear regression with mean imputation and percentile feature selection.

    Typical use is ``valid_fit`` (to pick the best feature subset on a
    validation split), then ``fit`` (to train on that subset), then
    ``predict``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Ordinary least squares without an intercept term.
        self.model = linear_model.LinearRegression(fit_intercept=False)
        # Univariate selector; the percentile is re-tuned inside valid_fit.
        self.feature_selector = SelectPercentile(f_regression, percentile=100)
        # NOTE(review): `Imputer` looks like the legacy scikit-learn imputer
        # (replaced by SimpleImputer in newer releases) — confirm the pinned
        # scikit-learn version before upgrading.
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
        # Columns chosen by the last valid_fit; empty means "use every column".
        self.best_columns = []
        # Columns the model was actually trained on by fit; reused by predict.
        self.feature_name = []

    def _impute_training_frame(self, frame):
        """Fit the imputer on ``frame`` and return the imputed values.

        If the imputer returns fewer columns than ``frame`` has, +/-inf
        entries are replaced by NaN **in place** in ``frame`` and the imputer
        is fitted a second time.
        """
        values = self.imputer.fit_transform(frame)
        if values.shape[1] != len(frame.columns):
            frame[(frame == np.inf) | (frame == -np.inf)] = np.nan
            values = self.imputer.fit_transform(frame)
        return values

    def _fit_model(self, features, target, sample_weight=None):
        """Fit ``self.model``, passing ``sample_weight`` only when one is given."""
        if sample_weight is None:
            self.model.fit(features, target)
        else:
            self.model.fit(features, target, sample_weight=sample_weight)

    def valid_fit(self, X_train_init, y_train, X_eval_init, y_eval, use_sample_weight=False, use_sample_window_select=False):
        """Search for the best feature percentile using a validation split.

        Only the last 1,000,000 training rows are used. With fewer than 20
        columns the model is fitted on all of them and no selection happens.
        Otherwise percentiles 100, 90, ..., 20 are tried, the RMSE on the
        evaluation set is printed for each, and the columns of the
        best-scoring run are stored in ``self.best_columns``.

        After the search the model is left fitted on the *last* percentile
        tried, not the best one; call ``fit`` to train on ``best_columns``.

        Arguments:
            X_train_init (pandas.DataFrame): Training features. The
                ``'sample_weight'`` column is popped from it when
                ``use_sample_weight`` is set, and inf values may be replaced
                by NaN in place.
            y_train: Training targets aligned with ``X_train_init``.
            X_eval_init (pandas.DataFrame): Evaluation features. The
                ``'sample_weight'`` column is popped from it when
                ``use_sample_weight`` is set.
            y_eval: Evaluation targets, used only to score the percentile
                search.
            use_sample_weight (bool): Whether to train with the
                ``'sample_weight'`` column as per-row weights.
            use_sample_window_select (bool): Accepted but not used.

        Returns:
            Predictions for the evaluation set from the best configuration.
        """
        max_sample_num = 1000000
        if len(X_train_init) > max_sample_num:
            X_train_init = X_train_init[-max_sample_num:]
            y_train = y_train[-max_sample_num:]
            gc.collect()

        weight_column = 'sample_weight'
        train_weight = None
        if use_sample_weight:
            train_weight = X_train_init.pop(weight_column)
            # Evaluation weights are not used, but the column must still be
            # removed so both frames keep the same feature columns.
            X_eval_init.pop(weight_column)

        init_columns = X_train_init.columns
        X_train = self._impute_training_frame(X_train_init)
        X_eval = self.imputer.transform(X_eval_init)

        if X_train.shape[1] < 20:
            # Every column was used, so discard any selection left over from
            # an earlier call; otherwise fit() would apply a stale subset.
            self.best_columns = []
            self._fit_model(X_train, y_train, train_weight)
            return self.model.predict(X_eval)

        score_min = float("inf")
        best_preds = None
        best_column_num = 0

        for percentile in range(100, 10, -10):
            self.feature_selector.set_params(percentile=percentile)
            gc.collect()
            train_selected = self.feature_selector.fit_transform(X_train, y_train)
            eval_selected = self.feature_selector.transform(X_eval)

            self._fit_model(train_selected, y_train, train_weight)

            preds = self.model.predict(eval_selected)
            score = math.sqrt(mean_squared_error(y_eval, preds))
            print(f"valid score:{score}\n")

            if score < score_min:
                score_min = score
                best_preds = preds
                best_column_num = train_selected.shape[1]

            gc.collect()

        # Rank the columns by selector score and keep as many as the best run
        # kept.
        column_scores = pd.Series(self.feature_selector.scores_, index=init_columns)
        score_sorted_cols = list(column_scores.sort_values(ascending=False).index)
        self.best_columns = score_sorted_cols[:best_column_num]
        return best_preds

    def fit(self, X_train, y_train, use_sample_weight):
        """Train the model, restricted to ``best_columns`` when any are set.

        Arguments:
            X_train (pandas.DataFrame): Training features. The
                ``'sample_weight'`` column is popped from it when
                ``use_sample_weight`` is set.
            y_train: Training targets aligned with ``X_train``.
            use_sample_weight (bool): Whether to train with the
                ``'sample_weight'`` column as per-row weights.

        Returns:
            LinearRegressor: ``self``, to allow call chaining.
        """
        weight_column = 'sample_weight'
        train_weight = None

        if use_sample_weight:
            train_weight = X_train.pop(weight_column)

        # len() rather than truthiness so an array-like assigned from outside
        # keeps working.
        if len(self.best_columns):
            # Keep the order of best_columns: a set intersection would give a
            # hash-dependent column order that changes between runs.
            available = set(X_train.columns)
            self.feature_name = [col for col in self.best_columns if col in available]
            X_train = X_train[self.feature_name]
        else:
            self.feature_name = X_train.columns

        train = self._impute_training_frame(X_train)
        self._fit_model(train, y_train, train_weight)

        gc.collect()
        return self

    def predict(self, x_test):
        """Predict targets for ``x_test`` using the columns chosen by ``fit``.

        Arguments:
            x_test (pandas.DataFrame): Frame containing at least the columns
                in ``self.feature_name``.

        Returns:
            Model predictions for the imputed feature matrix.
        """
        selected = x_test[self.feature_name]
        imputed = self.imputer.transform(selected)
        return self.model.predict(imputed)