class SelectPercentile(FeatureSelectionAlgorithm):
    r"""Feature selection that keeps the best-scoring percentile of features.

    The ranking is produced by the configured score function; the actual
    work is delegated to a wrapped ``SelectPerc`` selector instance.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Select Percentile'

    def __init__(self, **kwargs):
        r"""Create the parameter definitions and the wrapped selector.

        Arguments:
            **kwargs: Accepted for interface compatibility; not used here.
        """
        # Candidate score functions offered for the ``score_func`` parameter.
        score_functions = [chi2, f_classif, mutual_info_classif]
        self._params = {
            'score_func': ParameterDefinition(score_functions),
            # Percentile is drawn from MinMax(10, 100) as an unsigned integer.
            'percentile': ParameterDefinition(MinMax(10, 100), np.uint),
        }
        self.__select_percentile = SelectPerc()

    def set_parameters(self, **kwargs):
        r"""Forward the given parameters/arguments to the wrapped selector.

        Arguments:
            **kwargs: Parameter names and values passed to ``set_params``.
        """
        selector = self.__select_percentile
        selector.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Fit the selector and report which features it keeps.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series) Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        selector = self.__select_percentile
        selector.fit(x, y)
        support_mask = selector.get_support()
        return support_mask

    def to_string(self):
        r"""Build a user friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        # The base class supplies the template; this class fills in its own
        # name and the wrapped selector's current parameters.
        template = FeatureSelectionAlgorithm.to_string(self)
        current_params = self.__select_percentile.get_params()
        rendered_args = self._parameters_to_string(current_params)
        return template.format(name=self.Name, args=rendered_args)
class LinearRegressor:
    """Linear regression with mean imputation and percentile feature selection.

    Typical use: call :meth:`valid_fit` to evaluate on a hold-out set and
    record the best-scoring columns in ``best_columns``, then :meth:`fit` to
    train on those columns, then :meth:`predict`.
    """

    def __init__(self):
        self.model = linear_model.LinearRegression(fit_intercept=False)
        # NOTE(review): the selector is built with a positional score function
        # and later used through set_params / fit_transform / transform /
        # scores_, i.e. the scikit-learn estimator interface. Confirm that
        # `SelectPercentile` resolves to such an estimator in this module.
        self.feature_selector = SelectPercentile(f_regression, percentile=100)
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
        # Column names ranked best-first by valid_fit(); empty means "use all".
        self.best_columns = []
        # Columns the model was last fitted on; predict() selects these.
        self.feature_name = []

    def _fit_training_imputer(self, frame):
        """Fit the imputer on ``frame`` and return the imputed array.

        If the imputer returns fewer columns than it was given, +/-inf values
        are replaced by NaN on a copy of ``frame`` and the imputer is fitted
        again. The caller's frame is never modified.
        """
        imputed = self.imputer.fit_transform(frame)
        if imputed.shape[1] != len(frame.columns):
            cleaned = frame.replace([np.inf, -np.inf], np.nan)
            imputed = self.imputer.fit_transform(cleaned)
        return imputed

    def _fit_model(self, features, target, sample_weight):
        """Fit the model, passing ``sample_weight`` only when one is given."""
        if sample_weight is None:
            self.model.fit(features, target)
        else:
            self.model.fit(features, target, sample_weight=sample_weight)

    def valid_fit(self, X_train_init, y_train, X_eval_init, y_eval,
                  use_sample_weight=False, use_sample_window_select=False):
        """Fit on the training data and evaluate on the hold-out data.

        With fewer than 20 imputed features the model is fitted on all of
        them. Otherwise percentiles 100, 90, ..., 20 are tried, the RMSE on
        the evaluation set is printed for each, and the columns of the best
        percentile are stored in ``best_columns`` (ranked by selector score).

        Arguments:
            X_train_init (pandas.DataFrame): Training features. Only the last
                1,000,000 rows are used.
            y_train: Training targets, sliceable like ``X_train_init``.
            X_eval_init (pandas.DataFrame): Evaluation features.
            y_eval: Evaluation targets used for the RMSE.
            use_sample_weight (bool): When true, the ``'sample_weight'``
                column is popped from both frames (in place) and the training
                weights are passed to the model.
            use_sample_window_select (bool): Unused; kept for compatibility.

        Returns:
            Predictions on the evaluation set from the best configuration, or
            ``None`` if no percentile produced a comparable score.
        """
        max_sample_num = 1000000
        if len(X_train_init) > max_sample_num:
            X_train_init = X_train_init[-max_sample_num:]
            y_train = y_train[-max_sample_num:]
        gc.collect()

        weight_column = 'sample_weight'
        train_weight = None
        if use_sample_weight:
            train_weight = X_train_init.pop(weight_column)
            # The evaluation weights are not used, but the column must not
            # reach the model as a feature.
            X_eval_init.pop(weight_column)

        init_columns = X_train_init.columns
        X_train = self._fit_training_imputer(X_train_init)
        X_eval = self.imputer.transform(X_eval_init)

        if X_train.shape[1] < 20:
            # Too few features to be worth selecting; the selector is not
            # fitted here, so best_columns is left untouched.
            self._fit_model(X_train, y_train, train_weight)
            return self.model.predict(X_eval)

        score_min = float("inf")
        best_preds = None
        best_column_num = 0
        for percentile in range(100, 10, -10):
            self.feature_selector.set_params(percentile=percentile)
            gc.collect()
            train = self.feature_selector.fit_transform(X_train, y_train)
            eval_selected = self.feature_selector.transform(X_eval)
            self._fit_model(train, y_train, train_weight)
            preds = self.model.predict(eval_selected)
            score = math.sqrt(mean_squared_error(y_eval, preds))
            print(f"valid score:{score}\n")
            if score < score_min:
                score_min = score
                best_preds = preds
                best_column_num = train.shape[1]
            gc.collect()

        # Rank the original columns by selector score and keep as many as the
        # best percentile retained.
        # NOTE(review): if the imputer dropped columns, scores_ is shorter
        # than init_columns and this raises; confirm inputs never trigger it.
        ranked = pd.Series(self.feature_selector.scores_, index=init_columns)
        score_sorted_cols = list(ranked.sort_values(ascending=False).index)
        self.best_columns = score_sorted_cols[:best_column_num]
        return best_preds

    def fit(self, X_train, y_train, use_sample_weight):
        """Fit the imputer and the model on the selected columns.

        Arguments:
            X_train (pandas.DataFrame): Training features. When
                ``use_sample_weight`` is true its ``'sample_weight'`` column
                is popped in place.
            y_train: Training targets.
            use_sample_weight (bool): Whether to train with sample weights.

        Returns:
            LinearRegressor: ``self``.
        """
        weight_column = 'sample_weight'
        train_weight = X_train.pop(weight_column) if use_sample_weight else None

        if self.best_columns:
            # Keep the ranked order from valid_fit(); a set intersection would
            # give an arbitrary order that can differ between runs.
            available = set(X_train.columns)
            self.feature_name = list(dict.fromkeys(
                column for column in self.best_columns if column in available))
            X_train = X_train[self.feature_name]
        else:
            self.feature_name = X_train.columns

        train = self._fit_training_imputer(X_train)
        self._fit_model(train, y_train, train_weight)
        gc.collect()
        return self

    def predict(self, x_test):
        """Predict for ``x_test`` using the columns chosen in :meth:`fit`.

        Arguments:
            x_test (pandas.DataFrame): Must contain every column in
                ``feature_name``.

        Returns:
            The model's predictions for the imputed rows.
        """
        selected = x_test[self.feature_name]
        imputed = self.imputer.transform(selected)
        return self.model.predict(imputed)