# Imports this method relies on (likely already present at the top of the module):
from scipy.sparse import csc_matrix, issparse
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
import pandas as pd


def fit(self, X, y=None):
    # Scale each feature to [-1, 1]; the chi2 and RFE selectors expect
    # non-negative / comparably scaled inputs.
    X_norm = MaxAbsScaler().fit_transform(X)
    if issparse(X):
        # Column-oriented selectors are faster on CSC matrices.
        if not isinstance(X, csc_matrix):
            X = X.tocsc()
        X_norm = X_norm.tocsc()

    print('Running Cor')
    cor_support = cor_selector(X, y, self.feature_names, self.num_features)
    print('Running Chi2')
    chi_support = chi2_selector(X_norm, y, self.num_features)
    print('Running RFE')
    rfe_support = rfe_selector(X_norm, y, self.num_features, self.random_state)
    print('Running LR')
    embeded_lr_support = embeded_lr_selector(
        X_norm, y, self.num_features, self.random_state)
    print('Running RF')
    embeded_rf_support = embeded_rf_selector(
        X, y, self.num_features,
        n_jobs=self.n_jobs, random_state=self.random_state)
    print('Running XG')
    embeded_xgb_support = embeded_xgb_selector(
        X, y, self.num_features,
        n_jobs=self.n_jobs, random_state=self.random_state)

    # One row per feature, one boolean column per selection method.
    feature_selection_df = pd.DataFrame({
        'feature': self.feature_names,
        'pearson': cor_support,
        'chi_2': chi_support,
        'rfe': rfe_support,
        'logistics': embeded_lr_support,
        'random_forest': embeded_rf_support,
        'xgboost': embeded_xgb_support,
    })
    # Count how many methods voted for each feature. Sum only the boolean
    # method columns; including the string 'feature' column in np.sum would
    # fail on recent pandas versions.
    feature_selection_df['total'] = feature_selection_df.drop(
        columns=['feature']).sum(axis=1)
    self.feature_selection_df_ = feature_selection_df

    keep_features = feature_selection_df.query(
        'total >= {}'.format(self.min_selections))['feature'].tolist()

    # Keep the features that we always want (e.g. domain expertise).
    if self.always_keep is not None:
        keep_features.extend(self.always_keep)

    # Filter against the original column order, which also drops any
    # duplicates introduced by always_keep.
    self.keep_features_ = [
        f for f in self.feature_names if f in keep_features
    ]
    return self
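
# A usage sketch, assuming the method above lives on a transformer-style
# class -- here called `FeatureSelector`, a hypothetical name -- whose
# constructor sets the attributes the method reads (`feature_names`,
# `num_features`, `min_selections`, `always_keep`, `n_jobs`, `random_state`);
# the real class name and constructor signature may differ:
#
#     selector = FeatureSelector(
#         feature_names=list(df.columns),
#         num_features=30,       # features each individual method keeps
#         min_selections=4,      # methods that must agree to keep a feature
#         always_keep=['age'],   # domain-expertise features to force in
#         n_jobs=-1,
#         random_state=42,
#     )
#     selector.fit(df.values, y)
#     selector.keep_features_            # consensus feature list
#     selector.feature_selection_df_     # per-method votes and totals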