def test_rebate(self):
    """ReBATE reduction must keep exactly the requested number of columns."""
    n_feats = 10
    reduced = rebate(self.test_df, "gap expt", n_feats)
    self.assertEqual(n_feats, reduced.shape[1])
def fit(self, df, target):
    """Fit every configured reducer on df, recording what was removed.

    Runs each reducer named in ``self.reducers`` in order ("corr", "tree",
    "rebate", "pca"), feeding the output of one into the next. For each
    reducer the removed columns are recorded in ``self.removed_features``;
    the final surviving columns (minus forced removals) are stored in
    ``self.retained_features``.

    Args:
        df (pandas.DataFrame): The dataframe to fit on, containing the
            target column.
        target (str): The name of the target column in ``df``.

    Returns:
        self
    """
    # Warn about keep/remove requests that reference absent columns.
    missing_remove_features = [
        c for c in self._remove_features if c not in df.columns
    ]
    missing_keep_features = [
        c for c in self._keep_features if c not in df.columns
    ]
    for features, name in [(missing_remove_features, 'remove'),
                           (missing_keep_features, 'keep')]:
        if features:
            self.logger.warning(
                self._log_prefix +
                "Asked to {} some features that do not exist in the "
                "dataframe. Skipping the following features:\n{}".format(
                    name, features))

    reduced_df = df
    for r in self.reducers:
        # X/y are recomputed each pass because df is rebound to the
        # previous reducer's output at the bottom of the loop.
        X = df.drop(columns=[target])
        y = df[target]
        if r == "corr":
            reduced_df = self.rm_correlated(df, target, self.corr_threshold)
            # Target is re-appended after the common bookkeeping below.
            reduced_df = reduced_df.drop(columns=[target])
        elif r == "tree":
            tbfr = TreeFeatureReducer(
                importance_percentile=self.tree_importance_percentile,
                mode=regression_or_classification(y),
                logger=self.logger)
            reduced_df = tbfr.fit_transform(X, y).copy(deep=True)
            self.reducer_params[r] = {
                "importance_percentile": tbfr.importance_percentile,
                "mode": tbfr.mode,
                "random_state": tbfr.rs
            }
        elif r == "rebate":
            if isinstance(self.n_rebate_features, float):
                self.logger.info(
                    self._log_prefix +
                    "Retaining fraction {} of current {} features.".format(
                        self.n_rebate_features, df.shape[1] - 1))
                # Fraction of *feature* columns only: df.shape[1] counts
                # the target, so exclude it to match the logged intent.
                self.n_rebate_features = int(
                    (df.shape[1] - 1) * self.n_rebate_features)
            self.logger.info(
                self._log_prefix +
                "ReBATE MultiSURF running: retaining {} numerical "
                "features.".format(self.n_rebate_features))
            reduced_df = rebate(df, target,
                                n_features=self.n_rebate_features)
            reduced_df = reduced_df.copy(deep=True)
            self.logger.info(
                self._log_prefix +
                "ReBATE MultiSURF completed: retained {} numerical "
                "features.".format(len(reduced_df.columns)))
            self.logger.debug(
                self._log_prefix + "ReBATE MultiSURF gave the following "
                "features: {}".format(reduced_df.columns.tolist()))
            self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}
        elif r == "pca":
            n_samples, n_features = X.shape
            if self.n_pca_features == "auto":
                if n_samples < n_features:
                    # MLE requires n_samples >= n_features; fall back to
                    # the maximum valid component count.
                    self.logger.warning(
                        self._log_prefix +
                        "Number of samples ({}) is less than number of "
                        "features ({}). Setting n_pca_features equal to "
                        "n_samples.".format(n_samples, n_features))
                    self._pca = PCA(n_components=n_samples,
                                    svd_solver="full")
                else:
                    self.logger.info(
                        self._log_prefix +
                        "PCA automatically determining optimal number of "
                        "features using Minka's MLE.")
                    self._pca = PCA(n_components="mle", svd_solver="auto")
            else:
                if isinstance(self.n_pca_features, float):
                    self.logger.info(
                        self._log_prefix +
                        "Retaining fraction {} of current {} features."
                        "".format(self.n_pca_features, df.shape[1]))
                    self.n_pca_features = int(df.shape[1] *
                                              self.n_pca_features)
                if self.n_pca_features > n_samples:
                    # PCA cannot produce more components than samples.
                    self.logger.warning(
                        self._log_prefix +
                        "Number of PCA features interpreted as {}, which is"
                        " more than the number of samples ({}). "
                        "n_pca_features coerced to equal n_samples."
                        "".format(self.n_pca_features, n_samples))
                    self.n_pca_features = n_samples
                self.logger.info(
                    self._log_prefix +
                    "PCA running: retaining {} numerical features."
                    "".format(self.n_pca_features))
                self._pca = PCA(n_components=self.n_pca_features,
                                svd_solver="auto")
            self._pca.fit(X.values, y.values)
            matrix = self._pca.transform(X.values)
            pca_feats = ["PCA {}".format(i) for i in range(matrix.shape[1])]
            self._pca_feats = pca_feats
            reduced_df = pd.DataFrame(columns=pca_feats, data=matrix,
                                      index=X.index)
            self.logger.info(self._log_prefix +
                             "PCA completed: retained {} numerical "
                             "features.".format(len(reduced_df.columns)))

        # Bookkeeping common to all reducers: record removals and make
        # sure the target survives into the next iteration.
        retained = reduced_df.columns.values.tolist()
        removed = [
            c for c in df.columns.values
            if c not in retained and c != target
        ]
        self.removed_features[r] = removed
        if target not in reduced_df:
            reduced_df.loc[:, target] = y.tolist()
        df = reduced_df

    all_removed = [c for _, rf in self.removed_features.items() for c in rf]
    all_kept = [c for c in df.columns.tolist() if c != target]
    save_from_removal = [c for c in self._keep_features if c in all_removed]
    for_force_removal = [c for c in self._remove_features if c in all_kept]

    if save_from_removal:
        self.logger.info(self._log_prefix +
                         "Saving features from removal. "
                         "Saved features:\n{}".format(save_from_removal))

    if for_force_removal:
        self.logger.info(
            self._log_prefix + "Forcing removal of features. "
            "Removed features: \n{}".format(for_force_removal))
        self.removed_features['manual'] = for_force_removal

    # BUGFIX: was `... or c != target`, which (since all_kept already
    # excludes target) is always True and made the filter a no-op; the
    # force-removed features were never excluded from retained_features.
    self.retained_features = [
        c for c in all_kept
        if c not in self._remove_features and c != target
    ]
    return self
def fit(self, df, target):
    """Fit the configured feature reducers on df.

    Applies each reducer in ``self.reducers`` ("corr", "tree", "rebate",
    "pca") in sequence, chaining each reducer's output into the next and
    recording the columns dropped by each in ``self.removed_features``.
    The final column set (minus target) is stored in
    ``self.retained_features``.

    Args:
        df (pandas.DataFrame): The dataframe to fit on, containing the
            target column.
        target (str): The name of the target column in ``df``.

    Returns:
        self
    """
    reduced_df = df
    for r in self.reducers:
        if r == "corr":
            reduced_df = self.rm_correlated(df, target)
        # More advanced feature reduction methods
        else:
            X = df.drop(columns=[target])
            y = df[target]
            if r == "tree":
                tbfr = TreeBasedFeatureReduction(
                    mode=regression_or_classification(df[target]),
                    logger=self.logger)
                reduced_df = tbfr.fit_transform(X, y)
                self.reducer_params[r] = {
                    "importance_percentile": tbfr.importance_percentile,
                    "mode": tbfr.mode,
                    "random_state": tbfr.rs
                }
            elif r == "rebate":
                if isinstance(self.n_rebate_features, float):
                    self.logger.info("Retaining fraction {} of current "
                                     "{} features.".format(
                                         self.n_rebate_features,
                                         df.shape[1]))
                    self.n_rebate_features = int(df.shape[1] *
                                                 self.n_rebate_features)
                self.logger.info(
                    "ReBATE MultiSURF running: retaining {} numerical "
                    "features.".format(self.n_rebate_features))
                reduced_df = rebate(df, target,
                                    n_features=self.n_rebate_features)
                self.logger.info(
                    "ReBATE MultiSURF completed: retained {} numerical "
                    "features.".format(len(reduced_df.columns)))
                # BUGFIX: message previously had no {} placeholder, so the
                # feature list passed to .format() was silently dropped.
                self.logger.debug("ReBATE MultiSURF gave the following "
                                  "features: {}".format(
                                      reduced_df.columns.tolist()))
                self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}
            # todo: PCA will not work with string columns!!!!!
            elif r == "pca":
                if isinstance(self.n_pca_features, float):
                    self.logger.info("Retaining fraction {} of current "
                                     "{} features.".format(
                                         self.n_pca_features,
                                         df.shape[1]))
                    self.n_pca_features = int(df.shape[1] *
                                              self.n_pca_features)
                # BUGFIX: log previously formatted self.n_rebate_features
                # here instead of the PCA feature count.
                self.logger.info("PCA running: retaining {} numerical "
                                 "features.".format(self.n_pca_features))
                matrix = PCA(
                    n_components=self.n_pca_features).fit_transform(
                        X.values, y.values)
                pcacols = [
                    "PCA {}".format(i) for i in range(matrix.shape[1])
                ]
                reduced_df = pd.DataFrame(columns=pcacols, data=matrix,
                                          index=X.index)
                self.logger.info("PCA completed: retained {} numerical "
                                 "features.".format(
                                     len(reduced_df.columns)))

        # Record what this reducer dropped, then chain its output forward.
        retained = reduced_df.columns.values.tolist()
        removed = [c for c in df.columns.values if c not in retained]
        self.removed_features[r] = removed
        df = reduced_df

    self.retained_features = [
        c for c in df.columns.tolist() if c != target
    ]
    return self