Example #1
 def test_rebate(self):
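     # Requesting 10 features should produce a DataFrame with exactly 10 columns.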
     df_reduced = rebate(self.test_df, "gap expt", 10)
     self.assertEqual(df_reduced.shape[1], 10)
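
For orientation, a minimal standalone sketch of the same call, assuming
`rebate(df, target, n_features)` ranks columns with MultiSURF and returns a
DataFrame with exactly n_features columns (per the assertion above); the toy
data and the 2-feature request are illustrative, not from the source:

import numpy as np
import pandas as pd

# Hypothetical toy frame: four numeric features plus a "gap expt" target.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(20, 4), columns=["f1", "f2", "f3", "f4"])
df["gap expt"] = rng.rand(20)

# `rebate` is the function exercised by the test above; its import path
# depends on the project layout, so it is assumed to be in scope here.
reduced = rebate(df, "gap expt", 2)
print(reduced.shape[1])  # expected: 2, mirroring the assertion above
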
Example #2
    def fit(self, df, target):
        missing_remove_features = [
            c for c in self._remove_features if c not in df.columns
        ]
        missing_keep_features = [
            c for c in self._keep_features if c not in df.columns
        ]
        for features, name in [(missing_remove_features, 'remove'),
                               (missing_keep_features, 'keep')]:
            if features:
                self.logger.warning(
                    self._log_prefix +
                    "Asked to {} some features that do not exist in the "
                    "dataframe. Skipping the following features:\n{}".format(
                        name, features))

        reduced_df = df
        for r in self.reducers:
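            # Reducers run in sequence: each stage consumes the output of the
            # previous one (df is rebound to reduced_df at the end of the loop).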
            X = df.drop(columns=[target])
            y = df[target]
            if r == "corr":
                reduced_df = self.rm_correlated(df, target,
                                                self.corr_threshold)
                reduced_df = reduced_df.drop(columns=[target])
            if r == "tree":
                tbfr = TreeFeatureReducer(
                    importance_percentile=self.tree_importance_percentile,
                    mode=regression_or_classification(y),
                    logger=self.logger)
                reduced_df = tbfr.fit_transform(X, y).copy(deep=True)
                self.reducer_params[r] = {
                    "importance_percentile": tbfr.importance_percentile,
                    "mode": tbfr.mode,
                    "random_state": tbfr.rs
                }
            elif r == "rebate":
                if isinstance(self.n_rebate_features, float):
                    self.logger.info(
                        self._log_prefix +
                        "Retaining fraction {} of current {} features.".format(
                            self.n_rebate_features, df.shape[1] - 1))
                    self.n_rebate_features = int(
                        (df.shape[1] - 1) * self.n_rebate_features)
                self.logger.info(
                    self._log_prefix +
                    "ReBATE MultiSURF running: retaining {} numerical "
                    "features.".format(self.n_rebate_features))
                reduced_df = rebate(df,
                                    target,
                                    n_features=self.n_rebate_features)
                reduced_df = reduced_df.copy(deep=True)
                self.logger.info(
                    self._log_prefix +
                    "ReBATE MultiSURF completed: retained {} numerical "
                    "features.".format(len(reduced_df.columns)))
                self.logger.debug(
                    self._log_prefix + "ReBATE MultiSURF gave the following "
                    "features: {}".format(reduced_df.columns.tolist()))
                self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}
            elif r == "pca":
                n_samples, n_features = X.shape
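                # PCA yields at most min(n_samples, n_features) components,
                # so the sample count caps the retained dimensionality below.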
                if self.n_pca_features == "auto":
                    if n_samples < n_features:
                        self.logger.warning(
                            self._log_prefix +
                            "Number of samples ({}) is less than number of "
                            "features ({}). Setting n_pca_features equal to "
                            "n_samples.".format(n_samples, n_features))
                        self._pca = PCA(n_components=n_samples,
                                        svd_solver="full")
                    else:
                        self.logger.info(
                            self._log_prefix +
                            "PCA automatically determining optimal number of "
                            "features using Minka's MLE.")
                        self._pca = PCA(n_components="mle", svd_solver="auto")
                else:
                    if isinstance(self.n_pca_features, float):
                        self.logger.info(
                            self._log_prefix +
                            "Retaining fraction {} of current {} features."
                            "".format(self.n_pca_features, n_features))
                        self.n_pca_features = int(n_features *
                                                  self.n_pca_features)
                    if self.n_pca_features > n_samples:
                        self.logger.warning(
                            self._log_prefix +
                            "Number of PCA features interpreted as {}, which is"
                            " more than the number of samples ({}). "
                            "n_pca_features coerced to equal n_samples."
                            "".format(self.n_pca_features, n_samples))
                        self.n_pca_features = n_samples
                    self.logger.info(
                        self._log_prefix +
                        "PCA running: retaining {} numerical features."
                        "".format(self.n_pca_features))
                    self._pca = PCA(n_components=self.n_pca_features,
                                    svd_solver="auto")
                # PCA is unsupervised; the target is not used when fitting.
                self._pca.fit(X.values)
                matrix = self._pca.transform(X.values)
                pca_feats = [
                    "PCA {}".format(i) for i in range(matrix.shape[1])
                ]
                self._pca_feats = pca_feats
                reduced_df = pd.DataFrame(columns=pca_feats,
                                          data=matrix,
                                          index=X.index)
                self.logger.info(self._log_prefix +
                                 "PCA completed: retained {} numerical "
                                 "features.".format(len(reduced_df.columns)))

            retained = reduced_df.columns.values.tolist()
            removed = [
                c for c in df.columns.values
                if c not in retained and c != target
            ]

            self.removed_features[r] = removed
            if target not in reduced_df:
                reduced_df.loc[:, target] = y.tolist()
            df = reduced_df

        all_removed = [
            c for rf in self.removed_features.values() for c in rf
        ]
        all_kept = [c for c in df.columns.tolist() if c != target]
        save_from_removal = [
            c for c in self._keep_features if c in all_removed
        ]
        for_force_removal = [c for c in self._remove_features if c in all_kept]

        if save_from_removal:
            self.logger.info(self._log_prefix +
                             "Saving features from removal. "
                             "Saved features:\n{}".format(save_from_removal))

        if for_force_removal:
            self.logger.info(
                self._log_prefix + "Forcing removal of features. "
                "Removed features: \n{}".format(for_force_removal))
            self.removed_features['manual'] = for_force_removal

        # all_kept already excludes the target; drop force-removed features.
        self.retained_features = [
            c for c in all_kept if c not in self._remove_features
        ]
        return self
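
A hedged usage sketch for this fit(): the enclosing class name and its
constructor signature are not shown here, so both are inferred from the
attributes the body reads (self.reducers, self.corr_threshold,
self.n_rebate_features) and should be treated as hypothetical:

# Hypothetical class name and constructor, inferred from the attributes
# referenced inside fit().
reducer = FeatureReducer(reducers=("corr", "rebate"),
                         corr_threshold=0.95,
                         n_rebate_features=0.5)  # float -> fraction of features
reducer = reducer.fit(df, target="gap expt")     # fit() returns self
print(reducer.retained_features)  # columns surviving every stage
print(reducer.removed_features)   # dict: reducer name -> dropped columns
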
Example #3
    def fit(self, df, target):
        reduced_df = df
        for r in self.reducers:
            if r == "corr":
                reduced_df = self.rm_correlated(df, target)

            # More advanced feature reduction methods
            else:
                X = df.drop(columns=[target])
                y = df[target]

                if r == "tree":
                    tbfr = TreeBasedFeatureReduction(
                        mode=regression_or_classification(df[target]),
                        logger=self.logger)
                    reduced_df = tbfr.fit_transform(X, y)
                    self.reducer_params[r] = {
                        "importance_percentile": tbfr.importance_percentile,
                        "mode": tbfr.mode,
                        "random_state": tbfr.rs
                    }
                elif r == "rebate":
                    if isinstance(self.n_rebate_features, float):
                        self.logger.info("Retaining fraction {} of current "
                                         "{} features.".format(
                                             self.n_rebate_features,
                                             df.shape[1] - 1))
                        self.n_rebate_features = int(
                            (df.shape[1] - 1) * self.n_rebate_features)
                    self.logger.info(
                        "ReBATE MultiSURF running: retaining {} numerical "
                        "features.".format(self.n_rebate_features))
                    reduced_df = rebate(df,
                                        target,
                                        n_features=self.n_rebate_features)
                    self.logger.info(
                        "ReBATE MultiSURF completed: retained {} numerical "
                        "features.".format(len(reduced_df.columns)))
                    self.logger.debug("ReBATE MultiSURF gave the following "
                                      "features: {}".format(
                                          reduced_df.columns.tolist()))
                    self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}

                # TODO: PCA will not work with string columns.
                elif r == "pca":
                    if isinstance(self.n_pca_features, float):
                        self.logger.info("Retaining fraction {} of current "
                                         "{} features.".format(
                                             self.n_pca_features,
                                             df.shape[1] - 1))
                        self.n_pca_features = int(
                            (df.shape[1] - 1) * self.n_pca_features)
                    self.logger.info("PCA running: retaining {} numerical "
                                     "features.".format(
                                         self.n_pca_features))
                    # PCA is unsupervised; the target is not needed here.
                    matrix = PCA(
                        n_components=self.n_pca_features).fit_transform(
                            X.values)
                    pcacols = [
                        "PCA {}".format(i) for i in range(matrix.shape[1])
                    ]
                    reduced_df = pd.DataFrame(columns=pcacols,
                                              data=matrix,
                                              index=X.index)

                    self.logger.info("PCA completed: retained {} numerical "
                                     "features.".format(len(
                                         reduced_df.columns)))

            retained = reduced_df.columns.values.tolist()
            removed = [c for c in df.columns.values
                       if c not in retained and c != target]
            self.removed_features[r] = removed
            # Re-attach the target so the next reducer can still find it.
            if target not in reduced_df.columns:
                reduced_df[target] = df[target]
            df = reduced_df

        self.retained_features = [
            c for c in df.columns.tolist() if c != target
        ]
        return self
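
The TODO in this example flags that PCA fails on string columns. A common
guard (a sketch under that assumption, not code from the source) restricts
the matrix to numeric dtypes before fitting:

import pandas as pd
from sklearn.decomposition import PCA

def pca_numeric_only(df: pd.DataFrame, target: str,
                     n_components: int) -> pd.DataFrame:
    # Keep only numeric feature columns; PCA cannot ingest strings.
    X = df.drop(columns=[target]).select_dtypes(include="number")
    matrix = PCA(n_components=n_components).fit_transform(X.values)
    cols = ["PCA {}".format(i) for i in range(matrix.shape[1])]
    return pd.DataFrame(columns=cols, data=matrix, index=X.index)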