Code example #1
    def get_absolute_scores(self) -> np.ndarray:

        # Check that fit has been called first
        check_true(self._is_initial_fit,
                   Exception("Call fit before getting importances"))

        return self._imp.abs_scores
Code example #2
    def transform(self, data: pd.DataFrame) -> pd.DataFrame:

        # Check that fit has been called first
        check_true(self._is_initial_fit,
                   Exception("Call fit before transform"))

        # Return transformed data
        return self._imp.transform(data)
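
A minimal usage sketch for the fit/transform flow guarded by these two methods. The import path feature.selector and the constructor arguments are assumptions inferred from the file names and examples in this listing; the toy data frame is made up.

import numpy as np
import pandas as pd

from feature.selector import Selective, SelectionMethod  # assumed import path

# Made-up data: three numeric features and a numeric target
data = pd.DataFrame(np.random.rand(100, 3), columns=["x1", "x2", "x3"])
labels = pd.Series(np.random.rand(100))

# Variance-based selection; threshold=0.0 drops only constant columns
selector = Selective(SelectionMethod.Variance(threshold=0.0))
selector.fit(data, labels)

# Both calls below require fit() first, otherwise check_true raises the exceptions above
subset = selector.transform(data)
scores = selector.get_absolute_scores()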
Code example #3
 def _validate(self):
     check_true(isinstance(self.threshold, (int, float)),
                TypeError("Threshold must a non-negative number."))
     check_true(
         self.threshold >= 0,
         ValueError("Threshold must be greater or equal to zero."))
     check_true(self.threshold <= 1,
                ValueError("Threshold must be less or equal to one."))
     check_true(
         self.method in ["pearson", "kendall", "spearman"],
         ValueError(
             "Method of correlation can be pearson, kendall, or spearman."
         ))
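
A hedged sketch of how these checks surface to a caller, assuming Selective runs the method's _validate() at construction time (as _validate_args in code example #7 suggests); the constructor arguments shown are assumptions.

from feature.selector import Selective, SelectionMethod  # assumed import path

# A valid configuration passes all four checks
Selective(SelectionMethod.Correlation(threshold=0.9, method="pearson"))

# An out-of-range threshold should trip the third check above
try:
    Selective(SelectionMethod.Correlation(threshold=1.5, method="pearson"))
except ValueError as exp:
    print(exp)  # Threshold must be less or equal to one.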
Code example #4
    def _validate_fit(self, data, labels):

        # VIF is a Statistical method, hence BaseSupervised, but does not need labels
        if isinstance(
                self._imp, _Statistical
        ) and self.selection_method.method == "variance_inflation":
            pass
        else:
            # Supervised implementors, except VIF, require labels
            if isinstance(self._imp, _BaseSupervisedSelector):
                check_true(labels is not None,
                           ValueError("Labels column cannot be none"))
                check_true(
                    isinstance(labels, pd.Series),
                    ValueError("Labels should be a pandas series/column."))

        if not hasattr(self.selection_method, 'num_features'):
            return

        if not isinstance(self.selection_method.num_features, int):
            return

        # When num_features is an integer, it should be less than or equal to the number of feature columns
        # The float case is validated when the selection method is created
        check_true(
            self.selection_method.num_features <= len(data.columns),
            ValueError("num_features cannot exceed size of feature columns " +
                       str(self.selection_method.num_features) + " vs. " +
                       str(len(data.columns))))
Code example #5
 def _validate(self):
     check_true(isinstance(self.num_features, (int, float)),
                TypeError("Num features must a number."))
     check_true(self.num_features > 0,
                ValueError("Num features must be greater than zero."))
     if isinstance(self.num_features, float):
         check_true(
             self.num_features <= 1,
             ValueError("Num features ratio must be between [0..1]."))
     check_true(
         self.method in [
             "anova", "chi_square", "mutual_info", "maximal_info",
             "variance_inflation"
         ],
         ValueError(
             "Statistical method can only be anova, chi_square, mutual_info, maximal_info, or variance_inflation."
         ))
Code example #6
 def _validate(self):
     check_true(isinstance(self.num_features, (int, float)),
                TypeError("Num features must a number."))
     check_true(self.num_features > 0,
                ValueError("Num features must be greater than zero."))
     if isinstance(self.num_features, float):
         check_true(
             self.num_features <= 1,
             ValueError("Num features ratio must be between [0..1]."))
     if self.estimator is not None:
         check_true(
             isinstance(
                 self.estimator,
                 (RandomForestRegressor, RandomForestClassifier,
                  XGBClassifier, XGBRegressor, ExtraTreesClassifier,
                  ExtraTreesRegressor, LGBMClassifier, LGBMRegressor,
                  GradientBoostingClassifier, GradientBoostingRegressor,
                  AdaBoostClassifier, AdaBoostRegressor,
                  CatBoostClassifier, CatBoostRegressor)),
             ValueError("Unknown tree-based estimator" +
                        str(self.estimator)))
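
A hedged sketch of configuring a tree-based method with one of the whitelisted estimators above. The keyword names num_features and estimator follow the fields referenced in _validate, but the exact constructor signature is an assumption.

from sklearn.ensemble import RandomForestClassifier
from feature.selector import Selective, SelectionMethod  # assumed import path

method = SelectionMethod.TreeBased(num_features=10,
                                   estimator=RandomForestClassifier(n_estimators=100))
selector = Selective(method)  # an unsupported estimator type would raise the ValueError above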
Code example #7
    def _validate_args(seed, selection_method) -> NoReturn:
        """
        Validates arguments for the constructor.
        """

        # Seed
        check_true(isinstance(seed, int),
                   TypeError("The seed must be an integer."))
        check_true(seed >= 0,
                   TypeError("The seed must be a non-negative integer."))

        # Selection Method type
        check_true(
            isinstance(selection_method,
                       (SelectionMethod.Correlation, SelectionMethod.Linear,
                        SelectionMethod.TreeBased, SelectionMethod.Statistical,
                        SelectionMethod.Variance)),
            TypeError("Unknown selection type: " + str(selection_method)))

        # Selection method value
        selection_method._validate()
Code example #8
def plot_importance(scores: pd.DataFrame,
                    columns: Optional[list] = None,
                    max_num_features: Optional[int] = None,
                    normalize: Optional[str] = None,
                    ignore_constant: Optional[bool] = True,
                    **kwargs):
    """Plot feature selector scores.

    Parameters
    ----------
    scores: pd.DataFrame
        Data frame with scores for each feature (index) and method (columns).
        Each feature could have multiple rows from different cross-validation folds.
    columns: list (default=None)
        List of methods (columns) to include in statistics.
        If None, all methods (columns) will be used.
    max_num_features: int or None, optional (default=None)
        Max number of top features displayed on plot.
        If None all features will be displayed.
    normalize: bool, optional (default=None)
        Whether to normalize scores such that scores sum to 1 for each column.
        This ensures that scores are comparable between different methods.
    ignore_constant: bool, optional (default=True)
        Whether to ignore columns with the same score for all features.
    **kwargs
        Other parameters passed to ``sns.catplot``.

    Returns
    -------
    ax : matplotlib.axes.Axes
        The plot with feature scores.
    """

    check_true(isinstance(scores, pd.DataFrame),
               ValueError("Selector scores must be a data frame."))

    # Get columns to use
    if columns is None:
        columns = scores.columns

    # Make copy of data frame
    # Fill nan with zero
    df = scores[columns].copy()
    df.fillna(0, inplace=True)

    # Group by feature for CV results
    df = df.groupby(df.index).mean()

    # Get normalized scores such that scores for each method sums to 1
    if normalize:
        df = normalize_columns(df)

    # Drop methods with constant scores
    if ignore_constant:
        mask = ~np.isclose(np.var(df, axis=0), 0)
        df = df.loc[:, mask]

    # Set max_num_features to total number of features if None
    if max_num_features is None:
        max_num_features = len(df)

    # Calculate the mean score and sort in descending order
    mean_score = np.mean(df, axis=1)
    index = (-mean_score).argsort().values
    df = df.iloc[index, :]

    # Convert data to long format and plot
    df = df.head(max_num_features).reset_index().melt(id_vars="index")
    ax = sns.catplot(x="index",
                     y="value",
                     data=df,
                     kind="bar",
                     color="darkgreen",
                     **kwargs)
    ax.set_xlabels("feature")
    ax.set_ylabels("score")

    return ax
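
A hedged usage sketch for plot_importance, fed with the score data frame produced by benchmark (code examples #10/#11); the selectors dictionary, data, and labels are assumed to be defined as in the sketch after code example #11.

import matplotlib.pyplot as plt

score_df, selected_df, runtime_df = benchmark(selectors, data, labels)

# Normalize scores per method so the bars are comparable, and show the top 20 features
ax = plot_importance(score_df, normalize=True, max_num_features=20)
plt.show()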
Code example #9
def calculate_statistics(
        scores: pd.DataFrame,
        selected: pd.DataFrame,
        columns: Optional[list] = None,
        ignore_constant: Optional[bool] = True) -> pd.DataFrame:
    """
    Calculate statistics for each feature using scores/selections from list of methods.
    Returns data frame with calculated statistics for each feature.

    Parameters
    ----------
    scores:  pd.DataFrame
        Data frame with scores for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
    selected: pd.DataFrame
        Data frame with selection flag for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
    columns: list (default=None)
        List of methods (columns) to include in statistics.
        If None, all methods (columns) will be used.
    ignore_constant: bool, optional (default=True)
        Whether to ignore methods with the same score for all features.

    Returns
    -------
    Data frame with statistics for each feature
    """

    check_true(isinstance(scores, pd.DataFrame),
               ValueError("scores must be a data frame."))
    check_true(isinstance(selected, pd.DataFrame),
               ValueError("selection must be a data frame."))
    check_true(
        scores.shape == selected.shape,
        ValueError("Shapes of scores and selected data frames must match."))
    check_true(
        np.all(scores.index == selected.index),
        ValueError("Index of score and selection data frames must match."))
    check_true(
        np.all(scores.columns == selected.columns),
        ValueError("Columns of score and selection data frames must match."))

    # Get columns to use
    if columns is None:
        columns = scores.columns

    # Copy data frames
    scores_df = scores[columns].copy()
    selected_df = selected[columns].copy()

    # Group by feature for CV results
    scores_df = scores_df.groupby(scores_df.index).mean()
    selected_df = selected_df.groupby(selected_df.index).mean()

    # Drop methods with constant scores
    if ignore_constant:
        mask = ~np.isclose(np.var(scores_df, axis=0), 0)
        scores_df = scores_df.loc[:, mask]
        selected_df = selected_df.loc[:, mask]

    # Calculate statistics
    stats_df = pd.DataFrame(index=scores_df.index)
    stats_df["score_mean"] = scores_df.mean(axis=1)
    stats_df["score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
    stats_df["selection_freq"] = selected_df.sum(axis=1)
    stats_df["selection_freq_norm"] = normalize_columns(selected_df).sum(
        axis=1)

    # Sort
    stats_df.sort_values(by="score_mean_norm", ascending=False, inplace=True)

    return stats_df
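
A hedged sketch combining benchmark output with calculate_statistics. A cross-validated run (cv=5) yields multiple rows per feature, which the groupby above averages; the selectors, data, and labels objects are assumed as in the sketch after code example #11.

score_df, selected_df, runtime_df = benchmark(selectors, data, labels, cv=5)

# One row per feature with mean/normalized scores and selection frequency
stats_df = calculate_statistics(score_df, selected_df)
print(stats_df.head())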
Code example #10
def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                      SelectionMethod.Linear,
                                      SelectionMethod.TreeBased,
                                      SelectionMethod.Statistical,
                                      SelectionMethod.Variance]],
           data: pd.DataFrame,
           labels: Optional[pd.Series] = None,
           output_filename: Optional[str] = None,
           drop_zero_variance_features: Optional[bool] = True,
           verbose: bool = False) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features, and runtime for each method.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    """

    check_true(selectors is not None,
               ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Output files
    if output_filename is not None:
        output_file = open(output_filename, "a")
    else:
        output_file = None

    # Drop features without any variance
    if drop_zero_variance_features:
        selector = Selective(SelectionMethod.Variance())
        data = selector.fit_transform(data, labels)

    method_to_runtime = {}
    score_df = pd.DataFrame(index=data.columns)
    selected_df = pd.DataFrame(index=data.columns)
    for method_name, method in selectors.items():
        selector = Selective(method)
        t0 = time()
        if verbose:
            print("\n>>> Running", method_name)
        scores = None
        selected = []
        try:
            subset = selector.fit_transform(data, labels)
            scores = selector.get_absolute_scores()
            selected = [1 if c in subset.columns else 0 for c in data.columns]
            method_to_runtime[method_name] = round((time() - t0) / 60, 2)
        except Exception as exp:
            print("Exception", exp)
            scores = np.repeat(0, len(data.columns))
            selected = np.repeat(0, len(data.columns))
            method_to_runtime[method_name] = str(round(
                (time() - t0) / 60, 2)) + " (exception)"
        finally:
            score_df[method_name] = scores
            selected_df[method_name] = selected
            if output_filename is not None:
                output_file.write(method_name + " " +
                                  str(method_to_runtime[method_name]) + "\n")
                output_file.write(str(selected) + "\n")
                output_file.write(str(scores) + "\n")
            if verbose:
                print(
                    f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

    # Close output file
    if output_file is not None:
        output_file.close()

    # Format
    runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis(
        "method").reset_index()

    return score_df, selected_df, runtime_df
Code example #11
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              cv: Optional[int] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False,
              seed: int = Constants.default_seed) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features, and runtime for each method.

    Parameters
    ----------
    selectors:  Dict[str, Union[SelectionMethod.Correlation,
                                SelectionMethod.Linear,
                                SelectionMethod.TreeBased,
                                SelectionMethod.Statistical,
                                SelectionMethod.Variance]]
        Dictionary of feature selection methods to benchmark.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    cv: int, optional (default=None)
        Number of folds to use for cross-validation.
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.
    seed: int, optional (default=Constants.default_seed)
        The random seed to initialize the random number generator.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    If cv is not None, the data frames will contain the concatenated results from each fold.
    """

    check_true(selectors is not None,
               ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    if cv is None:
        return _bench(selectors=selectors,
                      data=data,
                      labels=labels,
                      output_filename=output_filename,
                      drop_zero_variance_features=drop_zero_variance_features,
                      verbose=verbose)
    else:

        # Create K-Fold object
        kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

        # Initialize variables
        t0 = time()
        train_labels, test_labels = None, None
        score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(
        ), pd.DataFrame()

        # Split data into cv-folds and run _bench for each fold
        if verbose:
            print("\n>>> Running")
        for fold, (train_index, _) in enumerate(kf.split(data)):

            if verbose:
                print("\tFold", fold, "...")

            # Split data, labels into folds
            train_data = data.iloc[train_index]
            if labels is not None:
                train_labels = labels.iloc[train_index]

            # Run benchmark
            score_cv_df, selected_cv_df, runtime_cv_df = _bench(
                selectors=selectors,
                data=train_data,
                labels=train_labels,
                output_filename=output_filename,
                drop_zero_variance_features=drop_zero_variance_features,
                verbose=False)

            # Concatenate data frames
            score_df = pd.concat((score_df, score_cv_df))
            selected_df = pd.concat((selected_df, selected_cv_df))
            runtime_df = pd.concat((runtime_df, runtime_cv_df))

        if verbose:
            print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

        return score_df, selected_df, runtime_df
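
A hedged end-to-end sketch for the cross-validated benchmark; the individual method parameters are illustrative values, not documented defaults, and the data/labels objects are the made-up ones from the sketch after code example #2.

selectors = {
    "variance": SelectionMethod.Variance(threshold=0.0),
    "correlation": SelectionMethod.Correlation(threshold=0.5, method="pearson"),
    "statistical": SelectionMethod.Statistical(num_features=5, method="anova"),
    "tree": SelectionMethod.TreeBased(num_features=5),
}

# 5-fold run; each returned frame holds the concatenated per-fold results
score_df, selected_df, runtime_df = benchmark(selectors, data, labels,
                                              cv=5, verbose=True)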
Code example #12
 def _validate(self):
     check_true(isinstance(self.threshold, (int, float)),
                TypeError("Threshold must a non-negative number."))
     check_true(self.threshold >= 0,
                ValueError("Threshold must be non-negative."))
Code example #13
 def _validate(self):
     check_true(isinstance(self.num_features, (int, float)),
                TypeError("Num features must a number."))
     check_true(self.num_features > 0,
                ValueError("Num features must be greater than zero."))
     if isinstance(self.num_features, float):
         check_true(
             self.num_features <= 1,
             ValueError("Num features ratio must be between [0..1]."))
     check_true(
         self.regularization in ["none", "lasso", "ridge"],
         ValueError(
             "Regularization can only be none, lasso, or ridge."))
     check_true(isinstance(self.alpha, (int, float)),
                TypeError("Alpha must a number."))
     check_true(self.alpha >= 0, ValueError("Alpha cannot be negative"))
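
A hedged sketch of a linear selection method satisfying the checks above; the keyword names follow the fields referenced in _validate and the values are illustrative.

method = SelectionMethod.Linear(num_features=10, regularization="ridge", alpha=1.0)
selector = Selective(method)
subset = selector.fit_transform(data, labels)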
Code example #14
File: selector.py  Project: ayush488/selective
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.
    Return a tuple of data frames with scores, selected features, and runtime for each method.

    Parameters
    ----------
    selectors:  Dict[str, Union[SelectionMethod.Correlation,
                                SelectionMethod.Linear,
                                SelectionMethod.TreeBased,
                                SelectionMethod.Statistical,
                                SelectionMethod.Variance]]
        Dictionary of feature selection methods to benchmark.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    """

    check_true(selectors is not None,
               ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))

    # Output files
    if output_filename is not None:
        output_file = open(output_filename, "a")
    else:
        output_file = None

    # Drop features without any variance
    if drop_zero_variance_features:
        selector = Selective(SelectionMethod.Variance())
        data = selector.fit_transform(data, labels)

    method_to_runtime = {}
    score_df = pd.DataFrame(index=data.columns)
    selected_df = pd.DataFrame(index=data.columns)
    for method_name, method in selectors.items():
        selector = Selective(method)
        t0 = time()
        if verbose:
            print("\n>>> Running", method_name)
        scores = None
        selected = []
        try:
            subset = selector.fit_transform(data, labels)
            scores = selector.get_absolute_scores()
            selected = [1 if c in subset.columns else 0 for c in data.columns]
            method_to_runtime[method_name] = round((time() - t0) / 60, 2)
        except Exception as exp:
            print("Exception", exp)
            scores = np.repeat(0, len(data.columns))
            selected = np.repeat(0, len(data.columns))
            method_to_runtime[method_name] = str(round(
                (time() - t0) / 60, 2)) + " (exception)"
        finally:
            score_df[method_name] = scores
            selected_df[method_name] = selected
            if output_filename is not None:
                output_file.write(method_name + " " +
                                  str(method_to_runtime[method_name]) + "\n")
                output_file.write(str(selected) + "\n")
                output_file.write(str(scores) + "\n")
            if verbose:
                print(
                    f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

    # Close output file
    if output_file is not None:
        output_file.close()

    # Format runtime as a data frame
    runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis(
        "method").reset_index()

    return score_df, selected_df, runtime_df