Ejemplo n.º 1
0
def test_backward_subset_feature_selection(train_df, train_fn, eval_fn,
                                           split_fn, base_extractor,
                                           metric_name):
    """Exercise the stopping criteria of ``backward_subset_feature_selection``.

    Three runs are made over the same three feature subsets, each configured
    so that a different criterion is expected to end the selection: the
    minimum number of remaining features, the iteration limit, and the
    early-stop counter.
    """
    features_sets = {
        "first": ["x1", "x2"],
        "second": ["x4", "x5"],
        "third": ["x3", "x6"],
    }

    def run_selection(**stop_settings):
        # Everything except the stopping knobs is shared by all scenarios.
        return backward_subset_feature_selection(
            train_df, train_fn, features_sets, split_fn, eval_fn,
            base_extractor, metric_name,
            num_removed_by_step=1,
            **stop_settings)

    # Scenario 1: stop by remaining features.
    logs = run_selection(threshold=-1,
                         early_stop=10,
                         iter_limit=50,
                         min_remaining_features=5)
    head_log = first(logs)
    used_features = get_used_features(head_log[0])
    assert len(used_features) <= 5

    # Scenario 2: stop by iteration limit.
    logs = run_selection(threshold=0,
                         early_stop=10,
                         iter_limit=1,
                         min_remaining_features=3)
    assert len(logs) == 1

    # Scenario 3: stop by early_stop.
    logs = run_selection(threshold=1,
                         early_stop=2,
                         iter_limit=50,
                         min_remaining_features=1)
    assert len(logs) == 2
Ejemplo n.º 2
0
def test_poor_man_boruta_selection(train_df, holdout_df, train_fn, eval_fn,
                                   base_extractor, metric_name):
    """Exercise the stopping criteria of ``poor_man_boruta_selection``.

    The selector is run three times over the same six features, each time
    with settings meant to trigger a different stopping criterion: minimum
    remaining features, iteration limit, and early stop.
    """
    features = ["x%d" % idx for idx in range(1, 7)]

    def run_selection(**stop_settings):
        # Data, functions and step size are identical across scenarios.
        return poor_man_boruta_selection(
            train_df, holdout_df, train_fn, features, eval_fn,
            base_extractor, metric_name,
            max_removed_by_step=1,
            **stop_settings)

    # Scenario 1: stop by remaining features.
    logs = run_selection(threshold=0,
                         early_stop=10,
                         iter_limit=50,
                         min_remaining_features=5)
    used_features = get_used_features(first(logs))
    assert len(used_features) <= 6

    # Scenario 2: stop by iteration limit.
    logs = run_selection(threshold=0,
                         early_stop=10,
                         iter_limit=1,
                         min_remaining_features=3)
    assert len(logs) == 1

    # Scenario 3: stop by early_stop.
    logs = run_selection(threshold=1,
                         early_stop=2,
                         iter_limit=50,
                         min_remaining_features=1)
    assert len(logs) == 2
Ejemplo n.º 3
0
def test_feature_importance_backward_selection(train_df, train_fn, eval_fn,
                                               split_fn, base_extractor,
                                               metric_name):
    """Exercise the stopping criteria of ``feature_importance_backward_selection``.

    The selector is run three times over the same six features, each time
    with settings meant to trigger a different stopping criterion: minimum
    remaining features, iteration limit, and early stop.
    """
    features = ["x%d" % idx for idx in range(1, 7)]

    def run_selection(**stop_settings):
        # Data, functions and step size are identical across scenarios.
        return feature_importance_backward_selection(
            train_df, train_fn, features, split_fn, eval_fn,
            base_extractor, metric_name,
            num_removed_by_step=1,
            **stop_settings)

    # Scenario 1: stop by remaining features.
    logs = run_selection(threshold=0,
                         early_stop=10,
                         iter_limit=50,
                         min_remaining_features=5)
    used_features = get_used_features(first(logs))
    assert len(used_features) <= 5

    # Scenario 2: stop by iteration limit.
    logs = run_selection(threshold=0,
                         early_stop=10,
                         iter_limit=1,
                         min_remaining_features=3)
    assert len(logs) == 1

    # Scenario 3: stop by early_stop.
    logs = run_selection(threshold=1,
                         early_stop=2,
                         iter_limit=50,
                         min_remaining_features=1)
    assert len(logs) == 2
Ejemplo n.º 4
0
def stop_by_num_features(logs: ListLogListType,
                         min_num_features: int = 50) -> bool:
    """
    Tells whether feature selection should stop because the number of
    features used in the first entry of ``logs`` is already at or below
    the allowed minimum.

    Parameters
    ----------
    logs : list of list of dict
        A list of log-like lists of evaluation dictionaries. Only the first
        element is inspected.

    min_num_features: int (default 50)
        The minimum number of features the model can have before stopping.

    Returns
    -------
    stop: bool
        True when the number of used features is less than or equal to
        ``min_num_features``, False otherwise.
    """
    head_log = first(logs)
    used_features = get_used_features(head_log)
    n_used = len(used_features)
    return n_used <= min_num_features
Ejemplo n.º 5
0
def test_get_used_features(logs):
    """Check the feature list that ``get_used_features`` returns for the first log."""
    expected_features = ['x1', 'x2', 'x4', 'x5', 'x3', 'x6']
    assert get_used_features(logs[0]) == expected_features
Ejemplo n.º 6
0
def remove_by_feature_shuffling(log: LogType,
                                predict_fn: PredictFnType,
                                eval_fn: EvalFnType,
                                eval_data: pd.DataFrame,
                                extractor: ExtractorFnType,
                                metric_name: str,
                                max_removed_by_step: int = 50,
                                threshold: float = 0.005,
                                speed_up_by_importance: bool = False,
                                parallel: bool = False,
                                nthread: int = 1,
                                seed: int = 7) -> List[str]:
    """
        Performs feature selection by comparing the metric recorded in ``log``
        against the metric obtained on ``eval_data`` when one feature at a
        time is randomly shuffled.

        Parameters
        ----------
        log : LogType
            Dictionaries evaluations.

        predict_fn: function pandas.DataFrame -> pandas.DataFrame
            A partially defined predictor that takes a DataFrame and returns the
            predicted score for this dataframe

        eval_fn : function DataFrame -> log dict
            A partially defined evaluation function that takes a dataset with prediction and
            returns the evaluation logs.

        eval_data: pandas.DataFrame
            Data used to evaluate the model after shuffling

        extractor: function str -> float
            An extractor that takes a string and returns the value of that string on a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        max_removed_by_step: int (default 50)
            The maximum number of features returned. If speed_up_by_importance=True, only the last
            max_removed_by_step features given by ``order_feature_importance_avg_from_logs`` are shuffled
            (presumably the least important ones; the ordering is defined by that helper). If
            speed_up_by_importance=False, all used features are shuffled. In both cases a feature is only
            returned if the drop in performance caused by shuffling it is below the defined threshold.

        threshold: float (default 0.005)
            Threshold for model performance comparison. A feature is selected when
            (current metric - metric with the feature shuffled) < threshold.

        speed_up_by_importance: bool (default False)
            If it should narrow the search by looking at feature importance first, shuffling only a subset
            of max_removed_by_step features instead of every used feature.

        parallel: bool (default False)
            If True, the per-feature evaluations are dispatched through ``Parallel`` with the
            "threading" backend; otherwise they run sequentially.

        nthread: int (default 1)
            Number of jobs handed to ``Parallel`` when parallel=True.

        seed: int (default 7)
            Random seed. It seeds Python's ``random`` module and is also used as the ``random_state``
            of every column shuffle, so results are reproducible.

        Returns
        ----------
        features: list of str
            The features whose metric drop after shuffling is below ``threshold``, ordered by
            increasing drop and truncated to at most ``max_removed_by_step`` entries.

    """
    # Kept for backward compatibility: callers may rely on the global RNG being seeded here.
    random.seed(seed)

    curr_metric = get_avg_metric_from_extractor(log, extractor, metric_name)
    eval_size = eval_data.shape[0]

    if speed_up_by_importance:
        features_to_shuffle = order_feature_importance_avg_from_logs(log)[-max_removed_by_step:]
    else:
        features_to_shuffle = get_used_features(log)

    def shuffle(feature: str) -> pd.DataFrame:
        # ``.values`` discards the index of the sampled Series. Assigning the
        # Series itself would make pandas re-align it on the index, silently
        # restoring the original order (i.e. no shuffle at all).
        # ``random_state=seed`` makes the permutation reproducible; pandas
        # sampling is not driven by Python's ``random`` module.
        shuffled_values = eval_data[feature].sample(frac=1.0, random_state=seed).values
        return eval_data.assign(**{feature: shuffled_values})

    # feature name -> (current metric - metric obtained with that feature shuffled)
    delta_metric_for = compose(
        lambda m: curr_metric - m,
        get_avg_metric_from_extractor(extractor=extractor,
                                      metric_name=metric_name),
        gen_validator_log(fold_num=0, test_size=eval_size), eval_fn,
        predict_fn, shuffle)

    if parallel:
        metrics = Parallel(n_jobs=nthread, backend="threading")(
            delayed(delta_metric_for)(feature)
            for feature in features_to_shuffle)
        feature_to_delta_metric = dict(zip(features_to_shuffle, metrics))
        gc.collect()

    else:
        feature_to_delta_metric = {
            feature: delta_metric_for(feature)
            for feature in features_to_shuffle
        }

    # Keep features whose shuffle barely hurts the metric, least harmful first.
    return pipe(feature_to_delta_metric,
                valfilter(lambda delta_metric: delta_metric < threshold),
                sorted(key=lambda f: feature_to_delta_metric.get(f)),
                take(max_removed_by_step), list)