Example 1
def compute_permutation_feature_importance(
        X: pd.DataFrame,
        y: pd.Series,
        predict_func: Callable[..., np.ndarray],
        eval_metric: Scorer,
        features: list = None,
        subsample_size: int = None,
        num_shuffle_sets: int = None,
        predict_func_kwargs: dict = None,
        transform_func: Callable[..., pd.DataFrame] = None,
        transform_func_kwargs: dict = None,
        time_limit: float = None,
        silent: bool = False,
        log_prefix: str = '',
        importance_as_list: bool = False) -> pd.DataFrame:
    """
    Computes a trained model's feature importance via permutation shuffling (https://explained.ai/rf-importance/).
    A feature's importance score represents the performance drop that results when the model makes predictions on a perturbed copy of the dataset where this feature's values have been randomly shuffled across rows.
    A feature score of 0.01 would indicate that the predictive performance dropped by 0.01 when the feature was randomly shuffled.
    The higher the score a feature has, the more important it is to the model's performance.
    If a feature has a negative score, this means that the feature is likely harmful to the final model, and a model trained with the feature removed would be expected to achieve a better predictive performance.
    Note that calculating feature importance can be a very computationally expensive process, particularly if the model uses hundreds or thousands of features. In many cases, this can take longer than the original model training.

    Note: For highly accurate stddev and z_score estimates, it is recommended to set `subsample_size` to at least 5,000 if possible and `num_shuffle_sets` to at least 10.

    Parameters
    ----------
    X : pd.DataFrame
        Validation data to permute when calculating feature importances.
        Do not use training data as it will result in overfit feature importances.
    y : pd.Series
        Label values of X. The index of X and y must align.
    predict_func : Callable[..., np.ndarray]
        Function that computes model predictions or prediction probabilities on input data.
        Output must be in the form of a numpy ndarray or pandas Series or DataFrame.
        Output `y_pred` must be in a form acceptable as input to `eval_metric(y, y_pred)`.
        If using a fit model object, this is typically `model.predict` or `model.predict_proba`, depending on the `eval_metric` being used.
        If `eval_metric.needs_pred==True`, use `model.predict`, otherwise use `model.predict_proba`.
    eval_metric : Scorer
        Object that computes a score given ground truth labels and predictions or prediction probabilities (depending on the type of metric).
        If using a fit model object, this is typically `model.eval_metric`.
        Feature importances will be based on the change in the score produced by `eval_metric` when each feature's values are permutation shuffled.
    features : list, default None
        List of features to calculate importances for.
        If None, all features' importances will be calculated.
    subsample_size : int, default None
        The amount of data rows to sample when computing importances.
        Higher values will improve the quality of feature importance estimates, but linearly increase the runtime.
        If None, all provided data will be used.
    num_shuffle_sets : int, default None
        The number of different permutation shuffles of the data that are evaluated.
        Shuffle sets are generated with different random seeds and importances are averaged across all shuffle sets to get the final feature importance values.
        Higher values will improve the quality of feature importance estimates, but linearly increase the runtime.
        `subsample_size` should be increased before `num_shuffle_sets` if runtime is a concern.
        Defaults to 1 if `time_limit` is None or 10 if `time_limit` is specified.
        When `num_shuffle_sets` is greater than 1, feature importance standard deviation and z-score will additionally be computed by using the results of each shuffle set as samples.
    predict_func_kwargs : dict, default {}
        Keyword arguments to be appended to calls to `predict_func(X, **kwargs)`.
    transform_func : Callable[..., pd.DataFrame], default None
        Transformation function that takes the raw input and transforms it row-wise to the input expected by `predict_func`.
        Common examples include `model.preprocess` and `feature_generator.transform`.
        If None, then no transformation is done on the data prior to calling `predict_func`.
        This is necessary to compute the importance of the original features in `X` prior to their transformation, assuming `predict_func` does not already perform the transformation.
            Example: `transform_func` is necessary to compute the importance of a raw text feature when `predict_func` expects ngram features as input, since the text must first be transformed into ngrams.
    transform_func_kwargs : dict, default {}
        Keyword arguments to be appended to calls to `transform_func(X, **kwargs)`.
    time_limit : float, default None
        Time in seconds to limit the calculation of feature importance.
        If None, feature importance will be calculated without early stopping.
        A minimum of 1 full shuffle set will always be evaluated. If evaluating a single shuffle set takes longer than `time_limit`, the method will still take that long to return, regardless of `time_limit`.
        If `num_shuffle_sets==1`, `time_limit` will be ignored.
    silent : bool, default False
        Whether to suppress logging output.
    log_prefix : str, default ''
        Prefix to add to logging statements.
    importance_as_list : bool, default False
        Whether to return the 'importance' column values as a list of the importance from each shuffle (True) or a single averaged value (False).

    Returns
    -------
    `pandas.DataFrame` of feature importance scores with 4 columns:
        index: The feature name.
        'importance': The estimated feature importance score.
        'stddev': The standard deviation of the feature importance score. If NaN, then `num_shuffle_sets` was too low to calculate a variance (at least 2 shuffle sets are required).
        'p_value': P-value for a statistical t-test of the null hypothesis: importance = 0, vs the (one-sided) alternative: importance > 0.
            Features with low p-value appear confidently useful to the predictor, while the other features may be useless to the predictor (or even harmful to include in its training data).
            A low p-value (e.g., 0.01) means the observed importance would be very unlikely if the feature were truly useless, providing strong evidence that the feature is useful.
            A high p-value (e.g., 0.99) means there is no such evidence; the feature may well be useless or even harmful.
        'n': The number of shuffles performed to estimate importance score (corresponds to sample-size used to determine confidence interval for true score).
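
    Examples
    --------
    Illustrative usage sketch. The model, data, and metric below are hypothetical
    stand-ins; any callable scoring `metric(y_true, y_pred)` is assumed to be
    acceptable where a `Scorer` is expected, since this function only ever invokes
    `eval_metric(y, y_pred)`.

    >>> # hypothetical model/data/metric stand-ins:
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.metrics import accuracy_score
    >>> from sklearn.model_selection import train_test_split
    >>> data = load_breast_cancer(as_frame=True)
    >>> X_train, X_val, y_train, y_val = train_test_split(
    ...     data.data, data.target, test_size=0.3, random_state=0)
    >>> model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    >>> fi_df = compute_permutation_feature_importance(
    ...     X=X_val, y=y_val,
    ...     predict_func=model.predict,
    ...     eval_metric=accuracy_score,
    ...     num_shuffle_sets=10)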
    """
    if num_shuffle_sets is None:
        num_shuffle_sets = 1 if time_limit is None else 10

    time_start = time.time()
    if predict_func_kwargs is None:
        predict_func_kwargs = dict()
    if transform_func_kwargs is None:
        transform_func_kwargs = dict()
    if features is None:
        features = list(X.columns)
    num_features = len(features)

    if subsample_size is not None:
        num_rows = min(len(X), subsample_size)
    else:
        num_rows = len(X)
    subsample = num_rows < len(X)

    if not silent:
        logging_message = f'{log_prefix}Computing feature importance via permutation shuffling for {num_features} features using {num_rows} rows with {num_shuffle_sets} shuffle sets...'
        if time_limit is not None:
            logging_message = f'{logging_message} Time limit: {time_limit}s...'
        logger.log(20, logging_message)

    time_permutation_start = time.time()
    fi_dict_list = []
    shuffle_repeats_completed = 0
    log_final_suffix = ''

    X_orig = X
    y_orig = y
    feature_batch_count = None
    X_raw = None
    score_baseline = None
    # TODO: Can speedup shuffle_repeats by incorporating into X_raw (do multiple repeats in a single predict call)
    for shuffle_repeat in range(num_shuffle_sets):
        fi = dict()

        if subsample:
            # TODO: Stratify? We currently don't know in this function the problem_type (could pass as additional arg).
            X = X_orig.sample(subsample_size, random_state=shuffle_repeat)
            y = y_orig.loc[X.index]

        # recompute the baseline score whenever the evaluation rows change (first repeat, or every repeat when subsampling)
        if subsample or shuffle_repeat == 0:
            time_start_score = time.time()
            X_transformed = X if transform_func is None else transform_func(
                X, **transform_func_kwargs)
            y_pred = predict_func(X_transformed, **predict_func_kwargs)
            score_baseline = eval_metric(y, y_pred)
            if shuffle_repeat == 0:
                if not silent:
                    time_score = time.time() - time_start_score
                    time_estimated = (
                        (num_features + 1) * time_score
                    ) * num_shuffle_sets + time_start_score - time_start
                    time_estimated_per_set = time_estimated / num_shuffle_sets
                    logger.log(
                        20,
                        f'{log_prefix}\t{round(time_estimated, 2)}s\t= Expected runtime ({round(time_estimated_per_set, 2)}s per shuffle set)'
                    )

                if transform_func is None:
                    feature_batch_count = _get_safe_fi_batch_count(
                        X=X, num_features=num_features)
                else:
                    feature_batch_count = _get_safe_fi_batch_count(
                        X=X,
                        num_features=num_features,
                        X_transformed=X_transformed)

            # create N=feature_batch_count copies of the original data so several features can be shuffled and scored in a single predict call
            X_raw = pd.concat([X.copy() for _ in range(feature_batch_count)],
                              ignore_index=True,
                              sort=False).reset_index(drop=True)

        row_count = len(X)

        X_shuffled = shuffle_df_rows(X=X, seed=shuffle_repeat)

        for i in range(0, num_features, feature_batch_count):
            parallel_computed_features = features[i:i + feature_batch_count]

            # if this is the final iteration, keep only the needed portion of X_raw
            num_features_processing = len(parallel_computed_features)
            final_iteration = i + num_features_processing == num_features

            row_index = 0
            for feature in parallel_computed_features:
                row_index_end = row_index + row_count
                X_raw.loc[row_index:row_index_end - 1,
                          feature] = X_shuffled[feature].values
                row_index = row_index_end

            if (num_features_processing <
                    feature_batch_count) and final_iteration:
                X_raw_transformed = X_raw.loc[:row_count *
                                              num_features_processing - 1]
                X_raw_transformed = X_raw_transformed if transform_func is None else transform_func(
                    X_raw_transformed, **transform_func_kwargs)
            else:
                X_raw_transformed = X_raw if transform_func is None else transform_func(
                    X_raw, **transform_func_kwargs)
            y_pred = predict_func(X_raw_transformed, **predict_func_kwargs)

            row_index = 0
            for feature in parallel_computed_features:
                # calculating importance score for the given feature
                row_index_end = row_index + row_count
                y_pred_cur = y_pred[row_index:row_index_end]
                score = eval_metric(y, y_pred_cur)
                fi[feature] = score_baseline - score

                # resetting to original values for the processed feature
                X_raw.loc[row_index:row_index_end - 1,
                          feature] = X[feature].values

                row_index = row_index_end
        fi_dict_list.append(fi)
        shuffle_repeats_completed = shuffle_repeat + 1
        if time_limit is not None and shuffle_repeat != (num_shuffle_sets - 1):
            time_now = time.time()
            time_left = time_limit - (time_now - time_start)
            time_permutation_average = (time_now - time_permutation_start) / (
                shuffle_repeat + 1)
            # stop early if the remaining budget likely cannot cover another shuffle set (10% safety buffer on the average per-set runtime)
            if time_left < (time_permutation_average * 1.1):
                log_final_suffix = ' (Early stopping due to lack of time...)'
                break

    # transpose the per-repeat results into a per-feature list of importance samples
    fi_list_dict = dict()
    for val in fi_dict_list:
        for key in val:
            if key not in fi_list_dict:
                fi_list_dict[key] = []
            fi_list_dict[key].append(val[key])
    fi_df = _compute_fi_with_stddev(fi_list_dict,
                                    importance_as_list=importance_as_list)

    if not silent:
        logger.log(
            20,
            f'{log_prefix}\t{round(time.time() - time_start, 2)}s\t= Actual runtime (Completed {shuffle_repeats_completed} of {num_shuffle_sets} shuffle sets){log_final_suffix}'
        )

    return fi_df
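
The 'importance', 'stddev', 'p_value', and 'n' columns returned above are derived from the per-shuffle-set importance samples. The helper `_compute_fi_with_stddev` is not shown here; the following is a minimal sketch of how such a derivation could look for a single feature, using the one-sample, one-sided t-test described in the docstring:

import numpy as np
from scipy import stats


def fi_stats_sketch(importance_samples: list) -> dict:
    # Illustrative only: the real _compute_fi_with_stddev helper may differ.
    # One-sample t-test of H0: importance == 0 vs the one-sided alternative
    # H1: importance > 0, treating each shuffle set's importance estimate
    # as an independent sample.
    n = len(importance_samples)
    mean = float(np.mean(importance_samples))
    stddev = float(np.std(importance_samples, ddof=1)) if n > 1 else float('nan')
    if n > 1 and stddev > 0:
        t_stat = mean / (stddev / np.sqrt(n))
        p_value = float(stats.t.sf(t_stat, df=n - 1))  # upper-tail p-value
    else:
        p_value = float('nan')
    return {'importance': mean, 'stddev': stddev, 'p_value': p_value, 'n': n}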
Example 2
    def compute_permutation_importance(self, X, y, features: list, preprocess=True, silent=False) -> dict:
        time_start = time.time()

        feature_count = len(features)
        if not silent:
            logger.log(20, f'Computing permutation importance for {feature_count} features on {self.name} ...')
        if preprocess:
            X = self.preprocess(X)

        time_start_score = time.time()
        model_score_base = self.score(X=X, y=y, preprocess=False)
        time_score = time.time() - time_start_score

        if not silent:
            time_estimated = (feature_count + 1) * time_score + time_start_score - time_start
            logger.log(20, f'\t{round(time_estimated, 2)}s\t= Expected runtime')

        X_shuffled = shuffle_df_rows(X=X, seed=0)
        row_count = X.shape[0]

        # calculate the maximum number of features that can safely be shuffled and scored in parallel
        X_memory_ratio_max = 0.2
        compute_count_max = 200

        # estimate X's in-memory size from its pickled size, then cap the number of concurrent copies to stay within X_memory_ratio_max of available memory
        X_size_bytes = sys.getsizeof(pickle.dumps(X, protocol=4))
        available_mem = psutil.virtual_memory().available
        X_memory_ratio = X_size_bytes / available_mem

        compute_count_safe = math.floor(X_memory_ratio_max / X_memory_ratio)
        compute_count = max(1, min(compute_count_max, compute_count_safe))
        compute_count = min(compute_count, feature_count)

        # create N=compute_count copies of the original data so several features can be shuffled and scored in a single predict call
        X_raw = pd.concat([X.copy() for _ in range(compute_count)], ignore_index=True, sort=False).reset_index(drop=True)

        #  TODO: Make this faster by multi-threading?
        permutation_importance_dict = {}
        for i in range(0, feature_count, compute_count):
            parallel_computed_features = features[i:i + compute_count]

            # if this is the final iteration, keep only the needed portion of X_raw
            num_features_processing = len(parallel_computed_features)
            final_iteration = i + num_features_processing == feature_count
            if (num_features_processing < compute_count) and final_iteration:
                X_raw = X_raw.loc[:row_count * num_features_processing - 1]

            row_index = 0
            for feature in parallel_computed_features:
                row_index_end = row_index + row_count
                X_raw.loc[row_index:row_index_end - 1, feature] = X_shuffled[feature].values
                row_index = row_index_end

            if self.metric_needs_y_pred:
                y_pred = self.predict(X_raw, preprocess=False)
            else:
                y_pred = self.predict_proba(X_raw, preprocess=False)

            row_index = 0
            for feature in parallel_computed_features:
                # calculating importance score for the given feature
                row_index_end = row_index + row_count
                y_pred_cur = y_pred[row_index:row_index_end]
                score = self.eval_metric(y, y_pred_cur)
                permutation_importance_dict[feature] = model_score_base - score

                if not final_iteration:
                    # resetting to original values for the processed feature
                    X_raw.loc[row_index:row_index_end - 1, feature] = X[feature].values

                row_index = row_index_end

        if not silent:
            logger.log(20, f'\t{round(time.time() - time_start, 2)}s\t= Actual runtime')

        return permutation_importance_dict
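
Both examples batch several shuffled feature copies into one predict call and manage memory around that batching. Stripped of those optimizations, the core technique reduces to the following self-contained sketch (the function and argument names here are illustrative, not part of either API above):

import numpy as np
import pandas as pd


def permutation_importance_sketch(predict, metric, X: pd.DataFrame,
                                  y: pd.Series, seed: int = 0) -> dict:
    # Illustrative names; `predict` maps a DataFrame to predictions and
    # `metric` scores them as metric(y_true, y_pred).
    rng = np.random.default_rng(seed)
    # Baseline score on unperturbed data; importances are drops from this score.
    score_baseline = metric(y, predict(X))
    importances = {}
    for feature in X.columns:
        X_perturbed = X.copy()
        # Shuffle one feature's values across rows, breaking its relationship
        # with the label while preserving its marginal distribution.
        X_perturbed[feature] = rng.permutation(X_perturbed[feature].to_numpy())
        importances[feature] = score_baseline - metric(y, predict(X_perturbed))
    return importances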