Example #1
    def _get_feature_shap_values_per_fold(X,
                                          y,
                                          clf,
                                          train_index,
                                          val_index,
                                          scorer,
                                          verbose=0):
        """
        This function calculates the SHAP values on the validation fold, and the train and validation scores.

        Args:
            X (pd.DataFrame):
                Dataset used in CV.

            y (pd.Series):
                Binary labels for X.

            clf (binary classifier):
                Model to be fitted on the train folds.

            train_index (np.array):
                Positions of train folds samples.

            val_index (np.array):
                Positions of validation fold samples.

            scorer (string, callable or None):
                A string (see sklearn [model scoring](https://scikit-learn.org/stable/modules/model_evaluation.html)) or
                a scorer callable object / function with the signature `scorer(estimator, X, y)`.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only the most important warnings regarding data properties are shown (excluding SHAP warnings)
                - 51 - 100 - shows the most important warnings and prints of the feature removal process
                - above 100 - presents all prints and all warnings (including SHAP warnings).

        Returns:
            (np.array, float, float):
                Tuple with the results: SHAP values on the validation fold, train score, validation score.
        """
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit model with train folds
        clf = clf.fit(X_train, y_train)

        # Score the model
        score_train = scorer(clf, X_train, y_train)
        score_val = scorer(clf, X_val, y_val)

        # Compute SHAP values
        shap_values = shap_calc(clf, X_val, verbose=verbose)
        return shap_values, score_train, score_val
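
A hedged usage sketch (not part of the examples above): driving `_get_feature_shap_values_per_fold` with a scikit-learn CV splitter and a named scorer. It assumes the function and its `shap_calc` dependency are importable in the surrounding module.

    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import get_scorer
    from sklearn.model_selection import KFold

    # Build a small binary-classification dataset as pandas objects,
    # matching the X (pd.DataFrame) and y (pd.Series) contract above.
    X, y = make_classification(n_samples=300, n_features=6, random_state=0)
    X, y = pd.DataFrame(X), pd.Series(y)

    scorer = get_scorer("roc_auc")  # callable with signature scorer(estimator, X, y)
    clf = RandomForestClassifier(random_state=0)

    for train_index, val_index in KFold(n_splits=3, shuffle=True, random_state=0).split(X):
        shap_values, score_train, score_val = _get_feature_shap_values_per_fold(
            X, y, clf, train_index, val_index, scorer=scorer
        )
        print(score_train, score_val)
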
    def _prep_shap_related_variables(
        clf,
        X,
        y,
        approximate=False,
        verbose=0,
        column_names=None,
        class_names=None,
        **shap_kwargs,
    ):
        """
        The function prepares the SHAP-related variables that are used to interpret the model.

        Returns:
            (np.array, int, TreeDependencePlotter):
                Shap values, expected value of the explainer, and fitted TreeDependencePlotter for a given dataset.
        """
        shap_values, explainer = shap_calc(
            clf,
            X,
            approximate=approximate,
            verbose=verbose,
            return_explainer=True,
            **shap_kwargs,
        )

        expected_value = explainer.expected_value

        # For sklearn models the expected value may consist of two elements (one per class);
        # keep the positive-class value.
        if isinstance(expected_value, (list, np.ndarray)):
            expected_value = expected_value[1]

        # Initialize tree dependence plotter
        tdp = TreeDependencePlotter(clf, verbose=verbose).fit(
            X,
            y,
            column_names=column_names,
            class_names=class_names,
            precalc_shap=shap_values,
        )
        return shap_values, expected_value, tdp
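
A minimal, self-contained sketch (not drawn from the library) of why the expected value can be a two-element array for sklearn tree models and how the positive-class entry is selected. It assumes `shap` and scikit-learn are installed; the exact shape of `expected_value` may vary across `shap` versions.

    import numpy as np
    import shap
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    clf = RandomForestClassifier(n_estimators=20, random_state=0).fit(X, y)

    explainer = shap.TreeExplainer(clf)
    expected_value = explainer.expected_value

    # For sklearn classifiers the base value is often one entry per class;
    # keep the positive-class entry, mirroring the snippet above.
    if isinstance(expected_value, (list, np.ndarray)):
        expected_value = expected_value[1]
    print(expected_value)
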
Example #3
    def _get_feature_shap_values_per_fold(self,
                                          X,
                                          y,
                                          clf,
                                          train_index,
                                          val_index,
                                          sample_weight=None,
                                          **shap_kwargs):
        """
        This function calculates the SHAP values on the validation fold, and the train and validation scores.

        Args:
            X (pd.DataFrame):
                Dataset used in CV.

            y (pd.Series):
                Binary labels for X.

            sample_weight (pd.Series, np.ndarray, list, optional):
                array-like of shape (n_samples,) - only use if the model you're using supports
                sample weighting (check the corresponding scikit-learn documentation).
                Array of weights that are assigned to individual samples.
                Note that they're only used for fitting of the model, not during evaluation of metrics.
                If not provided, then each sample is given unit weight.

            clf (binary classifier):
                Model to be fitted on the train folds.

            train_index (np.array):
                Positions of train folds samples.

            val_index (np.array):
                Positions of validation fold samples.

            **shap_kwargs:
                keyword arguments passed to
                [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
                It also enables the `approximate` and `check_additivity` parameters, passed when calculating SHAP values.
                Setting `approximate=True` gives a less accurate but faster SHAP value calculation, while
                `check_additivity=False` disables the additivity check inside SHAP.

        Returns:
            (np.array, float, float):
                Tuple with the results: SHAP values on the validation fold, train score, validation score.
        """
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        if sample_weight is not None:
            clf = clf.fit(
                X_train,
                y_train,
                sample_weight=sample_weight.iloc[train_index],
                eval_set=[(X_val, y_val)],
                eval_sample_weight=[sample_weight.iloc[val_index]],
                early_stopping_rounds=self.early_stopping_rounds,
                eval_metric=self.eval_metric,
            )
        else:
            clf = clf.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=self.early_stopping_rounds,
                eval_metric=self.eval_metric,
            )
        # Score the model
        score_train = self.scorer.scorer(clf, X_train, y_train)
        score_val = self.scorer.scorer(clf, X_val, y_val)

        # Compute SHAP values
        shap_values = shap_calc(clf,
                                X_val,
                                verbose=self.verbose,
                                **shap_kwargs)
        return shap_values, score_train, score_val
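
The method above reads several attributes from its owning object (`self.scorer.scorer`, `self.verbose`, `self.early_stopping_rounds`, `self.eval_metric`). Below is a minimal, hypothetical attribute holder, not the library's actual class, sketched only to make that calling convention concrete; the early-stopping keyword arguments in `fit` assume an estimator (for example, older LightGBM releases) whose `fit` accepts `eval_set`, `eval_sample_weight`, `eval_metric` and `early_stopping_rounds`.

    from dataclasses import dataclass, field
    from typing import Callable

    from sklearn.metrics import get_scorer


    @dataclass
    class _ScorerHolder:
        # Hypothetical wrapper exposing a sklearn scorer under `.scorer`,
        # matching the `self.scorer.scorer(clf, X, y)` calls above.
        scorer: Callable


    @dataclass
    class _FoldRunner:
        # Hypothetical holder for the attributes read by the method above;
        # `_get_feature_shap_values_per_fold` would be defined on this class.
        scorer: _ScorerHolder = field(default_factory=lambda: _ScorerHolder(get_scorer("roc_auc")))
        verbose: int = 0
        early_stopping_rounds: int = 20
        eval_metric: str = "auc"
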
    def _get_feature_shap_values_per_fold(X,
                                          y,
                                          clf,
                                          train_index,
                                          val_index,
                                          scorer,
                                          verbose=0,
                                          **shap_kwargs):
        """
        This function calculates the SHAP values on the validation fold, and the train and validation scores.

        Args:
            X (pd.DataFrame):
                Dataset used in CV.

            y (pd.Series):
                Binary labels for X.

            clf (binary classifier):
                Model to be fitted on the train folds.

            train_index (np.array):
                Positions of train folds samples.

            val_index (np.array):
                Positions of validation fold samples.

            scorer (string, callable or None):
                A string (see sklearn [model scoring](https://scikit-learn.org/stable/modules/model_evaluation.html)) or
                a scorer callable object / function with the signature `scorer(estimator, X, y)`.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only the most important warnings regarding data properties are shown (excluding SHAP warnings)
                - 51 - 100 - shows the most important warnings and prints of the feature removal process
                - above 100 - presents all prints and all warnings (including SHAP warnings).

            **shap_kwargs:
                keyword arguments passed to
                [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
                It also enables the `approximate` and `check_additivity` parameters, passed when calculating SHAP values.
                Setting `approximate=True` gives a less accurate but faster SHAP value calculation, while
                `check_additivity=False` disables the additivity check inside SHAP.

        Returns:
            (np.array, float, float):
                Tuple with the results: SHAP values on the validation fold, train score, validation score.
        """
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit model with train folds
        clf = clf.fit(X_train, y_train)

        # Score the model
        score_train = scorer(clf, X_train, y_train)
        score_val = scorer(clf, X_val, y_val)

        # Compute SHAP values
        shap_values = shap_calc(clf, X_val, verbose=verbose, **shap_kwargs)
        return shap_values, score_train, score_val
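
A hedged sketch of the `**shap_kwargs` pass-through documented above: extra keyword arguments such as `approximate` and `check_additivity` are forwarded to the SHAP calculation on the validation fold. It reuses the hypothetical setup from the first sketch and assumes the estimator is a tree model for which approximate SHAP values are supported.

    # Reusing X, y, clf, scorer, train_index and val_index from the earlier sketch:
    shap_values, score_train, score_val = _get_feature_shap_values_per_fold(
        X, y, clf, train_index, val_index,
        scorer=scorer,
        verbose=0,
        approximate=True,        # faster, less accurate SHAP values
        check_additivity=False,  # skip SHAP's additivity check
    )
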