Example #1
0
    def shap(self, X, plot=False, plot_type='bar'):
        """Compute SHAP values for ``X`` and summarise them per feature.

        Args:
            X (:obj:`pd.DataFrame`, :obj:`pd.Series`): Data for shap values calculation.
                A ``Series`` is converted to a one-row frame.
            plot (:obj:`boolean`, optional): Whether to plot a graph.
            plot_type (:obj:`str`, optional): Type of feature importance graph, takes value in ['dot', 'bar'].

        Returns:
            dict mapping ``'Intercept'`` to the explainer's expected value and every
            column of ``X`` to the mean SHAP value of that column over the rows of ``X``.
        """
        explainer = TreeExplainer(self.model)
        X = DataFrame(X).T if isinstance(X, Series) else X
        shap_values = explainer.shap_values(X)

        # Decide ONCE, before unwrapping, whether the explainer returned a
        # two-element list (presumably one array per class of a binary model).
        # Re-testing after ``shap_values`` has been replaced by its first
        # element can never succeed, which left the intercept holding the whole
        # per-class expected-value array instead of the scalar for element 0.
        is_pair = isinstance(shap_values, list) and len(shap_values) == 2
        if is_pair:
            shap_values = shap_values[0]
            # Keep the intercept consistent with the element selected above.
            expected_value = [float(explainer.expected_value[0])]
        else:
            expected_value = [explainer.expected_value]

        variables = ['Intercept'] + list(X.columns)
        mean_shap = expected_value + shap_values.mean(axis=0).tolist()

        if plot:
            summary_plot(shap_values, X, plot_type=plot_type)
        return dict(zip(variables, mean_shap))
Example #2
0
    def evaluate(
        self,
        study: Study,
        params: Optional[List[str]] = None,
        *,
        target: Optional[Callable[[FrozenTrial], float]] = None,
    ) -> Dict[str, float]:
        """Rank parameters by their mean absolute SHAP value.

        The backend evaluator is run first (it fits the forest and stores the
        transformed parameters); a ``TreeExplainer`` built on that forest then
        produces the SHAP values that are averaged per parameter.

        Returns:
            An ``OrderedDict`` of parameter name -> mean absolute SHAP value,
            ordered from the largest value to the smallest.
        """
        backend = self._backend_evaluator

        # Fitting happens inside the backend evaluator.
        backend.evaluate(study=study, params=params, target=target)

        # The explainer is kept on the instance, as before.
        self._explainer = TreeExplainer(backend._forest)
        shap_values = self._explainer.shap_values(backend._trans_params)

        # One importance score per column: mean of |SHAP| over all rows.
        scores = np.abs(shap_values).mean(axis=0)
        ranked = sorted(
            zip(backend._param_names, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return OrderedDict(ranked)
Example #3
0
def plot_shap(model, test, instance=None, feature=None, dataset=False):
    """
    Draw SHAP plots for a fitted model using a ``TreeExplainer``.

    Each of the three optional views is drawn independently of the others.

    :param model: fitted model handed to ``TreeExplainer`` (not every model type is supported).
    :param test: test dataset; SHAP values are computed for all of it.
    :param instance: positional row of ``test`` to explain with a force plot. default_value=None
    :param feature: feature of ``test`` to explain with a dependence plot. default_value=None
    :param dataset: if True, draw the summary plots for the whole dataset. default_value=False
    :return: None
    """
    tree_explainer = TreeExplainer(model)
    contributions = tree_explainer.shap_values(test)
    initjs()

    # Single prediction.
    if instance is not None:
        force_plot(
            tree_explainer.expected_value,
            contributions[instance, :],
            test.iloc[instance, :],
            matplotlib=True,
        )

    # Single feature.
    if feature is not None:
        _, axis = plt.subplots(figsize=(13, 10))
        dependence_plot(feature, contributions, test, ax=axis)

    # Whole dataset: default summary plot first, then the bar variant.
    if dataset:
        size = (8, 8)
        summary_plot(contributions, test, plot_size=size)
        summary_plot(contributions, test, plot_type="bar", plot_size=size)
Example #4
0
    def fit(self,
            explainer,
            new_observation,
            shap_explainer_type=None,
            **kwargs):
        """Calculate the result of explanation

        Fit method makes calculations in place and changes the attributes
        (`shap_explainer`, `result`, `new_observation`, `shap_explainer_type`).

        Parameters
        -----------
        explainer : Explainer object
            Model wrapper created using the Explainer class.
        new_observation : pd.Series or np.ndarray
            An observation for which a prediction needs to be explained.
        shap_explainer_type : {'TreeExplainer', 'DeepExplainer', 'GradientExplainer', 'LinearExplainer', 'KernelExplainer'}
            String name of the Explainer class (default is `None`, which automatically
            chooses an Explainer to use).
        kwargs: dict
            Keyword parameters passed to the `shap_values` method of the
            chosen shap explainer.

        Returns
        -----------
        None
        """
        from shap import TreeExplainer, DeepExplainer, GradientExplainer, LinearExplainer, KernelExplainer

        checks.check_compatibility(explainer)
        shap_explainer_type = checks.check_shap_explainer_type(
            shap_explainer_type, explainer.model)

        if self.type == 'predict_parts':
            new_observation = checks.check_new_observation_predict_parts(
                new_observation, explainer)

        if shap_explainer_type == "TreeExplainer":
            try:
                self.shap_explainer = TreeExplainer(explainer.model,
                                                    explainer.data.values)
            # Fall back to an explainer without background data when the
            # construction above fails; see
            # https://github.com/ModelOriented/DALEX/issues/371
            # `Exception` (not a bare `except`) so that KeyboardInterrupt and
            # SystemExit still propagate.
            except Exception:
                self.shap_explainer = TreeExplainer(explainer.model)
        elif shap_explainer_type == "DeepExplainer":
            self.shap_explainer = DeepExplainer(explainer.model,
                                                explainer.data.values)
        elif shap_explainer_type == "GradientExplainer":
            self.shap_explainer = GradientExplainer(explainer.model,
                                                    explainer.data.values)
        elif shap_explainer_type == "LinearExplainer":
            self.shap_explainer = LinearExplainer(explainer.model,
                                                  explainer.data.values)
        elif shap_explainer_type == "KernelExplainer":
            self.shap_explainer = KernelExplainer(
                lambda x: explainer.predict(x), explainer.data.values)

        self.result = self.shap_explainer.shap_values(new_observation.values,
                                                      **kwargs)
        self.new_observation = new_observation
        self.shap_explainer_type = shap_explainer_type
Example #5
0
def test_shap_sklearn_classifier(iris_X, iris_y):
    """Smoke test: explain a fitted RandomForestClassifier, passing the data as background."""
    from shap import TreeExplainer

    model = RandomForestClassifier()
    model.fit(iris_X, iris_y)

    tree_explainer = TreeExplainer(model=model, data=iris_X)
    # Additivity check is switched off for this variant.
    print(tree_explainer.shap_values(iris_X, check_additivity=False))
Example #6
0
def test_shap_sklearn_regressor(boston_X, boston_y):
    """Smoke test: explain a fitted RandomForestRegressor, passing the data as background."""
    from shap import TreeExplainer

    model = RandomForestRegressor()
    model.fit(boston_X, boston_y)

    tree_explainer = TreeExplainer(model=model, data=boston_X)
    # Additivity check is switched off for this variant.
    print(tree_explainer.shap_values(boston_X, check_additivity=False))
Example #7
0
def test_shap_sklearn_classifier(iris_X, iris_y):
    """Smoke test: explain a fitted RandomForestClassifier without background data."""
    from shap import TreeExplainer

    model = RandomForestClassifier()
    model.fit(iris_X, iris_y)

    tree_explainer = TreeExplainer(model=model)
    print(tree_explainer.shap_values(iris_X))
Example #8
0
def test_shap_sklearn_regressor(boston_X, boston_y):
    """Smoke test: explain a fitted RandomForestRegressor without background data."""
    from shap import TreeExplainer

    model = RandomForestRegressor()
    model.fit(boston_X, boston_y)

    tree_explainer = TreeExplainer(model=model)
    print(tree_explainer.shap_values(boston_X))
Example #9
0
 def __init__(self, hypergbm_estimator, data=None):
     """Wrap ``hypergbm_estimator`` in a shap ``TreeExplainer``.

     ``data``, when given, is passed through the estimator's
     ``transform_data`` before being handed to the explainer; otherwise
     the explainer receives ``None``.

     Raises:
         RuntimeError: if the module-level ``has_shap`` flag is falsy.
     """
     if not has_shap:
         raise RuntimeError(
             'Please install `shap` package first. command: pip install shap'
         )
     self.hypergbm_estimator = hypergbm_estimator
     background = (
         None if data is None else hypergbm_estimator.transform_data(data)
     )
     self.explainer = TreeExplainer(hypergbm_estimator.estimator, background)
Example #10
0
def test_shap_classifier(iris_X, iris_y):
    """Smoke test: explain a fitted GRFForestClassifier, passing the data as background."""
    from shap import TreeExplainer

    model = GRFForestClassifier(enable_tree_details=True)
    model.fit(iris_X, iris_y)

    # Only the explainer construction runs inside the patch context.
    with shap_patch():
        tree_explainer = TreeExplainer(model=model, data=iris_X)
    print(tree_explainer.shap_values(iris_X, check_additivity=False))
Example #11
0
def test_shap_regressor(boston_X, boston_y):
    """Smoke test: explain a fitted GRFForestRegressor, passing the data as background."""
    from shap import TreeExplainer

    model = GRFForestRegressor(enable_tree_details=True)
    model.fit(boston_X, boston_y)

    # Only the explainer construction runs inside the patch context.
    with shap_patch():
        tree_explainer = TreeExplainer(model=model, data=boston_X)
    print(tree_explainer.shap_values(boston_X, check_additivity=False))
Example #12
0
def test_shap_classifier(iris_X, iris_y):
    """Smoke test: explain a fitted RangerForestClassifier without background data."""
    from shap import TreeExplainer

    model = RangerForestClassifier(enable_tree_details=True)
    model.fit(iris_X, iris_y)

    # Only the explainer construction runs inside the patch context.
    with shap_patch():
        tree_explainer = TreeExplainer(model=model)
    print(tree_explainer.shap_values(iris_X))
Example #13
0
def test_shap_regressor(boston_X, boston_y):
    """Smoke test: explain a fitted RangerForestRegressor without background data."""
    from shap import TreeExplainer

    model = RangerForestRegressor(enable_tree_details=True)
    model.fit(boston_X, boston_y)

    # Only the explainer construction runs inside the patch context.
    with shap_patch():
        tree_explainer = TreeExplainer(model=model)
    print(tree_explainer.shap_values(boston_X))
Example #14
0
def _explain_trees(
    model: Model,
    transformed_data: Table,
    transformed_reference_data: Table,
    progress_callback: Callable,
) -> Tuple[
    Optional[List[np.ndarray]], Optional[np.ndarray], Optional[np.ndarray]
]:
    """
    Compute SHAP values with TreeExplainer (used for tree-based scikit-learn
    models).

    Returns ``(shap_values, sample_mask, base_value)``. When TreeExplainer
    cannot be used (sparse input, or constructing the explainer raises) all
    three elements are ``None``.
    """
    # TreeExplainer does not support sparse data; KernelExplainer can handle it.
    if sparse.issparse(transformed_data.X):
        return None, None, None

    try:
        background = sample(transformed_reference_data.X, 100)
        explainer = TreeExplainer(model.skl_model, data=background)
    # Deliberately broad: this is what TreeExplainer throws.
    except Exception:
        return None, None, None

    # TreeExplainer cannot explain more than 1000 cases in reasonable time.
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)

    class_var = model.domain.class_var
    num_classes = len(class_var.values) if class_var.is_discrete else None

    # Explained in batches; the size is currently 1 to minimize widget
    # blocking (the original note says one-at-a-time roughly doubles the
    # processing time compared to a batch size of 10).
    batch_size = 1
    n_rows = len(data_sample)
    per_batch = []
    for start in range(0, n_rows, batch_size):
        progress_callback(start / n_rows)
        rows = data_sample.X[start : start + batch_size]
        per_batch.append(explainer.shap_values(rows, check_additivity=False))

    shap_values = _join_shap_values(per_batch)
    base_value = explainer.expected_value

    # When a class value was missing during training, skl_model does not
    # output a probability for it, so pad with zeros up to the number of
    # classes in the domain. For other models this is handled by Orange.
    if num_classes is not None:
        missing_d = num_classes - len(shap_values)
        shap_values += [
            np.zeros(shap_values[0].shape) for _ in range(missing_d)
        ]
        base_value = np.hstack((base_value, np.zeros(missing_d)))

    return shap_values, sample_mask, base_value
Example #15
0
    def evaluate(
        self,
        study: Study,
        params: Optional[List[str]] = None,
        *,
        target: Optional[Callable[[FrozenTrial], float]] = None,
    ) -> Dict[str, float]:
        """Compute parameter importances as mean absolute SHAP values.

        The forest held in ``self._forest`` is fitted on the transformed
        parameters of the filtered trials, explained with ``TreeExplainer``,
        and the SHAP values of encoded columns are summed back onto the
        parameter each column belongs to.

        Raises:
            ValueError: if ``target`` is ``None`` for a multi-objective study.
        """
        needs_target = target is None and study._is_multi_objective()
        if needs_target:
            raise ValueError(
                "If the `study` is being used for multi-objective optimization, "
                "please specify the `target`. For example, use "
                "`target=lambda t: t.values[0]` for the first objective value."
            )

        distributions = _get_distributions(study, params=params)
        params = list(distributions.keys()) if params is None else params
        assert params is not None
        if len(params) == 0:
            return OrderedDict()

        trials: List[FrozenTrial] = _get_filtered_trials(study, params=params, target=target)
        trans = _SearchSpaceTransform(distributions, transform_log=False, transform_step=False)
        features: np.ndarray = _get_trans_params(trials, trans)
        objective: np.ndarray = _get_target_values(trials, target)

        self._forest.fit(X=features, y=objective)

        # SHAP values come back per encoded column of the transformed space.
        column_shap: np.ndarray = TreeExplainer(self._forest).shap_values(features)

        # Accumulate encoded columns onto their source parameter.
        per_param = np.zeros((len(trials), len(params)))
        np.add.at(per_param.T, trans.encoded_column_to_column, column_shap.T)

        importances = np.abs(per_param).mean(axis=0)
        as_dict = _param_importances_to_dict(params, importances)
        return _sort_dict_by_importance(as_dict)
Example #16
0
class HyperGBMExplainer:
    """shap ``TreeExplainer`` wrapper that first runs every input through
    the wrapped estimator's ``transform_data``."""

    def __init__(self, hypergbm_estimator, data=None):
        """Build the explainer; ``data`` (if given) is transformed first.

        Raises:
            RuntimeError: if the module-level ``has_shap`` flag is falsy.
        """
        if not has_shap:
            raise RuntimeError(
                'Please install `shap` package first. command: pip install shap'
            )
        self.hypergbm_estimator = hypergbm_estimator
        background = (
            None if data is None else hypergbm_estimator.transform_data(data)
        )
        self.explainer = TreeExplainer(hypergbm_estimator.estimator, background)

    @property
    def expected_value(self):
        """Expected value reported by the underlying explainer."""
        return self.explainer.expected_value

    def shap_values(self,
                    X,
                    y=None,
                    tree_limit=None,
                    approximate=False,
                    check_additivity=True,
                    from_call=False,
                    **kwargs):
        """Transform ``X`` (``kwargs`` go to ``transform_data``) and return
        the explainer's SHAP values for it."""
        transformed = self.hypergbm_estimator.transform_data(X, **kwargs)
        explainer_options = dict(
            tree_limit=tree_limit,
            approximate=approximate,
            check_additivity=check_additivity,
            from_call=from_call,
        )
        return self.explainer.shap_values(transformed, y, **explainer_options)

    def shap_interaction_values(self, X, y=None, tree_limit=None, **kwargs):
        """Transform ``X`` (``kwargs`` go to ``transform_data``) and return
        the explainer's SHAP interaction values for it."""
        transformed = self.hypergbm_estimator.transform_data(X, **kwargs)
        return self.explainer.shap_interaction_values(transformed, y, tree_limit)

    def transform_data(self, X, **kwargs):
        """Delegate to the wrapped estimator's ``transform_data``."""
        return self.hypergbm_estimator.transform_data(X, **kwargs)
    def model_interpretation(self, patient_id, patient_preprocessed, pred,
                             prob, model):
        """
        Draw the evaluation plots of the model for one patient.

        Four PNG files are written under ``app/ai_models/temp/`` (probability
        curve, SHAP waterfall, feature distributions, PCA map) and the URLs
        under which they are served are returned.

        Arguments:
            patient_id = string identifying the patient
            patient_preprocessed = dictionary holding the patient's exam data
            pred = class predicted by the model (0 or 1)
            prob = probability of the class predicted by the model
            model = model object

        Returns:
            dict with the prediction, the probability as a percentage string
            and the four plot URLs.

        Raises:
            ValueError: if ``pred`` is neither 0 nor 1. Previously this
                surfaced later as an unbound-variable error.
        """
        # Every plot below picks a curve based on pred being 0 or 1.
        if pred not in (0, 1):
            raise ValueError('pred must be 0 or 1, got {!r}'.format(pred))

        #### Plot file names
        pid = str(patient_id)
        temp_dir = 'app/ai_models/temp/'
        plot_1_name = temp_dir + 'probacurve-' + pid + '.png'
        plot_2_name = temp_dir + 'shap-' + pid + '.png'
        plot_3_name = temp_dir + 'dist-' + pid + '.png'
        plot_4_name = temp_dir + 'mapa-' + pid + '.png'

        # API URLs of the plots
        media_url = "http://" + self.IP + ":" + self.API_PORT + "/api/media/"
        plot_1_api = media_url + "probacurve-" + pid + ".png"
        plot_2_api = media_url + "shap-" + pid + ".png"
        plot_3_api = media_url + "dist-" + pid + ".png"
        plot_4_api = media_url + "mapa-" + pid + ".png"

        #### General matplotlib settings
        DPI_IMAGES = 100
        FONT_SIZE = 8
        FONT_NAME = 'sans-serif'
        plt.rc('font', family=FONT_NAME, size=FONT_SIZE)
        plt.rc('axes', titlesize=FONT_SIZE, labelsize=FONT_SIZE)
        plt.rc('xtick', labelsize=FONT_SIZE)
        plt.rc('ytick', labelsize=FONT_SIZE)
        plt.rc('legend', fontsize=FONT_SIZE)

        #### PLOT 1 - Distribution of the probability given by the model
        # Needs: self.probs_df (imported csv) and pred
        exame_resp = pred
        exame_prob = prob
        # Plot
        fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
        sns.kdeplot(self.probs_df['prob_neg'],
                    shade=True,
                    color='#386796',
                    ax=axis,
                    linestyle="--",
                    label='Casos Negativos')
        sns.kdeplot(self.probs_df['prob_pos'],
                    shade=True,
                    color='#F06C61',
                    ax=axis,
                    label='Casos positivos')
        # Take the XY data of the curve matching the predicted class
        # (line 0 = negative curve, line 1 = positive curve) to interpolate on.
        if exame_resp == 0:
            xi = 1 - exame_prob
            data_x, data_y = axis.lines[0].get_data()
        else:
            xi = exame_prob
            data_x, data_y = axis.lines[1].get_data()
        # Interpolate and plot the patient marker on the curve
        yi = np.interp(xi, data_x, data_y)
        axis.plot([xi], [yi],
                  linestyle='None',
                  marker="*",
                  color='black',
                  markersize=10,
                  label='Paciente')
        # Remaining plot settings
        axis.legend(loc="upper right")
        axis.set_xlim([0, 1])
        axis.set_ylim([0, axis.get_ylim()[1]])
        plt.tight_layout()
        # Save plot 1
        plt.savefig(plot_1_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close()

        #### PLOT 2 - SHAP
        # Needs: patient_preprocessed, pred and model
        features = np.array(list(patient_preprocessed.keys()))
        sample_x = np.array(list(patient_preprocessed.values()))
        # SHAP values are computed once (the original computed them twice).
        explainer = TreeExplainer(model=model)
        shap_values_sample = explainer.shap_values(sample_x)
        # Baseline for the class predicted by the model
        expected_value = explainer.expected_value[exame_resp]
        # Plot
        waterfall_plot(expected_value,
                       shap_values_sample[exame_resp],
                       sample_x,
                       feature_names=features,
                       max_display=20,
                       show=False)
        # Save image
        plt.tight_layout()
        plt.savefig(plot_2_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()

        #### PLOT 3 - Distribution of the most important variables for the model
        # Needs: self.train_df (imported csv), patient_preprocessed, pred
        important_features = [
            'Leucócitos', 'Plaquetas', 'Hemácias', 'Eosinófilos'
        ]
        target_0 = self.train_df[self.train_df['target'] == 0][
            important_features]
        target_1 = self.train_df[self.train_df['target'] == 1][
            important_features]
        # Plot
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
        # axes.flat walks the 2x2 grid row by row: (0,0), (0,1), (1,0), (1,1)
        for feat, feat_axis in zip(important_features, axes.flat):
            # Distribution plot
            sns.kdeplot(list(target_0[feat]),
                        shade=True,
                        color='#386796',
                        ax=feat_axis,
                        label='Casos Negativos',
                        linestyle="--")
            sns.kdeplot(list(target_1[feat]),
                        shade=True,
                        color='#F06C61',
                        ax=feat_axis,
                        label='Casos positivos')
            # Density curve matching the model result
            if pred == 0:
                data_x, data_y = feat_axis.lines[0].get_data()
            else:
                data_x, data_y = feat_axis.lines[1].get_data()
            # Patient's value for this important variable
            xi = patient_preprocessed[feat]
            yi = np.interp(xi, data_x, data_y)
            # Plot the point on the curve
            feat_axis.plot([xi], [yi],
                           linestyle='None',
                           marker="*",
                           color='black',
                           markersize=10,
                           label='Paciente')
            feat_axis.set_title(feat)
            feat_axis.legend(loc="upper right")
            feat_axis.set_ylim([0, feat_axis.get_ylim()[1]])
        # Tidy the plot
        plt.tight_layout()
        # Save image
        plt.savefig(plot_3_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0.1)
        plt.close()

        #### PLOT 4 - Map of the patients built with SVD
        # Needs: train_df (imported csv), patient_preprocessed
        amostra = pd.DataFrame(patient_preprocessed, index=[
            0,
        ]).drop(axis=1, columns=['Outra gripe'])

        # PCA with SVD (original note: via the prince package)
        y_train = self.train_df['target']  # Keep the target column
        dados = self.train_df.drop(
            axis=1, columns=['Outra gripe',
                             'target']).copy()  # Dataset used to build the map
        pca_obj = PCA(n_components=2, random_state=42)  # PCA object
        pca_obj.fit(dados)  # Fit on the dataset
        componentes = pca_obj.transform(
            dados)  # Principal components of the data
        transf = pca_obj.transform(amostra)  # Project the patient with the PCA
        xi = transf.loc[0, 0]  # Patient X coordinate for the plot
        yi = transf.loc[0, 1]  # Patient Y coordinate for the plot
        comp = pd.DataFrame()  # Dataframe holding the components
        comp['C1'] = componentes[0]  # Principal component 1
        comp['C2'] = componentes[1]  # Principal component 2
        comp['TG'] = y_train  # Target variable used as mask
        comp_0 = comp[comp['TG'] == 0][['C1', 'C2']]  # Components of negatives
        comp_1 = comp[comp['TG'] == 1][['C1', 'C2']]  # Components of positives
        # Plot
        fig, ax = plt.subplots(figsize=(8, 8))
        plt.margins(0, 0)
        sns.scatterplot(ax=ax,
                        data=comp_0,
                        x='C1',
                        y='C2',
                        color='#386796',
                        label='Casos Negativos')
        sns.scatterplot(ax=ax,
                        data=comp_1,
                        x='C1',
                        y='C2',
                        color='#F06C61',
                        label='Casos Positivos')
        x_mean, y_mean, width, height, angle = self.build_ellipse(
            comp_0['C1'], comp_0['C2'])
        ax.add_patch(
            Ellipse((x_mean, y_mean),
                    width,
                    height,
                    angle=angle,
                    linewidth=2,
                    color='#386796',
                    fill=True,
                    alpha=0.2))
        x_mean, y_mean, width, height, angle = self.build_ellipse(
            comp_1['C1'], comp_1['C2'])
        ax.add_patch(
            Ellipse((x_mean, y_mean),
                    width,
                    height,
                    angle=angle,
                    linewidth=2,
                    color='#F06C61',
                    fill=True,
                    alpha=0.2))
        ax.plot([xi], [yi],
                linestyle='None',
                marker="*",
                color='black',
                markersize=10,
                label='Paciente')
        # Plot settings
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_ylabel('')
        ax.set_xlabel('')
        # Legend entries sorted alphabetically by label
        handles, labels = ax.get_legend_handles_labels()
        labels, handles = zip(
            *sorted(zip(labels, handles), key=lambda t: t[0]))
        ax.legend(handles, labels, loc="upper right")
        # Save image
        plt.axis('off')
        plt.savefig(plot_4_name,
                    dpi=DPI_IMAGES,
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()

        # Return
        model_result = {
            'prediction': pred,
            'probability': str(round(prob * 100, 2)),
            'probacurve': plot_1_api,
            'shap_img': plot_2_api,
            'dist_img': plot_3_api,
            'mapa_img': plot_4_api
        }
        return model_result
        """
Example #18
0
class ShapleyImportanceEvaluator(BaseImportanceEvaluator):
    """Parameter importance evaluator based on Shapley (SHAP) values.

    A random forest surrogate is trained through
    ``MeanDecreaseImpurityImportanceEvaluator`` and the importance of each
    parameter is reported as the mean absolute SHAP value that a
    ``TreeExplainer`` computes for it.

    .. note::

        This evaluator requires the `sklearn <https://scikit-learn.org/stable/>`_ Python package
        and `SHAP <https://shap.readthedocs.io/en/stable/index.html>`_.

    Args:
        n_trees:
            Number of trees in the random forest.
        max_depth:
            The maximum depth of each tree in the random forest.
        seed:
            Seed for the random forest.
    """

    def __init__(
        self, *, n_trees: int = 64, max_depth: int = 64, seed: Optional[int] = None
    ) -> None:
        _imports.check()

        # The explainer can only be built once a forest exists, i.e. in ``evaluate``.
        self._explainer: TreeExplainer = None
        # The backend evaluator owns the random forest used as surrogate model.
        self._backend_evaluator = MeanDecreaseImpurityImportanceEvaluator(
            n_trees=n_trees, max_depth=max_depth, seed=seed
        )

    def evaluate(
        self,
        study: Study,
        params: Optional[List[str]] = None,
        *,
        target: Optional[Callable[[FrozenTrial], float]] = None,
    ) -> Dict[str, float]:
        """Return the mean absolute SHAP value of every parameter.

        Args:
            study: Forwarded to the backend evaluator.
            params: Forwarded to the backend evaluator.
            target: Forwarded to the backend evaluator.

        Returns:
            An ordered mapping from parameter name to importance, sorted from
            the most to the least important parameter.
        """
        backend = self._backend_evaluator

        # Running the backend evaluation fits the forest that is explained below.
        backend.evaluate(study=study, params=params, target=target)

        # Explain the fitted forest on the transformed trial parameters.
        self._explainer = TreeExplainer(backend._forest)
        shap_values = self._explainer.shap_values(backend._trans_params)

        # One importance per parameter: mean of |SHAP| over the first axis.
        importance_per_param = np.abs(shap_values).mean(axis=0)
        ranked = sorted(
            zip(backend._param_names, importance_per_param),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return OrderedDict(ranked)
Example #19
0
    def cross_val(self, X, y, scoring=None, cv=None, **kwargs):
        """Cross-validate the model and summarise metrics and mean SHAP values.

        Metrics and mean SHAP values are computed for ``self.model`` on the
        data passed in (the ``Overall`` column) and combined with the
        per-fold results obtained from ``self._cross_val``.

        Args:
            X (:obj:`pd.DataFrame`): Training data. Its column names label the rows of the
             SHAP table.
            y (:obj:`pd.DataFrame`, :obj:`pd.Series`): Training target values.
            scoring (:obj:`callable`, :obj:`list`, :obj:`tuple`, :obj:`str`, optional): A metric
             callable ``f(y_true, y_pred)``, a list/tuple of such callables, or the name of a scorer
             in ``SCORERS``. Defaults to ``mean_squared_error``.
            cv (:obj:`int, cross-validation generator or an iterable`, optional): Cross-validation
             strategy, forwarded unchanged to ``self._cross_val``.
            **kwargs: Other parameters forwarded to ``self._cross_val``.

        Returns:
            pd.DataFrame, pd.DataFrame: DataFrame with metrics on folds, DataFrame with shap values on folds.

        Raises:
            ValueError: If ``scoring`` is a string that is not a key of ``SCORERS``.
            NotImplementedError: If ``scoring`` is not a callable, list, tuple or string.
        """
        def _mean_shap_column(model):
            # Intercept followed by the mean SHAP value of every feature over X.
            explainer = TreeExplainer(model)
            # expected_value is not always an ndarray: a plain scalar has no
            # list-producing ``.tolist()``. ``array(...).reshape(-1)`` yields a
            # flat list for None, scalars and arrays alike.
            intercept = array(explainer.expected_value).reshape(-1).tolist()
            return intercept + explainer.shap_values(X).mean(axis=0).tolist()

        scoring = mean_squared_error if scoring is None else scoring
        models, metrics = self._cross_val(X,
                                          y,
                                          scoring=scoring,
                                          cv=cv,
                                          **kwargs)

        # Score ``self.model`` on the full data; this becomes the 'Overall' column.
        if callable(scoring):
            scorers = {
                scoring.__name__.replace('_', ' '):
                array([scoring(y, self.model.predict(X))])
            }
        elif isinstance(scoring, (tuple, list)):
            scorers = {
                scorer.__name__.replace('_', ' '):
                array([scorer(y, self.model.predict(X))])
                for scorer in scoring
            }
        elif isinstance(scoring, str):
            if scoring not in SCORERS:
                raise ValueError(f'Scorer {scoring} is not supported.')
            scorers = {
                scoring.replace('_', ' '):
                array([SCORERS[scoring](self.model, X=X, y=y)])
            }
        else:
            raise NotImplementedError(
                f'Scoring of type {scoring} is not supported')

        # Prepend the overall score to the scores returned by ``_cross_val``.
        metrics = DataFrame({
            key: concatenate((overall, metrics[key]))
            for key, overall in scorers.items()
        }).T
        metrics.columns = [
            f'Fold {i}' if i != 0 else 'Overall'
            for i in range(metrics.shape[1])
        ]

        # First column: the model held by this object; then one per fold model.
        shap_coefs = [_mean_shap_column(self.model)]
        shap_coefs.extend(_mean_shap_column(model) for model in models)

        shapdf = DataFrame(array(shap_coefs).T,
                           columns=['Overall'] +
                           [f'Fold {x}' for x in range(1,
                                                       len(models) + 1)],
                           index=['Intercept'] + X.columns.tolist())
        return metrics, shapdf
Example #20
0
    def shap_explain(self,
                     data,
                     index=None,
                     link=None,
                     show=True,
                     layout_dict=None):
        """Method for plotting a waterfall graph or return corresponding JSON if show=False.

        Args:
            data (:obj:`pd.DataFrame`, :obj:`pd.Series`): Data for shap values calculation. After the
             optional ``index`` selection it must hold exactly one observation.
            index (:obj:`int`, optional): Index label of the observation of interest, if data is
             pd.DataFrame. Ignored when data is a pd.Series.
            link (:obj:`callable`, optional): A function for transforming shap values into predictions.
             Unnecessary if self.objective is present and it takes values in ['binary', 'poisson', 'gamma'].
            show (:obj:`boolean`, optional): Whether to plot a graph or return a json.
            layout_dict (:obj:`dict`, optional): Dictionary containing the parameters of plotly figure layout.

        Returns:
            None or dict: None after showing the waterfall graph when ``show`` is true; otherwise a
            dict holding the per-feature values, the observation id, the final prediction and the
            base64-encoded JPEG of the graph.

        Raises:
            NotImplementedError: If ``self.model`` is not one of the supported XGBoost, LightGBM
             or CatBoost estimators.
        """
        def sigmoid(x):
            # Logistic function 1 / (1 + exp(-x)), used as link for 'binary'.
            return true_divide(1, add(1, exp(-x)))

        explainer = TreeExplainer(self.model)
        if isinstance(self.model, (XGBClassifier, XGBRegressor)):
            feature_names = self.model.get_booster().feature_names
        elif isinstance(self.model, (LGBMClassifier, LGBMRegressor)):
            feature_names = self.model.feature_name_
        elif isinstance(self.model, (CatBoostClassifier, CatBoostRegressor)):
            feature_names = self.model.feature_names_
        else:
            raise NotImplementedError(
                f'Error with the backend choice. Supported backends: {self._backends}'
            )

        # A row can only be selected by label from a DataFrame.
        if not isinstance(data, DataFrame):
            index = None
        if isinstance(data, Series):
            data = DataFrame(data).T[feature_names]
        else:
            data = data[feature_names]
        if index is not None:
            data = data.loc[[index], :]

        shap_values = explainer.shap_values(data)
        # When a two-element list is returned, use its first entry together
        # with the matching expected value.
        is_pair = isinstance(shap_values, list) and (len(shap_values) == 2)
        if is_pair:
            shap_values = shap_values[0]
            expected_value = explainer.expected_value[0]
        else:
            expected_value = explainer.expected_value

        # Flattened feature values of the observation, reused for the table and labels.
        row_values = data.values.reshape(-1)

        prediction = DataFrame([expected_value] +
                               shap_values.reshape(-1).tolist(),
                               index=['Intercept'] + feature_names,
                               columns=['SHAP Value'])
        prediction['CumSum'] = cumsum(prediction['SHAP Value'])
        prediction['Value'] = append(nan, row_values)

        # Derive the link from the objective only when the caller gave none.
        if (self.objective is not None) and (link is None):
            if self.objective in ['poisson', 'gamma']:
                link = exp
            elif self.objective == 'binary':
                link = sigmoid
        if link is not None:
            prediction['Link'] = link(prediction['CumSum'])
            prediction['Contribution'] = [link(expected_value)] + list(
                diff(prediction['Link']))
            final_prediction = prediction['Link'].iloc[-1]
        else:
            prediction['Contribution'] = [expected_value] + list(
                diff(prediction['CumSum']))
            # Without a link there is no 'Link' column; the last cumulative
            # sum is the value reported as the prediction.
            final_prediction = prediction['CumSum'].iloc[-1]

        # First bar is the intercept, the rest are labelled 'feature=value'.
        bar_labels = [prediction.index[0]] + [
            f'{name}={value}'
            for name, value in zip(prediction.index[1:], row_values)
        ]
        fig = Figure(
            Waterfall(
                name=f'Prediction {index}',
                orientation='h',
                measure=['relative'] * len(prediction),
                y=bar_labels,
                x=prediction['Contribution']))
        fig.update_layout(**(layout_dict if layout_dict is not None else {}))

        if show:
            fig.show()
        else:
            json_ = prediction[['Value', 'SHAP Value',
                                'Contribution']].T.to_dict()
            fig_base64 = b64encode(
                to_image(fig, format='jpeg', engine='kaleido')).decode('ascii')
            json_.update({
                # Take the single row label explicitly instead of converting
                # a 1-d array to a scalar.
                'id': int(data.index[0]),
                'predict': final_prediction,
                "ShapValuesPlot": fig_base64
            })
            return json_