def _explain_trees(
    model: Model,
    transformed_data: Table,
    transformed_reference_data: Table,
    progress_callback: Callable,
) -> Tuple[
    Optional[List[np.ndarray]], Optional[np.ndarray], Optional[np.ndarray]
]:
    """
    Computes and returns SHAP values for learners that are explained by
    TreeExplainer: all scikit-learn models based on trees. When explanation
    with TreeExplainer is not possible, it returns None.
    """
    if sparse.issparse(transformed_data.X):
        # sparse data is not supported by TreeExplainer; KernelExplainer can
        # handle it instead
        return None, None, None
    try:
        explainer = TreeExplainer(
            model.skl_model,
            data=sample(transformed_reference_data.X, 100),
        )
    # the broad except is intentional: this is what TreeExplainer throws
    except Exception:
        return None, None, None

    # TreeExplainer cannot explain more than ~1000 instances in reasonable time
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)
    num_classes = (
        len(model.domain.class_var.values)
        if model.domain.class_var.is_discrete
        else None
    )

    # work in batches: explaining rows one at a time roughly doubles the
    # processing time compared to a batch size of 10
    shap_values = []
    batch_size = 1  # currently set to 1 to minimize widget blocking
    for i in range(0, len(data_sample), batch_size):
        progress_callback(i / len(data_sample))
        batch = data_sample.X[i : i + batch_size]
        shap_values.append(
            explainer.shap_values(batch, check_additivity=False)
        )

    shap_values = _join_shap_values(shap_values)
    base_value = explainer.expected_value
    # when a class value was missing in the training phase, the skl_model does
    # not output a probability for it; for other models Orange handles this
    if num_classes is not None:
        missing_d = num_classes - len(shap_values)
        shap_values += [
            np.zeros(shap_values[0].shape) for _ in range(missing_d)
        ]
        base_value = np.hstack((base_value, np.zeros(missing_d)))

    return shap_values, sample_mask, base_value
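# A minimal usage sketch for _explain_trees (illustrative only). It assumes the
# surrounding module's imports (Model, Table, TreeExplainer, sample, and the
# _subsample_data/_join_shap_values helpers) resolve as above, and that Orange's
# Table and RandomForestLearner can be imported; those names are assumptions
# about the caller, not part of this excerpt.
def _example_explain_trees_usage():
    from Orange.data import Table
    from Orange.classification import RandomForestLearner

    data = Table("iris")
    model = RandomForestLearner()(data)
    # callers are expected to pass data already transformed into the model's
    # domain; here the training table doubles as the reference (background) data
    transformed = data.transform(model.domain)
    shap_values, sample_mask, base_value = _explain_trees(
        model, transformed, transformed, progress_callback=lambda p: None
    )
    # shap_values: typically one (n_explained, n_features) array per class;
    # sample_mask: which input rows were subsampled and explained;
    # base_value: the explainer's expected value per class
    return shap_values, sample_mask, base_value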
def _explain_other_models(
    model: Model,
    transformed_data: Table,
    transformed_reference_data: Table,
    progress_callback: Callable,
) -> Tuple[List[np.ndarray], np.ndarray, np.ndarray]:
    """
    Computes SHAP values for any learner with KernelExplainer.
    """
    # 1000 instances is a sample size that, for typical data and models, does
    # not take too long to explain
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)
    try:
        ref = kmeans(transformed_reference_data.X, k=10)
    except ValueError:
        # k-means fails with a ValueError when it cannot produce enough
        # clusters; in that case use a random sample instead of cluster centres
        ref = sample(transformed_reference_data.X, nsamples=100)

    explainer = KernelExplainer(
        lambda x: (
            model(x)
            if model.domain.class_var.is_continuous
            else model(x, model.Probs)
        ),
        ref,
    )

    shap_values = []
    for i, row in enumerate(data_sample.X):
        progress_callback(i / len(data_sample))
        shap_values.append(
            explainer.shap_values(
                row, nsamples=100, silent=True, l1_reg="num_features(90)"
            )
        )
    return (
        _join_shap_values(shap_values),
        sample_mask,
        explainer.expected_value,
    )
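# A minimal usage sketch for _explain_other_models (illustrative only), using a
# non-tree Orange learner so the KernelExplainer path is exercised. Table and
# LogisticRegressionLearner are assumptions about the caller, not part of this
# excerpt.
def _example_explain_other_models_usage():
    from Orange.data import Table
    from Orange.classification import LogisticRegressionLearner

    data = Table("iris")
    model = LogisticRegressionLearner()(data)
    # as above, pass data transformed into the model's domain; the training
    # table serves as the reference (background) data in this sketch
    transformed = data.transform(model.domain)
    shap_values, sample_mask, expected_value = _explain_other_models(
        model, transformed, transformed, progress_callback=lambda p: None
    )
    return shap_values, sample_mask, expected_value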
def shap_calc(
    model,
    X,
    return_explainer=False,
    verbose=0,
    sample_size=100,
    approximate=False,
    check_additivity=True,
    **shap_kwargs,
):
    """
    Helper function to calculate the Shapley values for a given model.

    Args:
        model (binary model): Trained model.
        X (pd.DataFrame or np.ndarray): Feature set.
        return_explainer (boolean): if True, returns a tuple (shap_values, explainer).
        verbose (int, optional): Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only the most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        sample_size (int, optional): Number of background samples used to build the masker
            for non-tree-based models.
        approximate (boolean): if True, uses SHAP approximations - less accurate, but very fast.
            It applies to tree-based explainers only.
        check_additivity (boolean): if False, SHAP disables the additivity check for tree-based models.
        **shap_kwargs: kwargs passed to shap.Explainer.

    Returns:
        (np.ndarray or tuple(np.ndarray, shap.Explainer)): Shapley values for the model,
            optionally also the explainer.
    """
    if isinstance(model, Pipeline):
        raise TypeError(
            "The provided model is a Pipeline. Unfortunately, the features based on SHAP do not support "
            "pipelines, because they cannot be used in combination with shap.Explainer. Please apply any "
            "data transformations before running the probatus module."
        )

    # Suppress warnings regarding XGBoost and LightGBM models.
    with warnings.catch_warnings():
        if verbose <= 100:
            warnings.simplefilter("ignore")

        # Create the background data required for non-tree-based models. A single
        # datapoint can be passed as the masker
        # (https://github.com/slundberg/shap/issues/955#issuecomment-569837201).
        # Guard against datasets with fewer rows than the requested sample size.
        if X.shape[0] < sample_size:
            sample_size = int(np.ceil(X.shape[0] * 0.2))
        mask = sample(X, sample_size)

        if X.select_dtypes("category").shape[1] > 0:
            if verbose > 0:
                warnings.warn(
                    "Using tree_dependent feature_perturbation (in shap) without background"
                    " data for a tree-based model with categorical features."
                )
            explainer = Explainer(model, **shap_kwargs)
        else:
            explainer = Explainer(model, masker=mask, **shap_kwargs)

        # Tree-based explainers additionally accept the check_additivity and
        # approximate arguments.
        if isinstance(explainer, Tree):
            shap_values = explainer.shap_values(
                X, check_additivity=check_additivity, approximate=approximate
            )
        else:
            shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list) and len(shap_values) == 2:
            warnings.warn(
                "Shap values are related to the output probabilities of class 1 for this model, instead of "
                "log odds."
            )
            shap_values = shap_values[1]

    if return_explainer:
        return shap_values, explainer
    return shap_values
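# A minimal usage sketch for shap_calc (illustrative only). The scikit-learn
# RandomForestClassifier / make_classification imports are assumptions about
# the caller; shap_calc itself only requires a fitted, non-Pipeline model and
# a feature matrix.
def _example_shap_calc_usage():
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
    clf = RandomForestClassifier(random_state=0).fit(X, y)

    # returns the SHAP values and, because return_explainer=True, the fitted
    # shap.Explainer as well
    shap_values, explainer = shap_calc(clf, X, return_explainer=True, verbose=0)
    return shap_values, explainer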