def test_classification(self):
    """Breast-cancer binary classification with BRCG: metrics and exact rules."""
    frame = pd.DataFrame(self.bc.data, columns=self.bc.feature_names)
    X_tr, X_te, y_tr, y_te = train_test_split(
        frame, self.bc.target, test_size=0.2, random_state=31)

    binarizer = FeatureBinarizer(negations=True)
    X_tr_bin = binarizer.fit_transform(X_tr)
    X_te_bin = binarizer.transform(X_te)
    self.assertEqual(len(X_tr_bin.columns), 540)
    self.assertEqual(len(X_te_bin.columns), 540)

    explainer = BRCGExplainer(BooleanRuleCG(silent=True))
    explainer.fit(X_tr_bin, y_tr)
    predictions = explainer.predict(X_te_bin)

    # All four classification metrics share the same 0.9 threshold.
    for metric in (accuracy_score, precision_score, recall_score, f1_score):
        self.assertGreater(metric(y_te, predictions), 0.9)

    explanation = explainer.explain()
    self.assertEqual(explanation['rules'], [
        'compactness error > 0.01 AND worst concavity <= 0.22 AND worst symmetry <= 0.28',
        'mean texture <= 15.46 AND mean concavity <= 0.15 AND area error <= 54.16',
        'fractal dimension error > 0.00 AND worst area <= 680.60 AND worst concave points <= 0.18',
        'mean concave points <= 0.05 AND perimeter error <= 3.80 AND worst area <= 930.88 AND worst smoothness <= 0.16'
    ])
def __init__(self, explainer, X, model=None, y=None, regressor_params=None):
    """
    Constructor. For a description of the missing arguments, please refer
    to the AnteHocInterpreter.

    Args:
        explainer (str): name of the explainer to use.
        X (np.ndarray or pd.DataFrame): data to explain.
        model (depiction.models.base.BaseModel): a model to interpret.
            Defaults to None, a.k.a. ante-hoc.
        y (np.ndarray): binary labels for X. Defaults to None, a.k.a.
            post-hoc.
        regressor_params (dict): parameters for the regressor.
            Defaults to None, meaning no extra parameters.

    Raises:
        RuntimeError: if neither a model (post-hoc) nor labels
            (ante-hoc) are provided.
        ValueError: if the explainer or its regressor is not supported.
    """
    # Copy to avoid the shared-mutable-default pitfall of the previous
    # `regressor_params={}` signature.
    regressor_params = {} if regressor_params is None else dict(regressor_params)
    is_post_hoc = y is None
    is_ante_hoc = model is None
    if is_ante_hoc and is_post_hoc:
        raise RuntimeError(
            'Make sure you pass a model (post-hoc) or labels (ante-hoc)')
    if model is None:
        # Ante-hoc: the rule model itself is trained on (X, y).
        super(RuleAIX360, self).__init__(
            AnteHocInterpreter.UsageMode.ANTE_HOC,
            task_type=Task.BINARY,
            data_type=DataType.TABULAR)
    else:
        # Post-hoc: the rule model mimics the given model's predictions.
        super(RuleAIX360, self).__init__(
            AnteHocInterpreter.UsageMode.POST_HOC, model=model)
    if 'glrm' in explainer:
        # Expected format: 'glrm_<regressor>', e.g. 'glrm_logistic'.
        regressor = explainer.split('_')[1]
        if regressor == 'logistic':
            self.regressor = LogisticRuleRegression(**regressor_params)
        elif regressor == 'linear':
            self.regressor = LinearRuleRegression(**regressor_params)
        else:
            raise ValueError(
                "Regressor '{}' not supported! Available regressors: {}".
                format(regressor, self._AVAILABLE_RULE_REGRESSORS))
        self.explainer = GLRMExplainer(self.regressor)
    elif explainer == 'brcg':
        self.regressor = BooleanRuleCG(**regressor_params)
        self.explainer = BRCGExplainer(self.regressor)
    else:
        raise ValueError(
            "Interpreter '{}' not supported! Available interpreters: {}".
            format(explainer, self.AVAILABLE_INTERPRETERS))
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    self.X = X
    self.y = y
    # The binarizer fitted here is reused to binarize prediction-time data.
    self.binarizer = FeatureBinarizer(negations=True)
    self.X_binarized = self.binarizer.fit_transform(self.X)
    self._fitted = False
def binarize_data(self):
    """
    Binarize the training data after initialization.

    The fitted binarizer is stored on the instance so that points seen at
    prediction time can be binarized with the same thresholds.
    """
    frame = pd.DataFrame(self.train_data)
    if self.feature_names:
        frame.columns = self.feature_names
    binarizer = FeatureBinarizer(negations=True, returnOrd=True)
    # The standardized-ordinal companion frame is not needed here.
    binarized, _ = binarizer.fit_transform(frame)
    self.binarizer = binarizer
    self.binarized_train_data = binarized
def test_classification(self):
    """Breast-cancer classification with a GLRM logistic rule model."""
    frame = pd.DataFrame(self.bc.data, columns=self.bc.feature_names)
    X_tr, X_te, y_tr, y_te = train_test_split(
        frame, self.bc.target, test_size=0.2, random_state=31)

    binarizer = FeatureBinarizer(negations=True)
    X_tr_bin = binarizer.fit_transform(X_tr)
    X_te_bin = binarizer.transform(X_te)
    self.assertEqual(len(X_tr_bin.columns), 540)
    self.assertEqual(len(X_te_bin.columns), 540)

    explainer = GLRMExplainer(LogisticRuleRegression(maxSolverIter=2000))
    explainer.fit(X_tr_bin, y_tr)
    predictions = explainer.predict(X_te_bin)

    self.assertGreater(accuracy_score(y_te, predictions), 0.85)
    self.assertGreater(precision_score(y_te, predictions), 0.85)
    self.assertGreater(recall_score(y_te, predictions), 0.85)
    self.assertGreater(f1_score(y_te, predictions), 0.9)

    explanation = explainer.explain()
    expected = pd.DataFrame(columns=["rule", "coefficient"], data=[
        ['(intercept)', -11.2],
        ['worst perimeter <= 116.46 AND worst concave points <= 0.15', -11.9],
        ['worst concave points <= 0.15', 10.1],
        ['worst perimeter <= 116.46 AND worst concave points <= 0.18', 9.8],
        ['worst area <= 930.88', 5.4],
        ['worst area > 680.60 AND worst concavity > 0.22', -3.3],
        ['worst perimeter <= 116.46 AND worst smoothness <= 0.16', 3.1],
        ['mean concave points <= 0.05', 1.5],
        ['worst concavity <= 0.27', 0.9],
        ['worst concave points <= 0.12', 0.63],
        ['worst perimeter <= 104.38', -0.02]
    ])
    assert_frame_equal(explanation, expected, check_dtype=False,
                       check_exact=False, check_less_precise=1)

    # Compare each generated plot line against the stored reference data.
    figs, _ = explainer.visualize(frame, binarizer)
    with open('tests/rbm/logistic_plot_data.json') as fp:
        plot_data = json.load(fp)
    for name, reference in plot_data.items():
        obtained_plot = figs[name].axes[0].lines[0].get_xydata()
        assert_allclose(np.array(reference), obtained_plot, rtol=1e-2)
}, "fbs": { 0: "False", 1: "True" }, "restecg": { 0: "normal", 1: "ST-T wave abnormality", 2: "left ventricular hypertrophy", }, } # Load and preprocess dataset df = pd.read_csv("heart.csv") for k, v in num2desc.items(): df[k] = df[k].replace(v) y = df.pop("target") dfTrain, dfTest, yTrain, yTest = train_test_split(df, y, random_state=0, stratify=y) fb = FeatureBinarizer(negations=True, returnOrd=True) dfTrain, dfTrainStd = fb.fit_transform(dfTrain) dfTest, dfTestStd = fb.transform(dfTest) # Train model lrr = LogisticRuleRegression(lambda0=0.005, lambda1=0.001, useOrd=True) lrr.fit(dfTrain, yTrain, dfTrainStd)
def test_classification(self):
    """Boston-housing regression with a GLRM linear rule model.

    Checks regression metrics, the exact explanation table, and the
    visualization output against stored reference data.
    """
    boston_df = pd.DataFrame(self.boston.data, columns=self.boston.feature_names)
    X_train, X_test, Y_train, Y_test = train_test_split(
        boston_df, self.boston.target, test_size=0.25, random_state=31)

    fb = FeatureBinarizer(negations=True)
    X_train_fb = fb.fit_transform(X_train)
    X_test_fb = fb.transform(X_test)
    self.assertEqual(len(X_train_fb.columns), 196)
    self.assertEqual(len(X_test_fb.columns), 196)

    linear_model = LinearRuleRegression()
    explainer = GLRMExplainer(linear_model)
    explainer.fit(X_train_fb, Y_train)
    Y_pred = explainer.predict(X_test_fb)

    self.assertGreater(r2_score(Y_test, Y_pred), 0.8)
    self.assertGreater(explained_variance_score(Y_test, Y_pred), 0.8)
    self.assertLess(mean_absolute_error(Y_test, Y_pred), 3)
    self.assertLess(max_error(Y_test, Y_pred), 12)

    # Note: the original called explainer.explain() twice in a row; the
    # redundant duplicate call has been removed.
    explanation = explainer.explain()
    expected = pd.DataFrame(
        columns=["rule", "coefficient"],
        data=[['(intercept)', 21.9], ['NOX <= 0.66', 6.3],
              ['RM <= 7.16 AND DIS > 1.62', -5.8], ['LSTAT <= 4.66', 5.5],
              [
                  'DIS <= 3.32 AND RAD > 2.00 AND B > 295.98 AND LSTAT <= 22.79',
                  4.8
              ],
              ['CHAS not AND PTRATIO > 16.10', -3.9],
              ['RM <= 7.16 AND RAD <= 6.00', -3.3],
              ['TAX > 293.00 AND LSTAT > 4.66', -2.9],
              ['LSTAT <= 15.03', 2.8],
              ['INDUS > 4.05 AND LSTAT > 4.66', -2.5],
              [
                  'DIS <= 7.24 AND RAD > 2.00 AND PTRATIO <= 20.90 AND B <= 394.99 AND B > 295.98 AND LSTAT <= 22.79',
                  2.5
              ],
              ['LSTAT <= 9.48', 2.5],
              [
                  'CRIM <= 9.84 AND DIS <= 4.64 AND RAD > 1.00 AND TAX <= 666.00 AND LSTAT <= 22.79',
                  2.2
              ],
              ['LSTAT <= 17.60', 1.9],
              ['TAX > 330.00 AND LSTAT > 4.66', -1.8],
              ['CRIM <= 9.84 AND CRIM > 0.06 AND PTRATIO <= 20.90', 1.8],
              ['LSTAT <= 6.25', 1.6],
              ['RM <= 7.16 AND B > 380.27', -1.6],
              ['LSTAT <= 11.12', 1.6],
              ['RAD > 2.00 AND LSTAT <= 22.79', 1.2],
              ['RM <= 7.16', -1.2],
              ['CHAS not AND RM <= 7.16', 1.2],
              ['RM <= 6.51', -1.1],
              [
                  'CRIM <= 9.84 AND DIS <= 3.95 AND TAX <= 666.00 AND PTRATIO <= 20.90 AND B > 295.98',
                  1.0
              ],
              ['CRIM <= 9.84 AND RAD > 1.00 AND LSTAT <= 22.79', 1.0],
              ['DIS <= 3.95 AND LSTAT <= 22.79', -0.9],
              ['RM <= 6.74', -0.8],
              ['PTRATIO <= 19.52', 0.8],
              ['NOX <= 0.66 AND PTRATIO <= 20.90 AND LSTAT <= 22.79', -0.8],
              ['RAD > 4.00 AND LSTAT <= 22.79', -0.63],
              ['B <= 391.27 AND LSTAT <= 22.79', 0.5],
              ['LSTAT <= 7.58', 0.44],
              ['LSTAT <= 13.14', 0.17]])
    assert_frame_equal(explanation, expected, check_dtype=False,
                       check_exact=False, check_less_precise=1)

    # Compare each generated plot line against the stored reference data.
    figs, _ = explainer.visualize(boston_df, fb)
    with open('tests/rbm/linear_plot_data.json') as fp:
        plot_data = json.load(fp)
    for k, v in plot_data.items():
        obtained_plot = figs[k].axes[0].lines[0].get_xydata()
        assert_allclose(np.array(v), obtained_plot, rtol=1e-2)
class RuleAIX360(AnteHocInterpreter):
    """Rule-based interpreter backed by AIX360 (BRCG or GLRM explainers)."""

    # Rule regressors accepted for the 'glrm_*' explainers.
    _AVAILABLE_RULE_REGRESSORS = {'logistic', 'linear'}
    SUPPORTED_TASK = {Task.BINARY}
    SUPPORTED_DATATYPE = {DataType.TABULAR}
    # 'brcg' plus one 'glrm_<regressor>' entry per available regressor.
    AVAILABLE_INTERPRETERS = {'brcg'}.union(
        {'glrm_{}'.format(i) for i in _AVAILABLE_RULE_REGRESSORS})
    EXPLANATION_TYPE = ExplanationType.GLOBAL

    def __init__(self, explainer, X, model=None, y=None, regressor_params=None):
        """
        Constructor. For a description of the missing arguments, please refer
        to the AnteHocInterpreter.

        Args:
            explainer (str): name of the explainer to use.
            X (np.ndarray or pd.DataFrame): data to explain.
            model (depiction.models.base.BaseModel): a model to interpret.
                Defaults to None, a.k.a. ante-hoc.
            y (np.ndarray): binary labels for X. Defaults to None, a.k.a.
                post-hoc.
            regressor_params (dict): parameters for the regressor.
                Defaults to None, meaning no extra parameters.

        Raises:
            RuntimeError: if neither a model (post-hoc) nor labels
                (ante-hoc) are provided.
            ValueError: if the explainer or its regressor is not supported.
        """
        # Copy to avoid the shared-mutable-default pitfall of the previous
        # `regressor_params={}` signature.
        regressor_params = {} if regressor_params is None else dict(regressor_params)
        is_post_hoc = y is None
        is_ante_hoc = model is None
        if is_ante_hoc and is_post_hoc:
            raise RuntimeError(
                'Make sure you pass a model (post-hoc) or labels (ante-hoc)')
        if model is None:
            # Ante-hoc: the rule model itself is trained on (X, y).
            super(RuleAIX360, self).__init__(
                AnteHocInterpreter.UsageMode.ANTE_HOC,
                task_type=Task.BINARY,
                data_type=DataType.TABULAR)
        else:
            # Post-hoc: the rule model mimics the given model's predictions.
            super(RuleAIX360, self).__init__(
                AnteHocInterpreter.UsageMode.POST_HOC, model=model)
        if 'glrm' in explainer:
            # Expected format: 'glrm_<regressor>', e.g. 'glrm_logistic'.
            regressor = explainer.split('_')[1]
            if regressor == 'logistic':
                self.regressor = LogisticRuleRegression(**regressor_params)
            elif regressor == 'linear':
                self.regressor = LinearRuleRegression(**regressor_params)
            else:
                raise ValueError(
                    "Regressor '{}' not supported! Available regressors: {}".
                    format(regressor, self._AVAILABLE_RULE_REGRESSORS))
            self.explainer = GLRMExplainer(self.regressor)
        elif explainer == 'brcg':
            self.regressor = BooleanRuleCG(**regressor_params)
            self.explainer = BRCGExplainer(self.regressor)
        else:
            raise ValueError(
                "Interpreter '{}' not supported! Available interpreters: {}".
                format(explainer, self.AVAILABLE_INTERPRETERS))
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        self.X = X
        self.y = y
        # The binarizer fitted here is reused to binarize prediction-time data.
        self.binarizer = FeatureBinarizer(negations=True)
        self.X_binarized = self.binarizer.fit_transform(self.X)
        self._fitted = False

    def _fit_antehoc(self, X, y):
        """
        Fitting the rule based model (antehoc version).

        Args:
            X (pandas.DataFrame): model input data
            y (array): model output data
        """
        self.explainer.fit(X, y)
        self._fitted = True

    def _fit_posthoc(self, X, preprocess_X=None, postprocess_y=None):
        """
        Fitting the rule based model to posthoc interpret another model.

        Args:
            X: input to the model to be interpreted. Type depends on the
                model.
            preprocess_X: function to create a pandas.DataFrame from the
                model input to feed to this rule-based model.
            postprocess_y: function to postprocess the model output to feed
                to this rule-based model.
        """
        # Labels come from the interpreted model, not from ground truth.
        y = self._to_interpret.predict(X)
        processed_X = X
        processed_y = y
        if preprocess_X is not None:
            processed_X = preprocess_X(processed_X)
        if postprocess_y is not None:
            processed_y = postprocess_y(processed_y)
        self._fit_antehoc(processed_X, processed_y)

    def interpret(self, explanation_configs=None, path=None):
        """
        Produce explanation.

        Args:
            explanation_configs (dict): keyword arguments for the explain
                function of the explainer. Refer to the AIX360
                implementation for details. Defaults to None (no extra
                arguments).
            path (str): path where to save the explanation. If None, a
                notebook environment will be assumed, and the explanation
                will be visualized.

        Returns:
            pd.DataFrame or dict: the explanation.
        """
        explanation_configs = (
            {} if explanation_configs is None else explanation_configs)
        # Lazily fit on first use according to the usage mode.
        if not self._fitted:
            if self.usage_mode == self.UsageMode.ANTE_HOC:
                self._fit_antehoc(self.X_binarized, self.y)
            else:
                self._fit_posthoc(self.X, self.binarizer.transform)
        self.explanation = self.explainer.explain(**explanation_configs)
        if path is None:
            self._visualize_explanation(self.explanation)
        else:
            self._save_explanation(self.explanation, path)
        return self.explanation

    def _visualize_explanation(self, explanation):
        """
        Helper function to visualize the explanation.
        """
        if isinstance(self.explainer, GLRMExplainer):
            # GLRM explanations are DataFrames: print them in full.
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(explanation)
        elif isinstance(self.explainer, BRCGExplainer):
            # from "https://github.com/IBM/AIX360/blob/master/examples/rbm/breast-cancer-br.ipynb"
            isCNF = 'Predict Y=1 if ANY of the following rules are satisfied, otherwise Y=0:'
            notCNF = 'Predict Y=0 if ANY of the following rules are satisfied, otherwise Y=1:'
            print(isCNF if explanation['isCNF'] else notCNF)
            print()
            for rule in explanation['rules']:
                print(f'  - {rule}')

    def _save_explanation(self, explanation, path):
        """Persist the explanation: DataFrames via pickle protocol of pandas,
        anything else via plain pickle."""
        if isinstance(explanation, DataFrame):
            explanation.to_pickle(path)
        else:
            with open(path, 'wb') as f:
                pickle.dump(explanation, f)

    def predict(self, X, **kwargs):
        """
        Predict with the underlying explainer.

        Args:
            X: input samples (in the binarized feature space).
            kwargs: forwarded to the explainer's predict.

        Returns:
            the explainer's predictions. (The previous implementation was
            missing the return statement and always yielded None.)
        """
        return self.explainer.predict(X, **kwargs)
def aix360_rules_wrapper(
    df_anomalies,
    numerical_cols,
    categorical_cols,
    rule_algorithm="",
    simplify_rules=False,
    model_params=None,
):
    """
    Extract inlier/outlier rules with AIX360 ("brlg" BooleanRuleCG or
    "logrr" LogisticRuleRegression).

    Parameters
    ----------
    df_anomalies : pd.DataFrame
        Data containing the feature columns and a "predictions" column
        (positive values are treated as inliers, negative as outliers).
    numerical_cols : list
        Names of the numerical feature columns.
    categorical_cols : list
        Names of the categorical feature columns.
    rule_algorithm : str, optional
        Either "brlg" or "logrr". The default is "".
    simplify_rules : bool, optional
        Whether to prune the extracted rules. The default is False.
    model_params : dict, optional
        Parameters for the underlying rule model. The default is None
        (built-in defaults are used).

    Raises
    ------
    ValueError
        If `rule_algorithm` is neither "brlg" nor "logrr".

    Returns
    -------
    df_rules_inliers : pd.DataFrame
        Rules describing the inlier region (rule_prediction == 1).
    df_rules_outliers : pd.DataFrame
        Rules describing the outlier region (rule_prediction == -1).
    """
    # Copy to avoid mutating the caller's dict (the previous mutable
    # default `model_params={}` was modified in place, leaking defaults
    # across calls).
    model_params = {} if model_params is None else dict(model_params)

    # Define variables
    feature_cols = numerical_cols + categorical_cols
    X = df_anomalies[feature_cols].astype(float)
    y = df_anomalies["predictions"].astype(int)
    y_inliers = np.array([x if x > 0 else 0 for x in y])  # inlier levels
    y_outliers = np.array([1 if x < 0 else 0 for x in y])  # outlier levels

    # Feature binarize
    # NOTE(review): upstream aix360 names these kwargs colCateg/numThresh —
    # confirm against the pinned FeatureBinarizer version.
    fb = FeatureBinarizer(negations=True,
                          returnOrd=True,
                          colsCateg=categorical_cols,
                          numThres=90)
    X_fb, X_std = fb.fit_transform(X)

    # Choose model
    if rule_algorithm == "brlg":
        # Default params (only where the caller did not provide them).
        model_params.setdefault("lambda0", 1e-3)
        model_params.setdefault("lambda1", 1e-3)
        model_params.setdefault("CNF", False)
        # Inliers
        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_inliers)
        list_rules_inliers = model_rules.explain()["rules"]
        # Outliers
        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_outliers)
        list_rules_outliers = model_rules.explain()["rules"]
    elif rule_algorithm == "logrr":
        # Default params (only where the caller did not provide them).
        model_params.setdefault("lambda0", 0.005)
        model_params.setdefault("lambda1", 0.001)
        # Obtain rules [Inliers]: one fit; the coefficient sign separates
        # inlier rules (positive) from outlier rules (negative).
        model_rules = LogisticRuleRegression(**model_params)
        model_rules.fit(X_fb, y_inliers, X_std)
        df_rules = model_rules.explain()
        try:
            # Inliers
            df_rules_inliers = df_rules[
                (df_rules["coefficient"] > 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_inliers = list(
                df_rules_inliers["rule/numerical feature"])
            # Outliers
            df_rules_outliers = df_rules[
                (df_rules["coefficient"] < 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_outliers = list(
                df_rules_outliers["rule/numerical feature"])
        except KeyError:
            # Other AIX360 versions name the column simply "rule".
            # Inliers
            df_rules_inliers = df_rules[(df_rules["coefficient"] > 0)
                                        & (df_rules["rule"] != "(intercept)")]
            list_rules_inliers = list(df_rules_inliers["rule"])
            # Outliers
            df_rules_outliers = df_rules[(df_rules["coefficient"] < 0)
                                         & (df_rules["rule"] != "(intercept)")]
            list_rules_outliers = list(df_rules_outliers["rule"])
    else:
        # Bug fix: the placeholder was never substituted before — the
        # message printed a literal "{0}".
        raise ValueError(
            "Argument {0} not recognised -- use 'brlg' or 'logrr' instead".
            format(rule_algorithm))

    # Turn to DF
    list_rules_inliers = [x.replace("AND", "&") for x in list_rules_inliers]
    list_rules_outliers = [x.replace("AND", "&") for x in list_rules_outliers]
    df_inliers = turn_rules_to_df(list_rules=list_rules_inliers,
                                  list_cols=feature_cols)
    df_outliers = turn_rules_to_df(list_rules=list_rules_outliers,
                                   list_cols=feature_cols)

    # Get rule size (number of '&'-joined conditions per rule)
    df_inliers = df_inliers.reset_index(drop=True)
    df_inliers["size_rules"] = [len(x.split("&")) for x in list_rules_inliers]
    df_outliers = df_outliers.reset_index(drop=True)
    df_outliers["size_rules"] = [
        len(x.split("&")) for x in list_rules_outliers
    ]

    # Prune rules
    if simplify_rules:
        if len(df_inliers) > 0:
            df_rules_pruned = simplifyRules(
                df_inliers.drop(columns=["size_rules"]), categorical_cols)
            # Re-attach the rule sizes dropped before pruning.
            df_rules_pruned = df_rules_pruned.reset_index().merge(
                df_inliers.reset_index()[["index", "size_rules"]], how="left")
            df_rules_pruned.index = df_rules_pruned["index"]
            df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                   errors="ignore")
            df_rules_inliers = df_rules_pruned.copy()
            df_rules_inliers["rule_prediction"] = 1
        else:
            df_rules_inliers = pd.DataFrame()
        if len(df_outliers) > 0:
            df_rules_pruned = simplifyRules(
                df_outliers.drop(columns=["size_rules"]), categorical_cols)
            df_rules_pruned = df_rules_pruned.reset_index().merge(
                df_outliers.reset_index()[["index", "size_rules"]], how="left")
            df_rules_pruned.index = df_rules_pruned["index"]
            df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                   errors="ignore")
            df_rules_outliers = df_rules_pruned.copy()
            df_rules_outliers["rule_prediction"] = -1
        else:
            df_rules_outliers = pd.DataFrame()
    else:
        df_rules_inliers = df_inliers
        df_rules_inliers["rule_prediction"] = 1
        df_rules_outliers = df_outliers
        df_rules_outliers["rule_prediction"] = -1

    return df_rules_inliers, df_rules_outliers
def fbt_vs_fb(X,
              y,
              categorical=[],  # NOTE(review): mutable default; never mutated here, but verify FeatureBinarizer does not modify it
              iterations=30,
              treeNum=1,
              treeDepth=4,
              numThresh=9,
              filename=None):
    """Benchmark FeatureBinarizer (fb) vs FeatureBinarizerFromTrees (fbt).

    For each of `iterations` train/test splits, fits both BRCG and
    logistic-rule-regression (logrr) explainers on data binarized by each
    binarizer, and records time, classification metrics, rule/clause
    counts, and the stringified explanation.

    Args:
        X: feature data (pandas DataFrame-like, accepted by the binarizers).
        y: binary labels.
        categorical: names of categorical columns passed to both binarizers.
        iterations: number of random train/test splits to evaluate.
        treeNum, treeDepth: FeatureBinarizerFromTrees parameters.
        numThresh: FeatureBinarizer threshold-count parameter.
        filename: if given, the result DataFrame is pickled to this path.

    Returns:
        DataFrame indexed by (explainer, binarizer, iteration) with columns
        time/accuracy/precision/recall/f1/rules/clauses/explanation.
    """

    def fit_transform(transformer, args_train, args_test):
        # Fit on the train args, transform the test args; both binarizers
        # return (binarized, standardized-ordinal) pairs here (returnOrd=True).
        X_train_fb, X_train_std_fb = transformer.fit_transform(*args_train)
        X_test_fb, X_test_std_fb = transformer.transform(*args_test)
        return X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb

    def fit_score(explainer, y_test, args_train, args_test):
        # Time only the fit; then compute metrics and model-size statistics.
        t = time()
        explainer.fit(*args_train)
        t = time() - t
        y_pred = explainer.predict(*args_test)
        if isinstance(explainer, BRCGExplainer):
            # Columns of z with weight > 0.5 are the selected rules.
            z: DataFrame = explainer._model.z.loc[:, explainer._model.w > 0.5]
            rules = z.shape[1]
            clauses = z.any(axis=1).sum()
        else:
            # GLRM case; counts derived from the internal z matrix —
            # presumably rows correspond to clauses. TODO confirm against
            # the aix360 GLRM internals.
            z: DataFrame = explainer._model.z
            rules = z.any(axis=1).sum()
            clauses = z.any(axis=1).sum() + 1  # +1 for intercept
        return (t, accuracy_score(y_test, y_pred),
                precision_score(y_test, y_pred), recall_score(y_test, y_pred),
                f1_score(y_test, y_pred), rules, clauses,
                str(explainer.explain()))

    columns = [
        'time', 'accuracy', 'precision', 'recall', 'f1', 'rules', 'clauses'
    ]
    # One row per (explainer, binarizer, iteration) combination.
    index = pd.MultiIndex.from_product(
        (['brcg', 'logrr'], ['fb', 'fbt'], range(iterations)))
    d = DataFrame(np.zeros((iterations * 4, len(columns))),
                  dtype=float,
                  index=index,
                  columns=columns)
    d['explanation'] = ''

    fb = FeatureBinarizer(colCateg=categorical,
                          negations=True,
                          returnOrd=True,
                          numThresh=numThresh)
    fbt = FeatureBinarizerFromTrees(colCateg=categorical,
                                    treeNum=treeNum,
                                    treeDepth=treeDepth,
                                    returnOrd=True)
    brcg = BRCGExplainer(BooleanRuleCG(silent=True))
    logrr = GLRMExplainer(
        LogisticRuleRegression(lambda0=0.005,
                               lambda1=0.001,
                               useOrd=True,
                               maxSolverIter=1000))

    for i in range(iterations):
        # Train/Test split (seeded per iteration for reproducibility)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=i)
        # FeatureBinarizer
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fb, (X_train, ), (X_test, ))
        d.loc[('brcg', 'fb', i)] = fit_score(brcg, y_test,
                                             (X_train_fb, y_train),
                                             (X_test_fb, ))
        # logrr additionally consumes the standardized-ordinal frames.
        d.loc[('logrr', 'fb', i)] = fit_score(logrr, y_test,
                                              (X_train_fb, y_train,
                                               X_train_std_fb),
                                              (X_test_fb, X_test_std_fb))
        # FeatureBinarizerFromTrees (needs y for its fit)
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fbt, (X_train, y_train), (X_test, ))
        d.loc[('brcg', 'fbt', i)] = fit_score(brcg, y_test,
                                              (X_train_fb, y_train),
                                              (X_test_fb, ))
        d.loc[('logrr', 'fbt', i)] = fit_score(logrr, y_test,
                                               (X_train_fb, y_train,
                                                X_train_std_fb),
                                               (X_test_fb, X_test_std_fb))

    # Optionally persist the full result table.
    if filename is not None:
        with open(filename, 'wb') as fl:
            pickle.dump(d, fl)
    return d