def train_r_0(self):
    """Train the rule model whose rules predict for class 0.

    Fits a BooleanRuleCG model (CNF=False, i.e. DNF per the aix360 API)
    on the binarized training data and stores it on ``self.r_0``.
    """
    # Per this class's convention, rules produced by an un-inverted fit
    # predict for label 0.
    rule_model = BooleanRuleCG(CNF=False)
    rule_model.fit(self.binarized_train_data, self.train_labels)
    self.r_0 = rule_model
def test_classification(self):
    """End-to-end BRCG classification check on the breast-cancer dataset."""
    # Build a feature DataFrame and a fixed, reproducible split.
    features = pd.DataFrame(self.bc.data, columns=self.bc.feature_names)
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, self.bc.target, test_size=0.2, random_state=31)

    # Binarize the features; the expected column counts pin the
    # binarizer's output for this dataset.
    binarizer = FeatureBinarizer(negations=True)
    X_train_fb = binarizer.fit_transform(X_train)
    X_test_fb = binarizer.transform(X_test)
    self.assertEqual(len(X_train_fb.columns), 540)
    self.assertEqual(len(X_test_fb.columns), 540)

    # Fit the explainer and check the usual classification metrics.
    explainer = BRCGExplainer(BooleanRuleCG(silent=True))
    explainer.fit(X_train_fb, Y_train)
    Y_pred = explainer.predict(X_test_fb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score):
        self.assertGreater(metric(Y_test, Y_pred), 0.9)

    # The learned rule set is deterministic for this fixed split.
    explanation = explainer.explain()
    self.assertEqual(explanation['rules'], [
        'compactness error > 0.01 AND worst concavity <= 0.22 AND worst symmetry <= 0.28',
        'mean texture <= 15.46 AND mean concavity <= 0.15 AND area error <= 54.16',
        'fractal dimension error > 0.00 AND worst area <= 680.60 AND worst concave points <= 0.18',
        'mean concave points <= 0.05 AND perimeter error <= 3.80 AND worst area <= 930.88 AND worst smoothness <= 0.16'
    ])
def __init__(self, explainer, X, model=None, y=None, regressor_params={}):
    """Constructor.

    For a description of the missing arguments, please refer to the
    AnteHocInterpreter.

    Args:
        explainer (str): name of the explainer to use ('brcg' or a
            'glrm_logistic'/'glrm_linear' variant).
        X (np.ndarray or pd.DataFrame): data to explain.
        model (depiction.models.base.BaseModel): a model to interpret.
            Defaults to None, a.k.a. ante-hoc.
        y (np.ndarray): binary labels for X. Defaults to None, a.k.a.
            post-hoc.
        regressor_params (dict): parameters for the regressor.

    Raises:
        RuntimeError: if neither a model nor labels are provided.
        ValueError: if the explainer or regressor name is unknown.
    """
    # At least one of `model` (post-hoc) or `y` (ante-hoc) is required.
    if model is None and y is None:
        raise RuntimeError(
            'Make sure you pass a model (post-hoc) or labels (ante-hoc)')

    if model is not None:
        super(RuleAIX360, self).__init__(
            AnteHocInterpreter.UsageMode.POST_HOC, model=model)
    else:
        super(RuleAIX360, self).__init__(
            AnteHocInterpreter.UsageMode.ANTE_HOC,
            task_type=Task.BINARY,
            data_type=DataType.TABULAR)

    if 'glrm' in explainer:
        # Explainer names look like 'glrm_<regressor>'.
        regressor_name = explainer.split('_')[1]
        if regressor_name == 'logistic':
            self.regressor = LogisticRuleRegression(**regressor_params)
        elif regressor_name == 'linear':
            self.regressor = LinearRuleRegression(**regressor_params)
        else:
            raise ValueError(
                "Regressor '{}' not supported! Available regressors: {}".
                format(regressor_name, self._AVAILABLE_RULE_REGRESSORS))
        self.explainer = GLRMExplainer(self.regressor)
    elif explainer == 'brcg':
        self.regressor = BooleanRuleCG(**regressor_params)
        self.explainer = BRCGExplainer(self.regressor)
    else:
        raise ValueError(
            "Interpreter '{}' not supported! Available interpreters: {}".
            format(explainer, self.AVAILABLE_INTERPRETERS))

    # Normalize the data to a DataFrame and binarize it once up front.
    self.X = pd.DataFrame(X) if isinstance(X, np.ndarray) else X
    self.y = y
    self.binarizer = FeatureBinarizer(negations=True)
    self.X_binarized = self.binarizer.fit_transform(self.X)
    self._fitted = False
def train_r_1(self):
    """Train the rule model whose rules predict for class 1.

    BooleanRuleCG rules generated from an un-inverted fit predict for
    label 0 (see ``train_r_0``), so the labels are inverted here to
    obtain rules that predict for label 1. The fitted model is stored
    on ``self.r_1``.
    """
    br_1 = BooleanRuleCG(CNF=False)
    # Invert binary labels (truthy -> 0, falsy -> 1); comprehension
    # replaces the original manual append loop.
    inverted_train_labels = np.array(
        [0 if label else 1 for label in self.train_labels])
    br_1.fit(self.binarized_train_data, inverted_train_labels)
    self.r_1 = br_1
def aix360_rules_wrapper(
    df_anomalies,
    numerical_cols,
    categorical_cols,
    rule_algorithm="",
    simplify_rules=False,
    model_params={},
):
    """
    Extract inlier/outlier rules from anomaly predictions using AIX360
    rule models ('brlg' -> BooleanRuleCG, 'logrr' -> LogisticRuleRegression).

    Parameters
    ----------
    df_anomalies : pd.DataFrame
        Data containing the feature columns plus a "predictions" column
        whose sign encodes inlier (> 0) vs outlier (< 0).
    numerical_cols : list of str
        Names of the numerical feature columns.
    categorical_cols : list of str
        Names of the categorical feature columns.
    rule_algorithm : str, optional
        Either "brlg" or "logrr". The default is "".
    simplify_rules : bool, optional
        If True, prune the extracted rules with simplifyRules.
        The default is False.
    model_params : dict, optional
        Extra keyword arguments for the underlying rule model. The
        caller's dict is never modified. The default is {}.

    Raises
    ------
    ValueError
        If ``rule_algorithm`` is not 'brlg' or 'logrr'.

    Returns
    -------
    df_rules_inliers : pd.DataFrame
        One row per inlier rule with a "size_rules" column (number of
        '&'-joined clauses) and "rule_prediction" == 1.
    df_rules_outliers : pd.DataFrame
        Same for outlier rules, with "rule_prediction" == -1.
    """
    # Work on a private copy: the original code wrote defaults into the
    # mutable default argument, leaking state across calls and mutating
    # any dict the caller passed in.
    model_params = dict(model_params) if model_params else {}

    def _prune(df_rules):
        """Prune rules with simplifyRules, re-attaching each rule's size."""
        pruned = simplifyRules(df_rules.drop(columns=["size_rules"]),
                               categorical_cols)
        pruned = pruned.reset_index().merge(
            df_rules.reset_index()[["index", "size_rules"]], how="left")
        pruned.index = pruned["index"]
        return pruned.drop(columns=["index"], errors="ignore").copy()

    # Define variables
    feature_cols = numerical_cols + categorical_cols
    X = df_anomalies[feature_cols].astype(float)
    y = df_anomalies["predictions"].astype(int)
    y_inliers = np.array([x if x > 0 else 0 for x in y])   # 1 for inliers
    y_outliers = np.array([1 if x < 0 else 0 for x in y])  # 1 for outliers

    # Feature binarize.
    # NOTE(review): 'colsCateg'/'numThres' do not match the documented
    # aix360 FeatureBinarizer kwargs ('colCateg'/'numThresh') — confirm
    # against the installed aix360 version before renaming.
    fb = FeatureBinarizer(negations=True,
                          returnOrd=True,
                          colsCateg=categorical_cols,
                          numThres=90)
    X_fb, X_std = fb.fit_transform(X)

    # Choose model
    if rule_algorithm == "brlg":
        # Fill in defaults without clobbering caller-supplied values.
        model_params.setdefault("lambda0", 1e-3)
        model_params.setdefault("lambda1", 1e-3)
        model_params.setdefault("CNF", False)

        # One model per target: inliers, then outliers.
        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_inliers)
        list_rules_inliers = model_rules.explain()["rules"]

        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_outliers)
        list_rules_outliers = model_rules.explain()["rules"]
    elif rule_algorithm == "logrr":
        model_params.setdefault("lambda0", 0.005)
        model_params.setdefault("lambda1", 0.001)

        # A single model fit on the inlier target; the coefficient sign
        # separates inlier rules (> 0) from outlier rules (< 0).
        model_rules = LogisticRuleRegression(**model_params)
        model_rules.fit(X_fb, y_inliers, X_std)
        df_rules = model_rules.explain()
        # The rule column is named "rule/numerical feature" or "rule"
        # depending on the aix360 version.
        try:
            df_rules_inliers = df_rules[
                (df_rules["coefficient"] > 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_inliers = list(
                df_rules_inliers["rule/numerical feature"])
            df_rules_outliers = df_rules[
                (df_rules["coefficient"] < 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_outliers = list(
                df_rules_outliers["rule/numerical feature"])
        except KeyError:
            df_rules_inliers = df_rules[(df_rules["coefficient"] > 0)
                                        & (df_rules["rule"] != "(intercept)")]
            list_rules_inliers = list(df_rules_inliers["rule"])
            df_rules_outliers = df_rules[(df_rules["coefficient"] < 0)
                                         & (df_rules["rule"] != "(intercept)")]
            list_rules_outliers = list(df_rules_outliers["rule"])
    else:
        # Bug fix: the original built a '{0}' template but never called
        # .format(), so the placeholder was shown literally.
        raise ValueError(
            "Argument {0} not recognised -- use 'brlg' or 'logrr' instead"
            .format(rule_algorithm))

    # Turn to DF (rules use '&' as the clause separator downstream).
    list_rules_inliers = [x.replace("AND", "&") for x in list_rules_inliers]
    list_rules_outliers = [x.replace("AND", "&") for x in list_rules_outliers]
    df_inliers = turn_rules_to_df(list_rules=list_rules_inliers,
                                  list_cols=feature_cols)
    df_outliers = turn_rules_to_df(list_rules=list_rules_outliers,
                                   list_cols=feature_cols)

    # Get rule size (number of clauses per rule).
    df_inliers = df_inliers.reset_index(drop=True)
    df_inliers["size_rules"] = [len(x.split("&")) for x in list_rules_inliers]
    df_outliers = df_outliers.reset_index(drop=True)
    df_outliers["size_rules"] = [
        len(x.split("&")) for x in list_rules_outliers
    ]

    # Prune rules
    if simplify_rules:
        if len(df_inliers) > 0:
            df_rules_inliers = _prune(df_inliers)
            df_rules_inliers["rule_prediction"] = 1
        else:
            df_rules_inliers = pd.DataFrame()
        if len(df_outliers) > 0:
            df_rules_outliers = _prune(df_outliers)
            df_rules_outliers["rule_prediction"] = -1
        else:
            df_rules_outliers = pd.DataFrame()
    else:
        df_rules_inliers = df_inliers
        df_rules_inliers["rule_prediction"] = 1
        df_rules_outliers = df_outliers
        df_rules_outliers["rule_prediction"] = -1

    return df_rules_inliers, df_rules_outliers
def fbt_vs_fb(X,
              y,
              categorical=[],
              iterations=30,
              treeNum=1,
              treeDepth=4,
              numThresh=9,
              filename=None):
    """Benchmark FeatureBinarizerFromTrees ('fbt') against FeatureBinarizer ('fb').

    Runs `iterations` stratified train/test splits; for each split, both
    binarizers feed both a BRCG and a logistic rule regression (GLRM)
    explainer. Fit time, classification metrics, rule-complexity counts,
    and the stringified explanation are collected into one DataFrame.

    Args:
        X: feature data accepted by the binarizers and train_test_split
            (presumably a pd.DataFrame — TODO confirm against callers).
        y: binary labels for X.
        categorical: column names passed as `colCateg` to both binarizers.
            NOTE(review): mutable default argument — shared across calls;
            safe only if never mutated downstream.
        iterations: number of random train/test splits to run.
        treeNum: FeatureBinarizerFromTrees `treeNum` parameter.
        treeDepth: FeatureBinarizerFromTrees `treeDepth` parameter.
        numThresh: FeatureBinarizer `numThresh` parameter.
        filename: if not None, pickle the result DataFrame to this path.

    Returns:
        DataFrame indexed by (explainer, binarizer, iteration) with columns
        time/accuracy/precision/recall/f1/rules/clauses/explanation.
    """

    def fit_transform(transformer, args_train, args_test):
        # Both binarizers are constructed with returnOrd=True below, so
        # fit_transform/transform return (binarized, standardized) pairs.
        X_train_fb, X_train_std_fb = transformer.fit_transform(*args_train)
        X_test_fb, X_test_std_fb = transformer.transform(*args_test)
        return X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb

    def fit_score(explainer, y_test, args_train, args_test):
        # Fit the explainer and return one result row:
        # (fit time, accuracy, precision, recall, f1, rules, clauses,
        #  stringified explanation). Only the fit is timed.
        t = time()
        explainer.fit(*args_train)
        t = time() - t
        y_pred = explainer.predict(*args_test)
        if isinstance(explainer, BRCGExplainer):
            # Keep only the columns of z whose weight exceeds 0.5; the
            # counts below derive rule/clause complexity from that matrix.
            z: DataFrame = explainer._model.z.loc[:, explainer._model.w > 0.5]
            rules = z.shape[1]
            clauses = z.any(axis=1).sum()
        else:
            z: DataFrame = explainer._model.z
            rules = z.any(axis=1).sum()
            clauses = z.any(axis=1).sum() + 1  # +1 for intercept
        return (t, accuracy_score(y_test, y_pred),
                precision_score(y_test, y_pred), recall_score(y_test, y_pred),
                f1_score(y_test, y_pred), rules, clauses,
                str(explainer.explain()))

    # Float metric columns; 'explanation' (added below) holds strings.
    columns = [
        'time', 'accuracy', 'precision', 'recall', 'f1', 'rules', 'clauses'
    ]
    # One row per (explainer, binarizer, iteration) combination.
    index = pd.MultiIndex.from_product(
        (['brcg', 'logrr'], ['fb', 'fbt'], range(iterations)))
    d = DataFrame(np.zeros((iterations * 4, len(columns))),
                  dtype=float,
                  index=index,
                  columns=columns)
    d['explanation'] = ''
    fb = FeatureBinarizer(colCateg=categorical,
                          negations=True,
                          returnOrd=True,
                          numThresh=numThresh)
    fbt = FeatureBinarizerFromTrees(colCateg=categorical,
                                    treeNum=treeNum,
                                    treeDepth=treeDepth,
                                    returnOrd=True)
    brcg = BRCGExplainer(BooleanRuleCG(silent=True))
    logrr = GLRMExplainer(
        LogisticRuleRegression(lambda0=0.005,
                               lambda1=0.001,
                               useOrd=True,
                               maxSolverIter=1000))
    # NOTE(review): the same explainer instances are re-fit on every
    # iteration; this assumes fit() fully resets their state — confirm.
    for i in range(iterations):
        # Train/Test split (random_state=i makes each split reproducible)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=i)
        # FeatureBinarizer: fit on the features only
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fb, (X_train, ), (X_test, ))
        d.loc[('brcg', 'fb', i)] = fit_score(brcg, y_test,
                                             (X_train_fb, y_train),
                                             (X_test_fb, ))
        d.loc[('logrr', 'fb', i)] = fit_score(logrr, y_test,
                                              (X_train_fb, y_train,
                                               X_train_std_fb),
                                              (X_test_fb, X_test_std_fb))
        # FeatureBinarizerFromTrees: fit with the labels as well
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fbt, (X_train, y_train), (X_test, ))
        d.loc[('brcg', 'fbt', i)] = fit_score(brcg, y_test,
                                              (X_train_fb, y_train),
                                              (X_test_fb, ))
        d.loc[('logrr', 'fbt', i)] = fit_score(logrr, y_test,
                                               (X_train_fb, y_train,
                                                X_train_std_fb),
                                               (X_test_fb, X_test_std_fb))
    # Optionally persist the raw results for later analysis.
    if filename is not None:
        with open(filename, 'wb') as fl:
            pickle.dump(d, fl)
    return d