Example #1
	def train_r_0(self):
		"""
		Trains the rule model that outputs rules that predict
		for class 0. 
		"""

		# the rules generated predict for label 0. 
		br_0 = BooleanRuleCG(CNF=False)
		br_0.fit(self.binarized_train_data, self.train_labels)
		self.r_0 = br_0
		return
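A minimal usage sketch for the method above (hypothetical: `clf` stands for an instance of the enclosing class with `binarized_train_data` and `train_labels` already set); `explain()["rules"]` is the same accessor used in Example #5 below:

clf.train_r_0()
for rule in clf.r_0.explain()["rules"]:  # DNF clauses that predict class 0
    print("IF", rule, "THEN class 0")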
Example #2
    def test_classification(self):
        bc_df = pd.DataFrame(self.bc.data, columns=self.bc.feature_names)
        X_train, X_test, Y_train, Y_test = train_test_split(bc_df, self.bc.target, test_size=0.2, random_state=31)
        fb = FeatureBinarizer(negations=True)
        X_train_fb = fb.fit_transform(X_train)
        X_test_fb = fb.transform(X_test)

        self.assertEqual(len(X_train_fb.columns), 540)
        self.assertEqual(len(X_test_fb.columns), 540)

        boolean_model = BooleanRuleCG(silent=True)
        explainer = BRCGExplainer(boolean_model)
        explainer.fit(X_train_fb, Y_train)
        Y_pred = explainer.predict(X_test_fb)

        self.assertGreater(accuracy_score(Y_test, Y_pred), 0.9)
        self.assertGreater(precision_score(Y_test, Y_pred), 0.9)
        self.assertGreater(recall_score(Y_test, Y_pred), 0.9)
        self.assertGreater(f1_score(Y_test, Y_pred), 0.9)

        explanation = explainer.explain()
        self.assertEqual(explanation['rules'], [
          'compactness error > 0.01 AND worst concavity <= 0.22 AND worst symmetry <= 0.28',
          'mean texture <= 15.46 AND mean concavity <= 0.15 AND area error <= 54.16',
          'fractal dimension error > 0.00 AND worst area <= 680.60 AND worst concave points <= 0.18',
          'mean concave points <= 0.05 AND perimeter error <= 3.80 AND worst area <= 930.88 AND worst smoothness <= 0.16'
        ])
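The test above exercises the full BRCG pipeline: binarize the features, fit the rule model, predict, and read back the DNF rules. A standalone sketch of the same flow (import paths as published by the aix360 package; the exact rules and the 540-column count are dataset- and version-specific):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from aix360.algorithms.rbm import BooleanRuleCG, BRCGExplainer, FeatureBinarizer

bc = load_breast_cancer()
X = pd.DataFrame(bc.data, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    X, bc.target, test_size=0.2, random_state=31)

fb = FeatureBinarizer(negations=True)
X_train_fb = fb.fit_transform(X_train)
X_test_fb = fb.transform(X_test)

explainer = BRCGExplainer(BooleanRuleCG(silent=True))
explainer.fit(X_train_fb, y_train)
y_pred = explainer.predict(X_test_fb)
print(explainer.explain()['rules'])  # list of rule strings, as asserted above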
Example #3
    def __init__(self, explainer, X, model=None, y=None, regressor_params={}):
        """
        Constructor. For a description of the missing arguments,
        please refer to the AnteHocInterpreter.

        Args:
            - explainer (str): name of the explainer to use.
            - X (np.ndarray or pd.DataFrame): data to explain.
            - model (depiction.models.base.BaseModel): a model to interpret.
                Defaults to None, which selects ante-hoc mode.
            - y (np.ndarray): binary labels for X.
                Defaults to None, which selects post-hoc mode.
            - regressor_params (dict): parameters for the regressor.
        """
        # ante-hoc trains the rule model directly on labels; post-hoc
        # approximates a fitted model. With neither labels nor a model,
        # no usage mode is possible.
        is_post_hoc = y is None
        is_ante_hoc = model is None
        if is_ante_hoc and is_post_hoc:
            raise RuntimeError(
                'Make sure you pass a model (post-hoc) or labels (ante-hoc)')
        if model is None:
            super(RuleAIX360,
                  self).__init__(AnteHocInterpreter.UsageMode.ANTE_HOC,
                                 task_type=Task.BINARY,
                                 data_type=DataType.TABULAR)
        else:
            super(RuleAIX360,
                  self).__init__(AnteHocInterpreter.UsageMode.POST_HOC,
                                 model=model)

        if 'glrm' in explainer:
            regressor = explainer.split('_')[1]
            if regressor == 'logistic':
                self.regressor = LogisticRuleRegression(**regressor_params)
            elif regressor == 'linear':
                self.regressor = LinearRuleRegression(**regressor_params)
            else:
                raise ValueError(
                    "Regressor '{}' not supported! Available regressors: {}".
                    format(regressor, self._AVAILABLE_RULE_REGRESSORS))
            self.explainer = GLRMExplainer(self.regressor)
        elif explainer == 'brcg':
            self.regressor = BooleanRuleCG(**regressor_params)
            self.explainer = BRCGExplainer(self.regressor)
        else:
            raise ValueError(
                "Interpreter '{}' not supported! Available interpreters: {}".
                format(explainer, self.AVAILABLE_INTERPRETERS))

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        self.X = X
        self.y = y
        self.binarizer = FeatureBinarizer(negations=True)
        self.X_binarized = self.binarizer.fit_transform(self.X)
        self._fitted = False
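A hypothetical construction sketch for the two usage modes handled by this constructor; X_train, y_train, and black_box are placeholders:

# ante-hoc: no model passed, labels provided; the rule model is trained on (X, y)
interpreter = RuleAIX360('glrm_logistic', X=X_train, y=y_train)

# post-hoc: a fitted model passed, no labels; the rules approximate the model
interpreter = RuleAIX360('brcg', X=X_train, model=black_box,
                         regressor_params={'CNF': False})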
Example #4
	def train_r_1(self):
		"""
		Trains the rule model that outputs rules that predict
		for class 1.
		"""

		# the rules generated predict for label 0, so we have to
		# invert the labels to generate rules that predict
		# for label 1.
		br_1 = BooleanRuleCG(CNF=False)
		inverted_train_labels = []
		for label in self.train_labels:
			if label:
				inverted_train_labels.append(0)
			else:
				inverted_train_labels.append(1)

		br_1.fit(self.binarized_train_data, np.array(inverted_train_labels))
		self.r_1 = br_1
		return
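For strictly 0/1 labels, the inversion loop can be collapsed into a single vectorized expression (a sketch, equivalent in that case):

inverted_train_labels = 1 - np.asarray(self.train_labels, dtype=int)
br_1.fit(self.binarized_train_data, inverted_train_labels)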
Example #5
import numpy as np
import pandas as pd
# import path below follows the aix360 package layout;
# turn_rules_to_df and simplifyRules are project-local helpers (not shown here)
from aix360.algorithms.rbm import (BooleanRuleCG, FeatureBinarizer,
                                   LogisticRuleRegression)


def aix360_rules_wrapper(
    df_anomalies,
    numerical_cols,
    categorical_cols,
    rule_algorithm="",
    simplify_rules=False,
    model_params=None,
):
    """
    Rules obtained using brlg or logrr.

    Parameters
    ----------
    df_anomalies : TYPE
        DESCRIPTION.
    numerical_cols : TYPE
        DESCRIPTION.
    categorical_cols : TYPE
        DESCRIPTION.
    rule_algorithm : TYPE, optional
        DESCRIPTION. The default is "".
    simplify_rules : TYPE, optional
        DESCRIPTION. The default is False.
    model_params : TYPE, optional
        DESCRIPTION. The default is {}.

    Raises
    ------
    ValueError
        DESCRIPTION.

    Returns
    -------
    df_rules_inliers : TYPE
        DESCRIPTION.
    df_rules_outliers : TYPE
        DESCRIPTION.

    """

    # Defensive copy: defaults are filled in below without mutating the caller's dict
    model_params = dict(model_params or {})

    # Define variables
    feature_cols = numerical_cols + categorical_cols
    X = df_anomalies[feature_cols].astype(float)
    y = df_anomalies["predictions"].astype(int)
    y_inliers = np.array([x if x > 0 else 0
                          for x in y])  # keep positive (inlier) labels, else 0
    y_outliers = np.array([1 if x < 0 else 0
                           for x in y])  # 1 for outliers, else 0

    # Binarize features (keyword names per aix360's FeatureBinarizer,
    # as in Example #6)
    fb = FeatureBinarizer(negations=True,
                          returnOrd=True,
                          colCateg=categorical_cols,
                          numThresh=90)
    X_fb, X_std = fb.fit_transform(X)

    # Choose model
    if rule_algorithm == "brlg":

        # Default params
        model_params.setdefault("lambda0", 1e-3)
        model_params.setdefault("lambda1", 1e-3)
        model_params.setdefault("CNF", False)
        # Inliers
        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_inliers)
        list_rules_inliers = model_rules.explain()["rules"]

        # Outliers
        model_rules = BooleanRuleCG(**model_params)
        model_rules.fit(X_fb, y_outliers)
        list_rules_outliers = model_rules.explain()["rules"]
    elif rule_algorithm == "logrr":

        # Default params
        model_params.setdefault("lambda0", 0.005)
        model_params.setdefault("lambda1", 0.001)
        # Fit once on the inlier indicator; inlier and outlier rules are
        # split by coefficient sign below
        model_rules = LogisticRuleRegression(**model_params)
        model_rules.fit(X_fb, y_inliers, X_std)
        df_rules = model_rules.explain()

        try:
            # Inliers
            df_rules_inliers = df_rules[
                (df_rules["coefficient"] > 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_inliers = list(
                df_rules_inliers["rule/numerical feature"])

            # Outliers
            df_rules_outliers = df_rules[
                (df_rules["coefficient"] < 0)
                & (df_rules["rule/numerical feature"] != "(intercept)")]
            list_rules_outliers = list(
                df_rules_outliers["rule/numerical feature"])
        except KeyError:
            # Inliers
            df_rules_inliers = df_rules[(df_rules["coefficient"] > 0)
                                        & (df_rules["rule"] != "(intercept)")]
            list_rules_inliers = list(df_rules_inliers["rule"])

            # Outliers
            df_rules_outliers = df_rules[(df_rules["coefficient"] < 0)
                                         & (df_rules["rule"] != "(intercept)")]
            list_rules_outliers = list(df_rules_outliers["rule"])
    else:
        raise ValueError(
            "Argument '{0}' not recognised -- use 'brlg' or 'logrr' instead".
            format(rule_algorithm))
    # Turn to DF
    list_rules_inliers = [x.replace("AND", "&") for x in list_rules_inliers]
    list_rules_outliers = [x.replace("AND", "&") for x in list_rules_outliers]
    df_inliers = turn_rules_to_df(list_rules=list_rules_inliers,
                                  list_cols=feature_cols)
    df_outliers = turn_rules_to_df(list_rules=list_rules_outliers,
                                   list_cols=feature_cols)

    # Get rule size
    df_inliers = df_inliers.reset_index(drop=True)
    df_inliers["size_rules"] = [len(x.split("&")) for x in list_rules_inliers]
    df_outliers = df_outliers.reset_index(drop=True)
    df_outliers["size_rules"] = [
        len(x.split("&")) for x in list_rules_outliers
    ]

    # Prune rules
    if simplify_rules:
        if len(df_inliers) > 0:
            df_rules_pruned = simplifyRules(
                df_inliers.drop(columns=["size_rules"]), categorical_cols)
            df_rules_pruned = df_rules_pruned.reset_index().merge(
                df_inliers.reset_index()[["index", "size_rules"]], how="left")
            df_rules_pruned.index = df_rules_pruned["index"]
            df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                   errors="ignore")
            df_rules_inliers = df_rules_pruned.copy()
            df_rules_inliers["rule_prediction"] = 1
        else:
            df_rules_inliers = pd.DataFrame()
        if len(df_outliers) > 0:
            df_rules_pruned = simplifyRules(
                df_outliers.drop(columns=["size_rules"]), categorical_cols)
            df_rules_pruned = df_rules_pruned.reset_index().merge(
                df_outliers.reset_index()[["index", "size_rules"]], how="left")
            df_rules_pruned.index = df_rules_pruned["index"]
            df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                   errors="ignore")
            df_rules_outliers = df_rules_pruned.copy()
            df_rules_outliers["rule_prediction"] = -1
        else:
            df_rules_outliers = pd.DataFrame()
    else:
        df_rules_inliers = df_inliers
        df_rules_inliers["rule_prediction"] = 1
        df_rules_outliers = df_outliers
        df_rules_outliers["rule_prediction"] = -1
    return df_rules_inliers, df_rules_outliers
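A hypothetical end-to-end call of the wrapper above. The "predictions" column follows the scikit-learn outlier convention (1 for inliers, -1 for outliers); turn_rules_to_df must be importable from the surrounding project:

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(500, 2)), columns=["f0", "f1"])
df["predictions"] = IsolationForest(random_state=0).fit_predict(df[["f0", "f1"]])

df_rules_in, df_rules_out = aix360_rules_wrapper(
    df_anomalies=df,
    numerical_cols=["f0", "f1"],
    categorical_cols=[],
    rule_algorithm="brlg",   # or "logrr"
    simplify_rules=False,    # skips the project-local simplifyRules step
)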
Example #6
import pickle
from time import time

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split
# import paths below follow the aix360 package layout
from aix360.algorithms.rbm import (BooleanRuleCG, BRCGExplainer,
                                   FeatureBinarizer, FeatureBinarizerFromTrees,
                                   GLRMExplainer, LogisticRuleRegression)


def fbt_vs_fb(X,
              y,
              categorical=[],
              iterations=30,
              treeNum=1,
              treeDepth=4,
              numThresh=9,
              filename=None):
    def fit_transform(transformer, args_train, args_test):
        X_train_fb, X_train_std_fb = transformer.fit_transform(*args_train)
        X_test_fb, X_test_std_fb = transformer.transform(*args_test)
        return X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb

    def fit_score(explainer, y_test, args_train, args_test):
        t = time()
        explainer.fit(*args_train)
        t = time() - t
        y_pred = explainer.predict(*args_test)
        if isinstance(explainer, BRCGExplainer):
            z: DataFrame = explainer._model.z.loc[:, explainer._model.w > 0.5]
            rules = z.shape[1]
            clauses = z.any(axis=1).sum()
        else:
            z: DataFrame = explainer._model.z
            rules = z.any(axis=1).sum()
            clauses = z.any(axis=1).sum() + 1  # +1 for intercept
        return (t, accuracy_score(y_test, y_pred),
                precision_score(y_test, y_pred), recall_score(y_test, y_pred),
                f1_score(y_test,
                         y_pred), rules, clauses, str(explainer.explain()))

    columns = [
        'time', 'accuracy', 'precision', 'recall', 'f1', 'rules', 'clauses'
    ]
    index = pd.MultiIndex.from_product(
        (['brcg', 'logrr'], ['fb', 'fbt'], range(iterations)))
    d = DataFrame(np.zeros((iterations * 4, len(columns))),
                  dtype=float,
                  index=index,
                  columns=columns)
    d['explanation'] = ''

    fb = FeatureBinarizer(colCateg=categorical,
                          negations=True,
                          returnOrd=True,
                          numThresh=numThresh)
    fbt = FeatureBinarizerFromTrees(colCateg=categorical,
                                    treeNum=treeNum,
                                    treeDepth=treeDepth,
                                    returnOrd=True)

    brcg = BRCGExplainer(BooleanRuleCG(silent=True))
    logrr = GLRMExplainer(
        LogisticRuleRegression(lambda0=0.005,
                               lambda1=0.001,
                               useOrd=True,
                               maxSolverIter=1000))

    for i in range(iterations):

        # Train/Test split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            random_state=i)

        # FeatureBinarizer
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fb, (X_train, ), (X_test, ))
        d.loc[('brcg', 'fb', i)] = fit_score(brcg, y_test,
                                             (X_train_fb, y_train),
                                             (X_test_fb, ))
        d.loc[('logrr', 'fb',
               i)] = fit_score(logrr, y_test,
                               (X_train_fb, y_train, X_train_std_fb),
                               (X_test_fb, X_test_std_fb))

        # FeatureBinarizerFromTrees
        X_train_fb, X_train_std_fb, X_test_fb, X_test_std_fb = fit_transform(
            fbt, (X_train, y_train), (X_test, ))
        d.loc[('brcg', 'fbt', i)] = fit_score(brcg, y_test,
                                              (X_train_fb, y_train),
                                              (X_test_fb, ))
        d.loc[('logrr', 'fbt',
               i)] = fit_score(logrr, y_test,
                               (X_train_fb, y_train, X_train_std_fb),
                               (X_test_fb, X_test_std_fb))

    if filename is not None:
        with open(filename, 'wb') as fl:
            pickle.dump(d, fl)

    return d
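A short usage sketch for the benchmark above, reusing the breast-cancer data from Example #2; per-iteration metrics are then averaged over the (explainer, binarizer) pairs of the MultiIndex:

import pandas as pd
from sklearn.datasets import load_breast_cancer

bc = load_breast_cancer()
X = pd.DataFrame(bc.data, columns=bc.feature_names)
d = fbt_vs_fb(X, bc.target, iterations=5)

# mean of each numeric metric per (explainer, binarizer) pair
print(d.groupby(level=[0, 1]).mean(numeric_only=True))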