Example #1
from typing import List

import dill
import mlflow
import numpy as np
from alibi.explainers import AnchorTabular
from sklearn.linear_model import LogisticRegression


def train_explainer(regressor: LogisticRegression, feature_names: List[str], X_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray):
    predict_fn = lambda x: regressor.predict(x)

    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(X_train)

    file_path=""
    with open("explainer.dill", "wb") as file:
        dill.dump(explainer, file)
        file_path = file.name

    mlflow.log_artifact("explainer.dill", "model")

    print(np.where(y_test == 1)[0])
    probe = np.array([40.316667556762695, 0.5605325219195545, 0.350, 0, 3, 1, 5], dtype=float)
    #probe = np.array(X_test[700], dtype=float)
    explanation = explainer.explain(probe)

    print('Anchor: %s' % (' AND '.join(explanation['names'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])
    print(explanation)
    return explainer

# kedro install
# kedro run
# kedro viz
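A minimal way to drive the function above might look like the following sketch; the synthetic dataset, the LogisticRegression fit and the MLflow run are assumptions added for illustration, not part of the original example.

# Hypothetical usage sketch (dataset, model fit and MLflow run are illustrative assumptions)
import mlflow
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=7, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
feature_names = ["f%d" % i for i in range(X.shape[1])]

with mlflow.start_run():
    explainer = train_explainer(clf, feature_names, X_train, X_test, y_test)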
Example #2
class Anchors(FeatureImportance):
    """
    Feature importance method by [RIB]_.

    References
    ----------
    .. [RIB] Ribeiro et al., "Anchors: High-precision model-agnostic explanations",
       Proceedings of the AAAI Conference on Artificial Intelligence, Volume 32, 2018.

    """

    def __init__(self, model: Any, seed: int = SEED):
        super().__init__(seed=seed)
        self._model = assign_model(model=model)
        self._explainer = None

    def fit(self, X: Any) -> None:
        self._explainer = AnchorTabular(
            predictor=self._model.predict_proba,
            feature_names=list(range(X.shape[1])), seed=self._seed)
        self._explainer.fit(train_data=X)
        # disc_perc=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9))
        # disc_perc=(0.1, 0.3, 0.5, 0.7, 0.9))
        # disc_perc=(0.2, 0.4, 0.6, 0.8))

    def _compute_anchors_per_sample(self, X: np.ndarray, idx: int) -> List:
        result = self._explainer.explain(X=X[idx, :])
        return result.data['raw']['feature']

    @staticmethod
    def _calculate_importance(anchors: List, output_shape: Tuple) -> np.ndarray:
        importance = np.zeros(shape=output_shape)
        for k, anchor in enumerate(anchors):
            if isinstance(anchor, list):
                importance[k, anchor] = 1
            else:
                importance[anchor] = 1
        return importance

    def _compute_anchors(self, X: np.ndarray, num_jobs: int) -> List:
        return Parallel(n_jobs=num_jobs)(
            delayed(self._compute_anchors_per_sample)(X, sample_idx)
            for sample_idx in range(X.shape[0]))

    def explain(self, X: np.ndarray, sample_idx: int) -> np.ndarray:
        anchors = self._compute_anchors_per_sample(X=X, idx=sample_idx)
        return self._calculate_importance(anchors=anchors, output_shape=(X.shape[1],))

    def explain_batch(self, X: np.ndarray, num_jobs: int = 2) -> np.ndarray:
        anchors = self._compute_anchors(X=X, num_jobs=num_jobs)
        return self._calculate_importance(anchors=anchors, output_shape=X.shape)
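A possible end-to-end exercise of the wrapper above, assuming that assign_model simply returns the fitted estimator unchanged and that SEED is an integer constant (neither is shown in this snippet):

# Hypothetical usage of the Anchors wrapper (assumes assign_model(clf) returns clf and SEED is an int)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=6, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

anchors = Anchors(model=clf)
anchors.fit(X)
single = anchors.explain(X, sample_idx=0)           # binary mask over the 6 features
batch = anchors.explain_batch(X[:5], num_jobs=1)    # one mask row per sample
print(single, batch.shape)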
Example #3
    def fit(self, x, y):

        self.dim = x.shape[1]

        # clf = sklearn.svm.SVC(kernel=self.kernel, probability=True)
        clf = RandomForestClassifier()
        clf.fit(x, y)

        y_pred = clf.predict(x)
        print("Clf model accuracy: [{:.4f}]".format(
            sklearn.metrics.accuracy_score(y, y_pred)))

        self.ano_idx = np.where(y == 1)[0]
        print(self.ano_idx.shape)

        n_f = x.shape[1]
        feature_names = ["A" + str(i) for i in range(n_f)]
        # use anchor
        predict_fn = lambda xx: clf.predict_proba(xx)
        explainer = AnchorTabular(predict_fn, feature_names)
        explainer.fit(x, disc_perc=(25, 50, 75))

        exp_sub_lst = []
        for i in tqdm(range(len(self.ano_idx))):
            ano = x[self.ano_idx[i]]
            explanation = explainer.explain(ano, threshold=0.95)
            anchor = explanation['anchor']
            f_sub = []
            for a in anchor:
                for item in a.split(" "):
                    if item.startswith("A"):
                        item = int(item[1:])
                        f_sub.append(item)
            # print(anchor, f_sub)
            if len(f_sub) == 0:
                f_sub = np.arange(n_f)
            exp_sub_lst.append(f_sub)

        return exp_sub_lst
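The inner loop above recovers feature indices from anchor conditions by relying on the synthetic "A<i>" feature names; a condensed, standalone illustration of the same extraction, with made-up anchor strings:

# Standalone illustration of the feature-index extraction used above (anchor strings are made up)
anchor = ["A3 > 0.25", "A0 <= 1.10"]
f_sub = [int(tok[1:]) for cond in anchor for tok in cond.split(" ") if tok.startswith("A")]
print(f_sub)  # [3, 0]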
Example #4
    def retrain_classifier_final(self, args, nn_model_ref):
        nn_model_ref.epochs = args.num_epch_2
        nn_model_ref.batch_size_2 = args.batch_size_2
        nn_model_ref.net.freeze()
        X_train_proba_feat, X_eval_proba_feat = nn_model_ref.all_intermediaire, nn_model_ref.all_intermediaire_val
        Y_train_proba = nn_model_ref.Y_train_nn_binaire
        Y_eval_proba = nn_model_ref.Y_val_nn_binaire
        print("START RETRAIN LINEAR NN GOHR ")
        print()
        """net_retrain, h = train_speck_distinguisher(args, X_train_proba_feat.shape[1], X_train_proba_feat,
                                                   Y_train_proba, X_eval_proba_feat, Y_eval_proba,
                                                   bs=args.batch_size_2,
                                                   epoch=args.num_epch_2, name_ici="retrain_nn_gohr",
                                                   wdir=self.path_save_model)"""

        from alibi.explainers import AnchorTabular
        #from alibi.explainers import AnchorImage
        from sklearn.ensemble import RandomForestClassifier

        clf = RandomForestClassifier(n_estimators=50)
        clf.fit(X_train_proba_feat, Y_train_proba)
        predict_fn = lambda x: clf.predict_proba(x)
        feature_names = [i for i in range(X_train_proba_feat.shape[1])]
        explainer = AnchorTabular(predict_fn, feature_names)
        idx = 0
        explainer.fit(X_train_proba_feat, disc_perc=(25,))  # disc_perc expects a sequence of percentiles
        print('Prediction: ',
              explainer.predictor(X_eval_proba_feat[idx].reshape(1, -1))[0])

        #print('Prediction: ', explainer.predict_fn(X_eval_proba_feat[idx].reshape(1, -1))[0])
        explanation = explainer.explain(X_eval_proba_feat[idx], threshold=0.8)
        print('Anchor: %s' % (' AND '.join(explanation['names'])))
        print('Precision: %.2f' % explanation['precision'])
        print('Coverage: %.2f' % explanation['coverage'])

        # NOTE: net_retrain is only defined if the commented-out
        # train_speck_distinguisher call above is restored.
        return net_retrain
Example #5
confusion_matrix(y_test, y_pred)
st.write('Confusion matrix:')
plot_confusion_matrix(clf, X_test, y_test)
st.pyplot()
# st.write(classification_report(y_test, y_pred))
predict_fn = lambda x: clf.predict_proba(x)
explainer = AnchorTabular(predict_fn, feature_names)
explainer.fit(X_train)
idx = st.sidebar.slider(label='Select an instance:',min_value=1,max_value=len(y_test))
st.write("""### Selected instance:""")
st.write(X_test_df.iloc[[idx-1]], height=150)
print(y_train_df.iloc[[idx-1]])
st.write('Prediction: ', class_names[explainer.predictor(X_test[idx-1].reshape(1, -1))[0]])
st.write("""### Prediction Explained:""")
with st.spinner('Calculating'):
    explanation = explainer.explain(X_test[idx-1], threshold=0.70)
    st.write('Anchor (instance explanation): %s' % (' AND '.join(explanation.anchor)))
    st.write('Precision: %.2f' % explanation.precision)
    st.write('Coverage: %.2f' % explanation.coverage)
# st.write("""### Trust score:""")
    ts = TrustScore(k_filter=10,
                    alpha=.05,
                    filter_type='distance_knn',
                    leaf_size=40,
                    metric='euclidean',
                    dist_filter_type='point')
    ts.fit(X_train, y_train, classes=len(class_names))
    score, closest_class = ts.score(X_test[idx-1].reshape(1, -1),
                                    y_pred[idx-1], k=2,  # kth nearest neighbor used
                                                         # to compute distances for each class
                                    dist_type='point')   # 'point' or 'mean' distance option
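The trust score and closest class returned by ts.score are computed above but never displayed; an assumed continuation that surfaces them in the Streamlit app (wording and the use of class_names are illustrative):

    # Assumed continuation: show the TrustScore output computed above
    st.write("""### Trust score:""")
    st.write('Trust score: %.2f' % score[0])
    st.write('Closest other class: %s' % class_names[int(closest_class[0])])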
Example #6
    def anchors_connector(self, *arg):
        query_instance = dict(s.split(':') for s in arg)

        #anchor instance to model instance. Input: Numpy. Output: Pandas df. Turns numbers into categories.
        def adapter(n):
            d = pd.DataFrame(data=n, columns=self.featureNames)
            categories = self.getCategoricalFeatures()
            for c in categories:
                d[c] = d[c].map(self.dictionary[c]["values"])
            #d['Sex'] = d['Sex'].map({0:'Male', 1: 'Female'})
            #d['Embarked'] = d['Embarked'].map({0: 'Southampton', 1: 'Cherbourg', 2: 'Queenstown'})
            #d['Pclass'] = d['Pclass'].map({0: 'First', 1: 'Second', 2: 'Third'})
            return d

        #model instance to anchor instance. Input: Pandas df. Output: Numpy. Turns categories into numbers.
        def reverse_adapter(p):
            d = p.copy()
            categories = self.getCategoricalFeatures()
            for c in categories:
                d[c] = d[c].map(
                    {v: k
                     for k, v in self.dictionary[c]["values"].items()})
            #d['Sex'] = d['Sex'].map({'Male': 0, 'Female': 1})
            #d['Embarked'] = d['Embarked'].map({'Southampton': 0, 'Cherbourg': 1, 'Queenstown': 2})
            #d['Pclass'] = d['Pclass'].map({'First': 0, 'Second': 1, 'Third': 2})
            n = d.to_numpy().astype(float)  # np.float is removed in recent NumPy releases
            return n

        predict_fn = lambda x: self.model.predict(adapter(x))

        #create the category map
        categories = self.getCategoricalFeatures()
        category_map = {}
        for i in range(len(self.featureNames)):
            if self.featureNames[i] in categories:
                category_map[i] = [
                    str(k) for k in list(self.dictionary[self.featureNames[i]]
                                         ["values"].values())
                ]
        #category_map = {0: ['First', 'Second', 'Third'], 1: ['Male','Female'], 4: ['Southampton', 'Cherbourg', 'Queenstown']}

        print("-------")
        print(query_instance)
        print(reverse_adapter(pd.DataFrame([query_instance])))

        #sort query_instance
        sorted_query_instance = {}
        for f in self.featureNames:
            sorted_query_instance[f] = query_instance[f]

        print(sorted_query_instance)
        print(reverse_adapter(pd.DataFrame([sorted_query_instance])))

        explainer = AnchorTabular(predict_fn,
                                  feature_names=self.featureNames,
                                  categorical_names=category_map)
        anchor_training = reverse_adapter(self.X_train)
        explainer.fit(anchor_training, disc_perc=[25, 50, 75])
        explanation = explainer.explain(
            reverse_adapter(pd.DataFrame([sorted_query_instance])),
            threshold=0.90,
            max_anchor_size=3,
            batch_size=2000)
        print('Anchor: %s' % (' AND '.join(explanation['data']['anchor'])))
        print('Precision: %.2f' % explanation['precision'])
        print('Coverage: %.2f' % explanation['coverage'])

        #build rule
        rule = ""
        names = explanation['data']['anchor']
        precision = np.asarray(explanation['raw']['precision'])
        precision[1:] -= precision[:-1].copy()
        precision = [round(elem, 2) for elem in precision.tolist()]
        for i in range(0, len(names)):
            rule = rule + names[i]
            importance = round(precision[i] / sum(precision) * 100, 2)

            rule = rule + " (" + str(importance) + "%)"
            if (i < len(names) - 1):
                rule = rule + " AND "

        self.explanation = 'I generated the following rule for you. It describes the boundaries under which the current prediction remains stable: <br> <br> <big>' + rule + '</big>. <br> <br> Each rule condition has an importance score which shows how critical the condition is for the prediction outcome to stay stable.'
        self.certainty = ('I tested the rule on many sample data instances. The rule applies to %.2f'
                          % explanation['coverage'] + ' of the instances. In these cases, it was accurate for %.2f'
                          % explanation['precision'] + ' of the cases.')
        return True
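The precision[1:] -= precision[:-1].copy() step above turns the cumulative precision reported for each anchor prefix into the marginal gain of each added condition, which is then normalised into the percentage appended to each rule part; a tiny worked example with made-up numbers:

# Worked example of the marginal-precision computation above (numbers are made up)
import numpy as np

precision = np.asarray([0.70, 0.85, 0.97])  # cumulative precision after 1, 2, 3 conditions
precision[1:] -= precision[:-1].copy()      # marginal gains: [0.70, 0.15, 0.12]
importance = [round(p / precision.sum() * 100, 2) for p in precision]
print(importance)                           # approximately [72.16, 15.46, 12.37]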