def train_explainer(regressor: LogisticRegression, feature_names: List[str],
                    X_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray):
    """Fit an anchor explainer for ``regressor``, persist it and log it to MLflow.

    The explainer is fitted on ``X_train``, serialised with ``dill`` to
    ``explainer.dill`` in the current working directory and logged through
    ``mlflow.log_artifact`` with ``"model"`` as the artifact path. As a smoke
    check, a single hard-coded probe instance is explained and its anchor,
    precision and coverage are printed.

    Parameters
    ----------
    regressor:
        Fitted model; its ``predict`` method is used as the prediction function.
    feature_names:
        Names passed to ``AnchorTabular``.
    X_train:
        Data the explainer is fitted on.
    X_test:
        Currently unused (only referenced by a commented-out probe below).
    y_test:
        Labels; only used to print the indices where ``y_test == 1``.

    Returns
    -------
    The fitted ``AnchorTabular`` explainer.
    """
    # Single source of truth for the artifact name, so the file that is
    # written and the file that is logged cannot drift apart.
    artifact_path = "explainer.dill"

    def predict_fn(x):
        return regressor.predict(x)

    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(X_train)

    with open(artifact_path, "wb") as file:
        dill.dump(explainer, file)
    mlflow.log_artifact(artifact_path, "model")

    print(np.where(y_test == 1)[0])

    # NOTE(review): the probe is hard-coded with 7 values; it presumably has to
    # match the number of columns in X_train - confirm.
    probe = np.array([40.316667556762695, 0.5605325219195545, 0.350, 0, 3, 1, 5], dtype=float)
    #probe = np.array(X_test[700], dtype=float)
    explanation = explainer.explain(probe)
    print('Anchor: %s' % (' AND '.join(explanation['names'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])
    print(explanation)
    return explainer

# kedro install
# kedro run
# kedro viz
class Anchors(FeatureImportance):
    """
    Binary feature-importance masks derived from anchor explanations [RIB]_.

    A feature receives importance 1 when it is part of the anchor found for a
    sample and 0 otherwise.

    References
    ----------
    .. [RIB] Ribeiro, et al, "Anchors: High-precision model-agnostic explanations",
       Proceedings of the AAAI Conference on Artificial Intelligence, Volume 32, 2018.
    """

    def __init__(self, model: Any, seed: int = SEED):
        super().__init__(seed=seed)
        self._model = assign_model(model=model)
        # Created lazily by fit(); stays None until then.
        self._explainer = None

    def fit(self, X: Any) -> None:
        """Build the tabular anchor explainer and fit it on ``X``.

        Features are named by their column index.
        """
        column_ids = list(range(X.shape[1]))
        self._explainer = AnchorTabular(
            predictor=self._model.predict_proba,
            feature_names=column_ids,
            seed=self._seed,
        )
        self._explainer.fit(train_data=X)
        # Alternative discretisation settings left by the author (not active):
        # disc_perc=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9))
        # disc_perc=(0.1, 0.3, 0.5, 0.7, 0.9))
        # disc_perc=(0.2, 0.4, 0.6, 0.8))

    def _compute_anchors_per_sample(self, X: np.ndarray, idx: int) -> List:
        """Return the raw anchor feature entries for row ``idx`` of ``X``."""
        explanation = self._explainer.explain(X=X[idx, :])
        raw = explanation.data['raw']
        return raw['feature']

    @staticmethod
    def _calculate_importance(anchors: List, output_shape: Tuple) -> np.ndarray:
        """Turn anchor entries into a 0/1 array of shape ``output_shape``.

        A list entry at position ``k`` marks those columns in row ``k``; any
        other entry is used directly as an index into the array.
        """
        mask = np.zeros(shape=output_shape)
        for row, entry in enumerate(anchors):
            target = (row, entry) if isinstance(entry, list) else entry
            mask[target] = 1
        return mask

    def _compute_anchors(self, X: np.ndarray, num_jobs: int) -> List:
        """Compute the anchors of every row of ``X`` using ``num_jobs`` workers."""
        worker = delayed(self._compute_anchors_per_sample)
        tasks = (worker(X, row) for row in range(X.shape[0]))
        return Parallel(n_jobs=num_jobs)(tasks)

    def explain(self, X: np.ndarray, sample_idx: int) -> np.ndarray:
        """Return the importance vector (length ``X.shape[1]``) of one sample."""
        sample_anchor = self._compute_anchors_per_sample(X=X, idx=sample_idx)
        n_features = X.shape[1]
        return self._calculate_importance(anchors=sample_anchor,
                                          output_shape=(n_features,))

    def explain_batch(self, X: np.ndarray, num_jobs: int = 2) -> np.ndarray:
        """Return an importance array with the same shape as ``X``."""
        batch_anchors = self._compute_anchors(X=X, num_jobs=num_jobs)
        return self._calculate_importance(anchors=batch_anchors,
                                          output_shape=X.shape)
def fit(self, x, y):
    """Explain every sample labelled 1 with an anchor and collect its features.

    A random forest is trained on ``(x, y)`` and its training accuracy is
    printed. For each row whose label equals 1, an anchor explanation is
    computed (threshold 0.95) and the feature indices mentioned in the anchor
    are extracted. When an anchor mentions no feature, all feature indices are
    used instead.

    Sets ``self.dim`` (number of columns of ``x``) and ``self.ano_idx``
    (indices where ``y == 1``).

    Returns a list with one entry per labelled-1 sample: a list of feature
    indices, or ``np.arange(n_features)`` as the fallback.
    """
    n_f = x.shape[1]
    self.dim = n_f

    # clf = sklearn.svm.SVC(kernel=self.kernel, probability=True)
    forest = RandomForestClassifier()
    forest.fit(x, y)
    train_accuracy = sklearn.metrics.accuracy_score(y, forest.predict(x))
    print("Clf model accuracy: [{:.4f}]".format(train_accuracy))

    self.ano_idx = np.where(y == 1)[0]
    print(self.ano_idx.shape)

    # Features are named "A<index>" so the index can be recovered from the
    # textual anchor predicates further down.
    feature_names = ["A" + str(col) for col in range(n_f)]

    # use anchor
    def predict_fn(xx):
        return forest.predict_proba(xx)

    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(x, disc_perc=(25, 50, 75))

    exp_sub_lst = []
    for row_id in tqdm(self.ano_idx):
        explanation = explainer.explain(x[row_id], threshold=0.95)
        # Every whitespace-separated token starting with "A" is a feature name.
        f_sub = [
            int(token[1:])
            for predicate in explanation['anchor']
            for token in predicate.split(" ")
            if token.startswith("A")
        ]
        if not f_sub:
            f_sub = np.arange(n_f)
        exp_sub_lst.append(f_sub)
    return exp_sub_lst
def retrain_classifier_final(self, args, nn_model_ref):
    """Fit a random forest on the network's intermediate features and explain it.

    Copies the second-stage epoch count and batch size from ``args`` onto
    ``nn_model_ref``, freezes ``nn_model_ref.net``, trains a 50-tree random
    forest on ``nn_model_ref.all_intermediaire`` / ``Y_train_nn_binaire`` and
    prints an anchor explanation (threshold 0.8) for the first evaluation
    sample of ``nn_model_ref.all_intermediaire_val``.

    NOTE(review): the final two statements reference ``ok`` and
    ``net_retrain``, neither of which is bound inside this function (the call
    that produced ``net_retrain`` is disabled). Unless they exist at module
    level the function ends with a ``NameError`` - confirm the intended
    return value.
    """
    nn_model_ref.epochs = args.num_epch_2
    nn_model_ref.batch_size_2 = args.batch_size_2
    nn_model_ref.net.freeze()
    X_train_proba_feat = nn_model_ref.all_intermediaire
    X_eval_proba_feat = nn_model_ref.all_intermediaire_val
    Y_train_proba = nn_model_ref.Y_train_nn_binaire
    # Only consumed by the disabled training call below.
    Y_eval_proba = nn_model_ref.Y_val_nn_binaire
    print("START RETRAIN LINEAR NN GOHR ")
    print()
    # Disabled training call (was kept in a no-op string literal):
    # net_retrain, h = train_speck_distinguisher(args, X_train_proba_feat.shape[1], X_train_proba_feat,
    #                                            Y_train_proba, X_eval_proba_feat, Y_eval_proba,
    #                                            bs=args.batch_size_2, epoch=args.num_epch_2,
    #                                            name_ici="retrain_nn_gohr", wdir=self.path_save_model)
    from alibi.explainers import AnchorTabular
    #from alibi.explainers import AnchorImage
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(n_estimators=50)
    clf.fit(X_train_proba_feat, Y_train_proba)

    def predict_fn(x):
        return clf.predict_proba(x)

    feature_names = list(range(X_train_proba_feat.shape[1]))
    explainer = AnchorTabular(predict_fn, feature_names)
    idx = 0
    # `(25)` is just the int 25; disc_perc is meant to be a tuple of
    # percentiles, so spell the one-element tuple explicitly.
    explainer.fit(X_train_proba_feat, disc_perc=(25,))
    print('Prediction: ', explainer.predictor(X_eval_proba_feat[idx].reshape(1, -1))[0])
    #print('Prediction: ', explainer.predict_fn(X_eval_proba_feat[idx].reshape(1, -1))[0])
    explanation = explainer.explain(X_eval_proba_feat[idx], threshold=0.8)
    print('Anchor: %s' % (' AND '.join(explanation['names'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])
    # NOTE(review): `ok` is not defined in this function - looks like a
    # deliberate hard stop for debugging; confirm before removing.
    print(ok)
    # NOTE(review): `net_retrain` is only assigned by the disabled call above.
    return net_retrain
# Streamlit page fragment: shows the classifier's confusion matrix, lets the
# user pick a test instance in the sidebar, explains the prediction for that
# instance with an anchor and computes a trust score for it.
# Relies on names bound outside this fragment (clf, X_train, X_test, y_train,
# y_test, y_pred, X_test_df, y_train_df, feature_names, class_names).

# The return value of confusion_matrix is discarded here; the plot below is
# what gets rendered.
confusion_matrix(y_test, y_pred)
st.write('Confusion matrix:')
plot_confusion_matrix(clf, X_test, y_test)
st.pyplot()
# st.write(classification_report(y_test, y_pred))

# Anchor explainer built on the classifier's class probabilities.
predict_fn = lambda x: clf.predict_proba(x)
explainer = AnchorTabular(predict_fn, feature_names)
explainer.fit(X_train)

# The slider is 1-based (min_value=1), hence `idx-1` for every lookup below.
idx = st.sidebar.slider(label='Select an instance:',min_value=1,max_value=len(y_test))
st.write("""### Selected instance:""")
st.write(X_test_df.iloc[[idx-1]], height=150)
# NOTE(review): this prints a row of y_train_df although idx is chosen over
# the test set - confirm whether the test labels were intended.
print(y_train_df.iloc[[idx-1]])
st.write('Prediction: ', class_names[explainer.predictor(X_test[idx-1].reshape(1, -1))[0]])
st.write("""### Prediction Explained:""")
with st.spinner('Calculating'):
    explanation = explainer.explain(X_test[idx-1], threshold=0.70)
st.write('Anchor (instance explanation): %s' % (' AND '.join(explanation.anchor)))
st.write('Precision: %.2f' % explanation.precision)
st.write('Coverage: %.2f' % explanation.coverage)

# st.write("""### Trust score:""")
# Trust score for the selected instance; `score` and `closest_class` are not
# used within this fragment (presumably displayed further down - verify).
ts = TrustScore(k_filter=10, alpha=.05, filter_type='distance_knn', leaf_size=40, metric='euclidean', dist_filter_type='point')
ts.fit(X_train, y_train, classes=len(class_names))
score, closest_class = ts.score(X_test[idx-1].reshape(1,-1), y_pred[idx-1], k=2, # kth nearest neighbor used
                                # to compute distances for each class
                                dist_type='point') # 'point' or 'mean' distance option
def anchors_connector(self, *arg):
    """Explain the model's prediction for one instance with an anchor rule.

    Each positional argument is a ``"feature:value"`` string. The instance is
    reordered to ``self.featureNames``, encoded numerically, and explained by
    an ``AnchorTabular`` explainer fitted on ``self.X_train``
    (``threshold=0.90``, ``max_anchor_size=3``, ``batch_size=2000``).

    Side effects: prints debugging output and sets ``self.explanation`` (HTML
    text containing the rule with a per-condition importance share) and
    ``self.certainty`` (text with coverage and precision).

    Returns
    -------
    bool
        Always ``True``.

    Raises
    ------
    ValueError
        If an argument contains no ``':'``.
    KeyError
        If a feature from ``self.featureNames`` is missing from the arguments.
    """
    # Split on the first colon only, so values may themselves contain ':'.
    query_instance = dict(s.split(':', 1) for s in arg)

    categories = self.getCategoricalFeatures()

    # anchor instance to model instance. Input: Numpy. Output: Pandas df.
    # Turns numbers into categories.
    def adapter(n):
        d = pd.DataFrame(data=n, columns=self.featureNames)
        for c in categories:
            d[c] = d[c].map(self.dictionary[c]["values"])
        return d

    # model instance to anchor instance. Input: Pandas df. Output: Numpy.
    # Turns categories into numbers.
    def reverse_adapter(p):
        d = p.copy()
        for c in categories:
            inverse = {v: k for k, v in self.dictionary[c]["values"].items()}
            d[c] = d[c].map(inverse)
        # `np.float` was removed in NumPy 1.24; the builtin `float` is the
        # exact type it used to alias.
        return d.to_numpy().astype(float)

    def predict_fn(x):
        return self.model.predict(adapter(x))

    # create the category map: column position -> list of category labels
    category_map = {
        i: [str(k) for k in self.dictionary[name]["values"].values()]
        for i, name in enumerate(self.featureNames)
        if name in categories
    }

    print("-------")
    print(query_instance)
    print(reverse_adapter(pd.DataFrame([query_instance])))

    # sort query_instance into the model's feature order
    sorted_query_instance = {f: query_instance[f] for f in self.featureNames}
    print(sorted_query_instance)
    encoded_query = reverse_adapter(pd.DataFrame([sorted_query_instance]))
    print(encoded_query)

    explainer = AnchorTabular(predict_fn,
                              feature_names=self.featureNames,
                              categorical_names=category_map)
    anchor_training = reverse_adapter(self.X_train)
    explainer.fit(anchor_training, disc_perc=[25, 50, 75])
    explanation = explainer.explain(encoded_query,
                                    threshold=0.90,
                                    max_anchor_size=3,
                                    batch_size=2000)
    print('Anchor: %s' % (' AND '.join(explanation['data']['anchor'])))
    print('Precision: %.2f' % explanation['precision'])
    print('Coverage: %.2f' % explanation['coverage'])

    # build rule
    names = explanation['data']['anchor']
    # Successive differences of the raw precision list give each condition's
    # individual contribution.
    precision = np.asarray(explanation['raw']['precision'])
    precision[1:] -= precision[:-1].copy()
    precision = [round(elem, 2) for elem in precision.tolist()]
    total = sum(precision)

    def share(part):
        # Guard against a zero total so rule building cannot crash on a
        # degenerate explanation.
        return round(part / total * 100, 2) if total else 0.0

    rule = " AND ".join(
        name + " (" + str(share(part)) + "%)"
        for name, part in zip(names, precision)
    )

    self.explanation = 'I generated the following rule for you. It describes the boundaries under which the current prediction remains stable: <br> <br> <big>' + rule + '</big>. <br> <br> Each rule condition has an importance score which shows how critical the condition is for the prediction outcome to stay stable.'
    self.certainty = (
        'I tested the rule on many sample data instances. The condition applies on %.2f' % explanation['coverage']
        + ' of the instances. In these cases, the rule was accurate in %.2f' % explanation['precision']
        + ' of the cases.'
    )
    return True