def run(args):
    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()

    # Arguments
    dataset = args[0]
    trial = args[1]

    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load data
    X_train, y_train, X_val, y_val, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")

    # Linear model
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    out["lm_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    # RF
    maple_rf = MAPLE(X_train, y_train, X_val, y_val)

    predictions = maple_rf.predict_fe(X_test)
    out["rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_rf.predict_silo(X_test)
    out["silo_rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_rf.predict(X_test)
    out["maple_rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    out["nf_rf"] = maple_rf.retain

    # GBRT
    maple_gbrt = MAPLE(X_train, y_train, X_val, y_val, fe_type="gbrt")

    predictions = maple_gbrt.predict_fe(X_test)
    out["gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_gbrt.predict_silo(X_test)
    out["silo_gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_gbrt.predict(X_test)
    out["maple_gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    out["nf_gbrt"] = maple_gbrt.retain

    # Save results
    json.dump(out, file)
    file.close()
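
# Hedged driver sketch (an assumption, not part of the original script): the rng
# re-seeding at the top of run() suggests it is meant to be dispatched to worker
# processes, e.g. with multiprocessing.Pool. Dataset names and the trial count
# below are placeholders, not the configuration used in the experiments.
from multiprocessing import Pool

if __name__ == "__main__":
    datasets = ["autompgs", "housing"]                   # hypothetical dataset names
    args = [(d, t) for d in datasets for t in range(5)]  # 5 trials per dataset (placeholder)
    with Pool() as pool:
        pool.map(run, args)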
def main():
    n_features = 100
    random_state = 0

    stc = generate_synthetic_text_classifier(n_features=n_features, use_textual_words=False,
                                              random_state=random_state)
    predict = stc['predict']
    predict_proba = stc['predict_proba']
    words = stc['words']
    vectorizer = stc['vectorizer']
    nbr_terms = stc['nbr_terms']
    # print(words)

    X_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=None).data
    X_test = preprocess_data(X_test)
    X_test = vectorizer.transform(X_test).toarray()
    Y_test = predict(X_test)

    print('building')
    explainer = MAPLE(X_test[:10], Y_test[:10], X_test[:10], Y_test[:10])
    print('built')

    for x in X_test[:10]:
        # print(x)
        exp = explainer.explain(x)
        expl_val = exp['coefs'][:-1]
        gt_val = get_word_importance_explanation(x, stc)
        # gt_val = get_word_importance_explanation_text(x, stc)
        # print(gt_val)
        # print(expl_val)
        # print(np.where(gt_val != 0))
        # print(np.where(expl_val != 0))
        wbs = word_based_similarity(expl_val, gt_val, use_values=False)
        print(wbs, word_based_similarity(expl_val, gt_val, use_values=True))
        print('')
def main():
    m = 5
    n = 10
    n_features = 3
    random_state = 2

    p_binary = 0.7
    p_parenthesis = 0.3

    slc = generate_synthetic_linear_classifier(expr=None, n_features=n_features, n_all_features=m,
                                                random_state=random_state, p_binary=p_binary,
                                                p_parenthesis=p_parenthesis)
    expr = slc['expr']
    X = slc['X']
    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    predict = slc['predict']

    print(expr)

    X_test = np.random.uniform(np.min(X), np.max(X), size=(n, m))
    Y_test = predict(X_test)

    explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    for x in X_test:
        print(x)
        exp = explainer.explain(x)
        expl_val = exp['coefs'][:-1]
        gt_val = get_feature_importance_explanation(x, slc, n_features, get_values=True)
        fis = feature_importance_similarity(expl_val, gt_val)
        print(expl_val)
        print(gt_val)
        print(fis)
        break
def run(args):
    # Hyperparameters
    num_perturbations = 5

    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()

    # Arguments
    dataset = args[0]
    trial = args[1]

    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load data
    X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")
    n = X_test.shape[0]
    d = X_test.shape[1]

    # Load the noise scale parameters
    # with open("Sigmas/" + dataset + ".json", "r") as tmp:
    #     scales = json.load(tmp)
    scales = [0.1, 0.25]
    scales_len = len(scales)

    # Fit MAPLE model
    exp_maple = MAPLE(X_train, y_train, X_valid, y_valid)

    # Fit LIME to explain MAPLE
    exp_lime = lime_tabular.LimeTabularExplainer(X_train, discretize_continuous=False, mode="regression")

    # Evaluate model faithfulness on the test set
    rmse = 0.0  # MAPLE accuracy on the dataset
    lime_rmse = np.zeros(scales_len)
    maple_rmse = np.zeros(scales_len)
    for i in range(n):
        x = X_test[i, :]

        # LIME's default parameter for num_samples is 500
        # 1) This is larger than any of the datasets we tested on
        # 2) It makes explaining MAPLE impractically slow since the complexity of
        #    MAPLE's predict() depends on the dataset size
        coefs_lime = unpack_coefs(exp_lime, x, exp_maple.predict, d, X_train, num_samples=100)

        e_maple = exp_maple.explain(x)
        coefs_maple = e_maple["coefs"]

        rmse += (e_maple["pred"] - y_test[i])**2

        for j in range(num_perturbations):
            noise = np.random.normal(loc=0.0, scale=1.0, size=d)
            for k in range(scales_len):
                scale = scales[k]
                x_pert = x + scale * noise

                e_maple_pert = exp_maple.explain(x_pert)
                model_pred = e_maple_pert["pred"]
                lime_pred = np.dot(np.insert(x_pert, 0, 1), coefs_lime)
                maple_pred = np.dot(np.insert(x_pert, 0, 1), coefs_maple)

                lime_rmse[k] += (lime_pred - model_pred)**2
                maple_rmse[k] += (maple_pred - model_pred)**2

    rmse /= n
    lime_rmse /= n * num_perturbations
    maple_rmse /= n * num_perturbations

    rmse = np.sqrt(rmse)
    lime_rmse = np.sqrt(lime_rmse)
    maple_rmse = np.sqrt(maple_rmse)

    out["model_rmse"] = rmse[0]
    out["lime_rmse_0.1"] = lime_rmse[0]
    out["maple_rmse_0.1"] = maple_rmse[0]
    out["lime_rmse_0.25"] = lime_rmse[1]
    out["maple_rmse_0.25"] = maple_rmse[1]

    json.dump(out, file)
    file.close()
def main():
    n_features = (8, 8)
    img_size = (32, 32, 3)
    cell_size = (4, 4)
    colors_p = np.array([0.15, 0.7, 0.15])
    p_border = 0.0

    sic = generate_synthetic_image_classifier(img_size=img_size, cell_size=cell_size,
                                               n_features=n_features, p_border=p_border)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    plt.imshow(pattern)
    plt.show()

    X_test = generate_random_img_dataset(pattern, nbr_images=1000, pattern_ratio=0.4, img_size=img_size,
                                         cell_size=cell_size, min_nbr_cells=0.1, max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test = predict(X_test)

    nbr_records = 10
    Xm_test = np.array([x.ravel() for x in X_test[:nbr_records]])
    explainer = MAPLE(Xm_test, Y_test[:nbr_records], Xm_test, Y_test[:nbr_records],
                      n_estimators=5, max_features=0.5, min_samples_leaf=5)

    x = X_test[-1]
    plt.imshow(x)
    plt.show()

    exp = explainer.explain(x)
    expl_val = exp['coefs'][:-1]
    print(expl_val)
    expl_val = np.array([1.0 if v > 0.0 else 0.0 for v in expl_val])
    # expl_val = (expl_val - np.min(expl_val)) / (np.max(expl_val) - np.min(expl_val))
    print(expl_val)
    print(np.unique(expl_val, return_counts=True))
    print(expl_val.shape)

    sv = np.sum(np.reshape(expl_val, img_size), axis=2)
    sv01 = np.zeros(sv.shape)
    sv01[np.where(sv > 0.0)] = 1.0
    # np.array([1.0 if v > 0.0 else 0.0 for v in expl_val])
    sv = sv01
    print(sv)
    print(sv.shape)
    max_val = np.nanpercentile(np.abs(sv), 99.9)
    # plt.imshow(x)
    plt.imshow(sv, cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.show()
    # shap.image_plot(expl_val, x)

    gt_val = get_pixel_importance_explanation(x, sic)
    print(gt_val.shape)
    max_val = np.nanpercentile(np.abs(gt_val), 99.9)
    # plt.imshow(x)
    plt.imshow(np.reshape(gt_val, img_size[:2]), cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.show()

    print(np.unique(gt_val, return_counts=True))
    print(np.unique(sv.ravel(), return_counts=True))

    print(pixel_based_similarity(sv.ravel(), gt_val))
def main():
    n_features = (20, 20)
    img_size = (32, 32, 3)
    cell_size = (4, 4)
    colors_p = np.array([0.15, 0.7, 0.15])
    p_border = 1.0

    # img_draft = np.array([
    #     ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'k', 'g', 'r', 'k'],
    #     ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'g'],
    #     ['k', 'g', 'k', 'k', 'k', 'b', 'k', 'k'],
    #     ['k', 'g', 'k', 'k', 'g', 'g', 'k', 'b'],
    #     ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'g'],
    #     ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'k'],
    #
    # ])
    # img = generate_img_defined(img_draft, img_size=img_size, cell_size=cell_size)
    # plt.imshow(img)
    # plt.xticks(())
    # plt.yticks(())
    # # plt.savefig('../fig/pattern.png', format='png', bbox_inches='tight')
    # plt.show()

    pattern_draft = np.array([
        # ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'b', 'k'],
        ['k', 'k', 'g', 'g', 'k'],
        ['k', 'k', 'g', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k'],
    ])
    pattern = generate_img_defined(pattern_draft, img_size=(20, 20, 3), cell_size=cell_size)

    sic = generate_synthetic_image_classifier(img_size=img_size, cell_size=cell_size,
                                               n_features=n_features, p_border=p_border,
                                               pattern=pattern)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    plt.imshow(pattern)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/pattern.png', format='png', bbox_inches='tight')
    plt.show()

    X_test = generate_random_img_dataset(pattern, nbr_images=1000, pattern_ratio=0.4, img_size=img_size,
                                         cell_size=cell_size, min_nbr_cells=0.1, max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test = predict(X_test)

    idx = np.where(Y_test == 1)[0][0]
    # x = X_test[idx]

    img_draft = np.array([
        # ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k', 'g', 'r', 'k'],
        ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'g'],
        ['k', 'g', 'k', 'k', 'k', 'b', 'k', 'k'],
        ['k', 'g', 'k', 'k', 'g', 'g', 'k', 'b'],
        ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'g'],
        ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'k'],
    ])
    x = generate_img_defined(img_draft, img_size=img_size, cell_size=cell_size)

    plt.imshow(x)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/image.png', format='png', bbox_inches='tight')
    plt.show()

    gt_val = get_pixel_importance_explanation(x, sic)
    max_val = np.nanpercentile(np.abs(gt_val), 99.9)
    plt.imshow(np.reshape(gt_val, img_size[:2]), cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/saliencymap.png', format='png', bbox_inches='tight')
    plt.show()

    # plt.imshow(x)
    # plt.imshow(np.reshape(gt_val, img_size[:2]), cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    # plt.xticks(())
    # plt.yticks(())
    # plt.savefig('../fig/saliencymap2.png', format='png', bbox_inches='tight')
    # plt.show()

    lime_explainer = LimeImageExplainer()
    segmenter = SegmentationAlgorithm('quickshift', kernel_size=1, max_dist=10, ratio=0.5)
    tot_num_features = img_size[0] * img_size[1]

    lime_exp = lime_explainer.explain_instance(x, predict_proba, top_labels=2, hide_color=0,
                                               num_samples=10000, segmentation_fn=segmenter)
    _, lime_expl_val = lime_exp.get_image_and_mask(1, positive_only=True, num_features=tot_num_features,
                                                   hide_rest=False, min_weight=0.0)

    max_val = np.nanpercentile(np.abs(lime_expl_val), 99.9)
    plt.imshow(lime_expl_val, cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('lime', fontsize=20)
    plt.savefig('../fig/saliencymap_lime.png', format='png', bbox_inches='tight')
    plt.show()

    background = np.array([np.zeros(img_size).ravel()] * 10)
    shap_explainer = KernelExplainer(predict_proba, background)
    shap_expl_val = shap_explainer.shap_values(x.ravel(), l1_reg='bic')[1]
    shap_expl_val = np.sum(np.reshape(shap_expl_val, img_size), axis=2)
    tmp = np.zeros(shap_expl_val.shape)
    tmp[np.where(shap_expl_val > 0.0)] = 1.0
    shap_expl_val = tmp

    max_val = np.nanpercentile(np.abs(shap_expl_val), 99.9)
    plt.imshow(shap_expl_val, cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('shap', fontsize=20)
    plt.savefig('../fig/saliencymap_shap.png', format='png', bbox_inches='tight')
    plt.show()

    nbr_records = 10
    Xm_test = np.array([x.ravel() for x in X_test[:nbr_records]])
    maple_explainer = MAPLE(Xm_test, Y_test[:nbr_records], Xm_test, Y_test[:nbr_records],
                            n_estimators=5, max_features=0.5, min_samples_leaf=5)
    maple_exp = maple_explainer.explain(x)
    maple_expl_val = maple_exp['coefs'][:-1]
    maple_expl_val = np.sum(np.reshape(maple_expl_val, img_size), axis=2)
    tmp = np.zeros(maple_expl_val.shape)
    tmp[np.where(maple_expl_val > 0.0)] = 1.0
    maple_expl_val = tmp

    max_val = np.nanpercentile(np.abs(maple_expl_val), 99.9)
    plt.imshow(maple_expl_val, cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('maple', fontsize=20)
    plt.savefig('../fig/saliencymap_maple.png', format='png', bbox_inches='tight')
    plt.show()

    lime_f1, lime_pre, lime_rec = pixel_based_similarity(lime_expl_val.ravel(), gt_val, ret_pre_rec=True)
    shap_f1, shap_pre, shap_rec = pixel_based_similarity(shap_expl_val.ravel(), gt_val, ret_pre_rec=True)
    maple_f1, maple_pre, maple_rec = pixel_based_similarity(maple_expl_val.ravel(), gt_val, ret_pre_rec=True)

    print(lime_f1, lime_pre, lime_rec)
    print(shap_f1, shap_pre, shap_rec)
    print(maple_f1, maple_pre, maple_rec)
def run(black_box, n_records, img_size, cell_size, n_features, p_border, colors_p, random_state, filename):
    sic = generate_synthetic_image_classifier(img_size=img_size, cell_size=cell_size,
                                               n_features=n_features, p_border=p_border,
                                               random_state=random_state)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    X_test = generate_random_img_dataset(pattern, nbr_images=n_records, pattern_ratio=0.5, img_size=img_size,
                                         cell_size=cell_size, min_nbr_cells=0.1, max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test_proba = predict_proba(X_test)
    Y_test = predict(X_test)

    lime_explainer = LimeImageExplainer()
    segmenter = SegmentationAlgorithm('quickshift', kernel_size=1, max_dist=10, ratio=0.5)
    tot_num_features = img_size[0] * img_size[1]

    background = np.array([np.zeros(img_size).ravel()] * 10)
    shap_explainer = KernelExplainer(predict_proba, background)

    nbr_records_explainer = 10
    idx_records_train_expl = np.random.choice(range(len(X_test)), size=nbr_records_explainer, replace=False)
    idx_records_test_expl = np.random.choice(range(len(X_test)), size=nbr_records_explainer, replace=False)
    Xm_train = np.array([x.ravel() for x in X_test[idx_records_train_expl]])
    Xm_test = np.array([x.ravel() for x in X_test[idx_records_test_expl]])

    print(datetime.datetime.now(), 'build maple')
    maple_explainer = MAPLE(Xm_train, Y_test_proba[idx_records_train_expl][:, 1],
                            Xm_test, Y_test_proba[idx_records_test_expl][:, 1],
                            n_estimators=100, max_features=0.5, min_samples_leaf=2)
    print(datetime.datetime.now(), 'build maple done')

    idx = 0
    results = list()
    for x, y in zip(X_test, Y_test):
        print(datetime.datetime.now(), 'seneca - image', 'black_box %s' % black_box,
              'n_features %s' % str(n_features), 'rs %s' % random_state,
              '%s/%s' % (idx, n_records), end=' ')

        gt_val = get_pixel_importance_explanation(x, sic)

        lime_exp = lime_explainer.explain_instance(x, predict_proba, top_labels=2, hide_color=0,
                                                   num_samples=10000, segmentation_fn=segmenter)
        _, lime_expl_val = lime_exp.get_image_and_mask(y, positive_only=True, num_features=tot_num_features,
                                                       hide_rest=False, min_weight=0.0)

        shap_expl_val = shap_explainer.shap_values(x.ravel(), l1_reg='bic')[1]
        shap_expl_val = np.sum(np.reshape(shap_expl_val, img_size), axis=2)
        tmp = np.zeros(shap_expl_val.shape)
        tmp[np.where(shap_expl_val > 0.0)] = 1.0
        shap_expl_val = tmp

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]
        maple_expl_val = np.sum(np.reshape(maple_expl_val, img_size), axis=2)
        tmp = np.zeros(maple_expl_val.shape)
        tmp[np.where(maple_expl_val > 0.0)] = 1.0
        maple_expl_val = tmp

        lime_f1, lime_pre, lime_rec = pixel_based_similarity(lime_expl_val.ravel(), gt_val, ret_pre_rec=True)
        shap_f1, shap_pre, shap_rec = pixel_based_similarity(shap_expl_val.ravel(), gt_val, ret_pre_rec=True)
        maple_f1, maple_pre, maple_rec = pixel_based_similarity(maple_expl_val.ravel(), gt_val, ret_pre_rec=True)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'img_size': '"%s"' % str(img_size),
            'cell_size': '"%s"' % str(cell_size),
            'n_features': '"%s"' % str(n_features),
            'random_state': random_state,
            'idx': idx,
            'lime_f1': lime_f1,
            'lime_pre': lime_pre,
            'lime_rec': lime_rec,
            'shap_f1': shap_f1,
            'shap_pre': shap_pre,
            'shap_rec': shap_rec,
            'maple_f1': maple_f1,
            'maple_pre': maple_pre,
            'maple_rec': maple_rec,
            'p_border': p_border
        }
        results.append(res)

        print('lime %.2f' % lime_f1, 'shap %.2f' % shap_f1, 'maple %.2f' % maple_f1)
        idx += 1

    df = pd.DataFrame(data=results)
    df = df[['black_box', 'n_records', 'img_size', 'cell_size', 'n_features', 'random_state',
             'idx', 'lime_f1', 'lime_pre', 'lime_rec', 'shap_f1', 'shap_pre', 'shap_rec',
             'maple_f1', 'maple_pre', 'maple_rec', 'p_border']]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
def run(black_box, n_records, n_features, random_state, filename):
    X_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=None).data
    X_train = preprocess_data(X_train)

    stc = generate_synthetic_text_classifier(X_train, n_features=n_features, random_state=random_state)

    predict_proba = stc['predict_proba']
    vectorizer = stc['vectorizer']
    nbr_terms = stc['nbr_terms']

    X_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=None).data
    X_test = preprocess_data(X_test)
    X_test_nbrs = vectorizer.transform(X_test).toarray()
    Y_test = predict_proba(X_test_nbrs)

    lime_explainer = LimeTextExplainer(class_names=[0, 1])

    print(datetime.datetime.now(), 'build shap')
    reference = get_reference4shap(X_test_nbrs, stc['words_vec'], nbr_terms, nbr_references=10)
    shap_explainer = KernelExplainer(predict_proba, reference)
    print(datetime.datetime.now(), 'build shap done')

    # print(idx_records_train_expl)
    # print(X_test_nbrs[idx_records_train_expl])
    # print(Y_test[idx_records_train_expl][:, 1])
    # print(np.any(np.isnan(X_test_nbrs[idx_records_train_expl])))
    # print(np.any(np.isnan(X_test_nbrs[idx_records_test_expl])))
    # print(np.any(np.isnan(Y_test[idx_records_train_expl][:, 1])))
    # print(np.any(np.isnan(Y_test[idx_records_test_expl][:, 1])))

    nbr_records_explainer = 100
    idx_records_train_expl = np.random.choice(range(len(X_test)), size=nbr_records_explainer, replace=False)
    idx_records_test_expl = np.random.choice(range(len(X_test)), size=nbr_records_explainer, replace=False)

    print(datetime.datetime.now(), 'build maple')
    maple_explainer = MAPLE(X_test_nbrs[idx_records_train_expl], Y_test[idx_records_train_expl][:, 1],
                            X_test_nbrs[idx_records_test_expl], Y_test[idx_records_test_expl][:, 1],
                            n_estimators=100, max_features=0.5, min_samples_leaf=2)
    print(datetime.datetime.now(), 'build maple done')

    results = list()
    explained = 0
    for idx, x in enumerate(X_test):
        x_nbrs = X_test_nbrs[idx]

        print(datetime.datetime.now(), 'seneca - text', 'black_box %s' % black_box,
              'n_features %s' % n_features, 'rs %s' % random_state,
              '%s/%s' % (idx, n_records), end=' ')

        gt_val_text = get_word_importance_explanation_text(x, stc)
        gt_val = get_word_importance_explanation(x_nbrs, stc)

        try:
            lime_exp = lime_explainer.explain_instance(x, predict_proba, num_features=n_features)
            lime_expl_val = {e[0]: e[1] for e in lime_exp.as_list()}

            shap_expl_val = shap_explainer.shap_values(x_nbrs, l1_reg='bic')[1]

            maple_exp = maple_explainer.explain(x_nbrs)
            maple_expl_val = maple_exp['coefs'][:-1]
        except ValueError:
            print(datetime.datetime.now(), 'Error in explanation')
            continue

        lime_cs = word_based_similarity_text(lime_expl_val, gt_val_text, use_values=True)
        lime_f1, lime_pre, lime_rec = word_based_similarity_text(lime_expl_val, gt_val_text,
                                                                 use_values=False, ret_pre_rec=True)

        shap_cs = word_based_similarity(shap_expl_val, gt_val, use_values=True)
        shap_f1, shap_pre, shap_rec = word_based_similarity(shap_expl_val, gt_val,
                                                            use_values=False, ret_pre_rec=True)

        maple_cs = word_based_similarity(maple_expl_val, gt_val, use_values=True)
        maple_f1, maple_pre, maple_rec = word_based_similarity(maple_expl_val, gt_val,
                                                               use_values=False, ret_pre_rec=True)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'nbr_terms': nbr_terms,
            'n_features': n_features,
            'random_state': random_state,
            'idx': idx,
            'lime_cs': lime_cs,
            'lime_f1': lime_f1,
            'lime_pre': lime_pre,
            'lime_rec': lime_rec,
            'shap_cs': shap_cs,
            'shap_f1': shap_f1,
            'shap_pre': shap_pre,
            'shap_rec': shap_rec,
            'maple_cs': maple_cs,
            'maple_f1': maple_f1,
            'maple_pre': maple_pre,
            'maple_rec': maple_rec,
        }
        results.append(res)

        print('lime %.2f %.2f' % (lime_cs, lime_f1), 'shap %.2f %.2f' % (shap_cs, shap_f1),
              'maple %.2f %.2f' % (maple_cs, maple_f1))

        explained += 1
        if explained >= n_records:
            break

    df = pd.DataFrame(data=results)
    df = df[['black_box', 'n_records', 'nbr_terms', 'n_features', 'random_state', 'idx',
             'lime_cs', 'lime_f1', 'lime_pre', 'lime_rec',
             'shap_cs', 'shap_f1', 'shap_pre', 'shap_rec',
             'maple_cs', 'maple_f1', 'maple_pre', 'maple_rec']]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
def run(black_box, n_records, n_all_features, n_features, random_state, filename):
    n = n_records
    m = n_all_features

    p_binary = 0.7
    p_parenthesis = 0.3

    slc = generate_synthetic_linear_classifier(expr=None, n_features=n_features, n_all_features=m,
                                                random_state=random_state, p_binary=p_binary,
                                                p_parenthesis=p_parenthesis)
    expr = slc['expr']
    X = slc['X']
    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    # predict = slc['predict']

    X_test = np.random.uniform(np.min(X), np.max(X), size=(n, m))
    Y_test = predict_proba(X_test)[:, 1]

    lime_explainer = LimeTabularExplainer(X_test, feature_names=feature_names, class_names=class_values,
                                          discretize_continuous=False, discretizer='entropy')

    reference = np.zeros(m)
    shap_explainer = KernelExplainer(predict_proba, np.reshape(reference, (1, len(reference))))

    maple_explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    results = list()
    for idx, x in enumerate(X_test):

        gt_val = get_feature_importance_explanation(x, slc, n_features, get_values=True)

        lime_exp = lime_explainer.explain_instance(x, predict_proba, num_features=m)
        # lime_exp_as_dict = {e[0]: e[1] for e in lime_exp.as_list()}
        # lime_expl_val = np.asarray([lime_exp_as_dict.get('x%s' % i, .0) for i in range(m)])
        lime_expl_val = np.array([e[1] for e in lime_exp.as_list()])

        shap_expl_val = shap_explainer.shap_values(x, l1_reg='bic')[1]

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]

        lime_fis = feature_importance_similarity(lime_expl_val, gt_val)
        shap_fis = feature_importance_similarity(shap_expl_val, gt_val)
        maple_fis = feature_importance_similarity(maple_expl_val, gt_val)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'n_all_features': n_all_features,
            'n_features': n_features,
            'random_state': random_state,
            'idx': idx,
            'lime': lime_fis,
            'shap': shap_fis,
            'maple': maple_fis,
            'expr': expr,
        }
        results.append(res)

        print(datetime.datetime.now(), 'syege - tlsb', 'black_box %s' % black_box,
              'n_all_features %s' % n_all_features, 'n_features %s' % n_features,
              'rs %s' % random_state, '%s %s' % (idx, n_records), expr,
              'lime %.2f' % lime_fis, 'shap %.2f' % shap_fis, 'maple %.2f' % maple_fis)

        if idx > 0 and idx % 10 == 0:
            df = pd.DataFrame(data=results)
            df = df[['black_box', 'n_records', 'n_all_features', 'n_features', 'random_state',
                     'expr', 'idx', 'lime', 'shap', 'maple']]
            # print(df.head())

            if not os.path.isfile(filename):
                df.to_csv(filename, index=False)
            else:
                df.to_csv(filename, mode='a', index=False, header=False)

            results = list()

    df = pd.DataFrame(data=results)
    df = df[['black_box', 'n_records', 'n_all_features', 'n_features', 'random_state',
             'expr', 'idx', 'lime', 'shap', 'maple']]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
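
# Hypothetical usage sketch (not part of the original script): one way to launch a
# single configuration of the tabular experiment above. Parameter values and the
# output path are placeholders, not the settings used in the original experiments.
if __name__ == "__main__":
    run(black_box="syn_linear",     # label recorded in the results CSV (placeholder)
        n_records=100,              # number of synthetic test records to explain
        n_all_features=10,          # total number of features in the synthetic data
        n_features=4,               # features actually used by the generated expression
        random_state=0,
        filename="tlsb_maple.csv")  # results are appended if the file already exists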
def run(args):
    # Hyperparameters
    num_perturbations = 5

    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()

    # Arguments
    dataset = args[1]
    trial = args[0]

    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load data
    X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")
    n = X_test.shape[0]
    d = X_test.shape[1]

    scales = [0.1, 0.25]
    scales_len = len(scales)

    # Fit model
    model = fit_svr(X_train, y_train, X_test, y_test)
    out["model_rmse"] = np.sqrt(np.mean((y_test - model.predict(X_test))**2))

    # Fit LIME and MAPLE explainers to the model
    exp_lime = lime_tabular.LimeTabularExplainer(X_train, discretize_continuous=False, mode="regression")
    exp_maple = MAPLE(X_train, model.predict(X_train), X_valid, model.predict(X_valid))

    # Evaluate model faithfulness on the test set
    lime_rmse = np.zeros(scales_len)
    maple_rmse = np.zeros(scales_len)
    for i in range(n):
        x = X_test[i, :]

        coefs_lime = unpack_coefs(exp_lime, x, model.predict, d, X_train)  # Allow full number of features

        e_maple = exp_maple.explain(x)
        coefs_maple = e_maple["coefs"]

        for j in range(num_perturbations):
            noise = np.random.normal(loc=0.0, scale=1.0, size=d)
            for k in range(scales_len):
                scale = scales[k]
                x_pert = x + scale * noise

                model_pred = model.predict(x_pert.reshape(1, -1))
                lime_pred = np.dot(np.insert(x_pert, 0, 1), coefs_lime)
                maple_pred = np.dot(np.insert(x_pert, 0, 1), coefs_maple)

                lime_rmse[k] += (lime_pred - model_pred)**2
                maple_rmse[k] += (maple_pred - model_pred)**2

    lime_rmse /= n * num_perturbations
    maple_rmse /= n * num_perturbations

    lime_rmse = np.sqrt(lime_rmse)
    maple_rmse = np.sqrt(maple_rmse)

    out["lime_rmse_0.1"] = lime_rmse[0]
    out["maple_rmse_0.1"] = maple_rmse[0]
    out["lime_rmse_0.25"] = lime_rmse[1]
    out["maple_rmse_0.25"] = maple_rmse[1]

    json.dump(out, file)
    file.close()
def run(black_box, n_records, n_all_features, n_features, n_coefficients, random_state, filename):
    n = n_records
    m = n_all_features

    slc = generate_synthetic_linear_classifier2(n_features=n_features, n_all_features=n_all_features,
                                                 n_coefficients=n_coefficients, random_state=random_state)
    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    # predict = slc['predict']

    X_test = np.random.uniform(size=(n, n_all_features))
    Xz = list()
    for x in X_test:
        nz = np.random.randint(0, n_features)
        zeros_idx = np.random.choice(np.arange(n_features), size=nz, replace=False)
        x[zeros_idx] = 0.0
        Xz.append(x)
    X_test = np.array(Xz)
    Y_test = predict_proba(X_test)[:, 1]

    lime_explainer = LimeTabularExplainer(X_test, feature_names=feature_names, class_names=class_values,
                                          discretize_continuous=False, discretizer='entropy')

    reference = np.zeros(m)
    shap_explainer = KernelExplainer(predict_proba, np.reshape(reference, (1, len(reference))))

    maple_explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    results = list()
    for idx, x in enumerate(X_test):

        gt_val = get_feature_importance_explanation2(x, slc, n_features, n_all_features, get_values=True)
        gt_val_bin = get_feature_importance_explanation2(x, slc, n_features, n_all_features, get_values=False)

        lime_exp = lime_explainer.explain_instance(x, predict_proba, num_features=m)
        lime_expl_val = np.array([e[1] for e in lime_exp.as_list()])
        tmp = np.zeros(lime_expl_val.shape)
        tmp[np.where(lime_expl_val != 0.0)] = 1.0
        lime_expl_val_bin = tmp

        shap_expl_val = shap_explainer.shap_values(x, l1_reg='bic')[1]
        tmp = np.zeros(shap_expl_val.shape)
        tmp[np.where(shap_expl_val != 0.0)] = 1.0
        shap_expl_val_bin = tmp

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]
        tmp = np.zeros(maple_expl_val.shape)
        tmp[np.where(maple_expl_val != 0.0)] = 1.0
        maple_expl_val_bin = tmp

        lime_fis = feature_importance_similarity(lime_expl_val, gt_val)
        shap_fis = feature_importance_similarity(shap_expl_val, gt_val)
        maple_fis = feature_importance_similarity(maple_expl_val, gt_val)

        lime_rbs = rule_based_similarity(lime_expl_val_bin, gt_val_bin)
        shap_rbs = rule_based_similarity(shap_expl_val_bin, gt_val_bin)
        maple_rbs = rule_based_similarity(maple_expl_val_bin, gt_val_bin)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'n_all_features': n_all_features,
            'n_features': n_features,
            'n_coefficients': n_coefficients,
            'random_state': random_state,
            'idx': idx,
            'lime_cs': lime_fis,
            'shap_cs': shap_fis,
            'maple_cs': maple_fis,
            'lime_f1': lime_rbs,
            'shap_f1': shap_rbs,
            'maple_f1': maple_rbs,
        }
        results.append(res)

        print(datetime.datetime.now(), 'syege - tlsb2', 'black_box %s' % black_box,
              'n_all_features %s' % n_all_features, 'n_features %s' % n_features,
              'n_coefficients %s' % n_coefficients, 'rs %s' % random_state,
              '%s %s' % (idx, n_records),
              'lime %.2f %.2f' % (lime_fis, lime_rbs),
              'shap %.2f %.2f' % (shap_fis, shap_rbs),
              'maple %.2f %.2f' % (maple_fis, maple_rbs))

    df = pd.DataFrame(data=results)
    df = df[['black_box', 'n_records', 'n_all_features', 'n_features', 'n_coefficients', 'random_state',
             'idx', 'lime_cs', 'shap_cs', 'maple_cs', 'lime_f1', 'shap_f1', 'maple_f1']]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)