Code Example #1
File: run.py Project: mtaufeeq/MAPLE
def run(args):

    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()

    # Arguments
    dataset = args[0]
    trial = args[1]

    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load Data
    X_train, y_train, X_val, y_val, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")

    # Linear model
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    out["lm_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    # RF
    maple_rf = MAPLE(X_train, y_train, X_val, y_val)

    predictions = maple_rf.predict_fe(X_test)
    out["rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_rf.predict_silo(X_test)
    out["silo_rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_rf.predict(X_test)
    out["maple_rf_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))
    out["nf_rf"] = maple_rf.retain

    # GBRT
    maple_gbrt = MAPLE(X_train, y_train, X_val, y_val, fe_type="gbrt")

    predictions = maple_gbrt.predict_fe(X_test)
    out["gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_gbrt.predict_silo(X_test)
    out["silo_gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))

    predictions = maple_gbrt.predict(X_test)
    out["maple_gbrt_rmse"] = np.sqrt(mean_squared_error(y_test, predictions))
    out["nf_gbrt"] = maple_gbrt.retain

    # Save results
    json.dump(out, file)
    file.close()
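Note: run() is designed to be mapped over worker processes, which is why it reseeds scipy's RNG on entry (forked workers otherwise inherit identical RNG state from the parent). A minimal driver sketch, with placeholder dataset names and assuming the Trials/ directory already exists:

import itertools
from multiprocessing import Pool

if __name__ == "__main__":
    datasets = ["autompgs", "housing"]  # placeholder dataset names
    args_list = list(itertools.product(datasets, range(10)))  # 10 trials per dataset
    with Pool(processes=4) as pool:
        pool.map(run, args_list)  # each worker receives one (dataset, trial) tuple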
Code Example #2
def main():
    n_features = 100
    random_state = 0

    stc = generate_synthetic_text_classifier(n_features=n_features, use_textual_words=False,
                                             random_state=random_state)

    predict = stc['predict']
    predict_proba = stc['predict_proba']
    words = stc['words']
    vectorizer = stc['vectorizer']
    nbr_terms = stc['nbr_terms']

    # print(words)

    X_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                categories=None).data
    X_test = preprocess_data(X_test)
    X_test = vectorizer.transform(X_test).toarray()
    Y_test = predict(X_test)

    print('building')
    explainer = MAPLE(X_test[:10], Y_test[:10], X_test[:10], Y_test[:10])
    print('built')

    for x in X_test[:10]:
        # print(x)
        exp = explainer.explain(x)
        expl_val = exp['coefs'][:-1]
        gt_val = get_word_importance_explanation(x, stc)
        # gt_val = get_word_importance_explanation_text(x, stc)
        # print(gt_val)
        # print(expl_val)
        # print(np.where(gt_val != 0))
        # print(np.where(expl_val != 0))

        wbs = word_based_similarity(expl_val, gt_val, use_values=False)
        print(wbs, word_based_similarity(expl_val, gt_val, use_values=True))
        print('')
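word_based_similarity is imported from the surrounding project and is not shown here. From its use above, use_values=True compares the importance values themselves while use_values=False compares only which words are flagged at all; a hedged stand-in under that reading (cosine similarity vs. F1 over nonzero entries):

import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity

def word_based_similarity(expl_val, gt_val, use_values=True):
    expl = np.asarray(expl_val, dtype=float)
    gt = np.asarray(gt_val, dtype=float)
    if use_values:
        # Compare the raw importance vectors.
        return cosine_similarity(expl.reshape(1, -1), gt.reshape(1, -1))[0][0]
    # Compare only which words receive nonzero importance.
    return f1_score((gt != 0).astype(int), (expl != 0).astype(int), zero_division=0)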
Code Example #3
def main():
    m = 5
    n = 10

    n_features = 3
    random_state = 2

    p_binary = 0.7
    p_parenthesis = 0.3

    slc = generate_synthetic_linear_classifier(expr=None, n_features=n_features, n_all_features=m,
                                               random_state=random_state, p_binary=p_binary,
                                               p_parenthesis=p_parenthesis)
    expr = slc['expr']
    X = slc['X']
    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    predict = slc['predict']

    print(expr)

    X_test = np.random.uniform(np.min(X), np.max(X), size=(n, m))
    Y_test = predict(X_test)

    explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    for x in X_test:
        print(x)
        exp = explainer.explain(x)
        expl_val = exp['coefs'][:-1]
        gt_val = get_feature_importance_explanation(x, slc, n_features, get_values=True)
        fis = feature_importance_similarity(expl_val, gt_val)
        print(expl_val)
        print(gt_val)
        print(fis)
        break
Code Example #4
File: run.py Project: mtaufeeq/MAPLE
def run(args):
    # Hyperparameters
    num_perturbations = 5

    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()

    # Arguments
    dataset = args[0]
    trial = args[1]

    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load data
    X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")
    n = X_test.shape[0]
    d = X_test.shape[1]

    # Load the noise scale parameters
    # with open("Sigmas/" + dataset + ".json", "r") as tmp:
    #     scales = json.load(tmp)
    scales = [0.1, 0.25]
    scales_len = len(scales)

    # Fit MAPLE model
    exp_maple = MAPLE(X_train, y_train, X_valid, y_valid)

    # Fit LIME to explain MAPLE
    exp_lime = lime_tabular.LimeTabularExplainer(X_train,
                                                 discretize_continuous=False,
                                                 mode="regression")

    # Evaluate model faithfulness on the test set
    rmse = 0.0  # MAPLE accuracy on the dataset
    lime_rmse = np.zeros((scales_len))
    maple_rmse = np.zeros((scales_len))
    for i in range(n):
        x = X_test[i, :]

        # LIME's default parameter for num_samples is 5000
        # 1) This is larger than any of the datasets we tested on
        # 2) It makes explaining MAPLE impractically slow, since the complexity
        #    of MAPLE's predict() depends on the dataset size
        coefs_lime = unpack_coefs(exp_lime,
                                  x,
                                  exp_maple.predict,
                                  d,
                                  X_train,
                                  num_samples=100)

        e_maple = exp_maple.explain(x)
        coefs_maple = e_maple["coefs"]

        rmse += (e_maple["pred"] - y_test[i])**2

        for j in range(num_perturbations):

            noise = np.random.normal(loc=0.0, scale=1.0, size=d)

            for k in range(scales_len):
                scale = scales[k]

                x_pert = x + scale * noise

                e_maple_pert = exp_maple.explain(x_pert)
                model_pred = e_maple_pert["pred"]
                lime_pred = np.dot(np.insert(x_pert, 0, 1), coefs_lime)
                maple_pred = np.dot(np.insert(x_pert, 0, 1), coefs_maple)

                lime_rmse[k] += (lime_pred - model_pred)**2
                maple_rmse[k] += (maple_pred - model_pred)**2

    rmse /= n
    lime_rmse /= n * num_perturbations
    maple_rmse /= n * num_perturbations

    rmse = np.sqrt(rmse)
    lime_rmse = np.sqrt(lime_rmse)
    maple_rmse = np.sqrt(maple_rmse)

    out["model_rmse"] = rmse[0]
    out["lime_rmse_0.1"] = lime_rmse[0]
    out["maple_rmse_0.1"] = maple_rmse[0]
    out["lime_rmse_0.25"] = lime_rmse[1]
    out["maple_rmse_0.25"] = maple_rmse[1]

    json.dump(out, file)
    file.close()
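unpack_coefs comes from the project's utility code and is not shown here; its contract, inferred from the dot products above, is to return a length-(d+1) vector [intercept, w_1, ..., w_d] describing LIME's local surrogate model. A plausible reconstruction, assuming regression mode with discretize_continuous=False (LIME then fits its surrogate on standardized features, so the weights must be mapped back to the raw feature scale):

import numpy as np

def unpack_coefs(explainer, x, predict_fn, d, X_train, num_samples=5000):
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    exp = explainer.explain_instance(x, predict_fn, num_features=d,
                                     num_samples=num_samples)
    coefs = np.zeros(d)
    for feature_idx, weight in exp.local_exp[1]:  # regression results are stored under label 1
        coefs[feature_idx] = weight
    coefs = coefs / std  # undo LIME's internal feature standardization
    intercept = exp.intercept[1] - np.sum(coefs * mean)
    return np.insert(coefs, 0, intercept)  # [intercept, w_1, ..., w_d]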
Code Example #5
def main():
    n_features = (8, 8)
    img_size = (32, 32, 3)
    cell_size = (4, 4)
    colors_p = np.array([0.15, 0.7, 0.15])
    p_border = 0.0

    sic = generate_synthetic_image_classifier(img_size=img_size,
                                              cell_size=cell_size,
                                              n_features=n_features,
                                              p_border=p_border)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    plt.imshow(pattern)
    plt.show()

    X_test = generate_random_img_dataset(pattern,
                                         nbr_images=1000,
                                         pattern_ratio=0.4,
                                         img_size=img_size,
                                         cell_size=cell_size,
                                         min_nbr_cells=0.1,
                                         max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test = predict(X_test)

    nbr_records = 10
    Xm_test = np.array([x.ravel() for x in X_test[:nbr_records]])

    explainer = MAPLE(Xm_test,
                      Y_test[:nbr_records],
                      Xm_test,
                      Y_test[:nbr_records],
                      n_estimators=5,
                      max_features=0.5,
                      min_samples_leaf=5)

    x = X_test[-1]
    plt.imshow(x)
    plt.show()

    exp = explainer.explain(x)
    expl_val = exp['coefs'][:-1]
    print(expl_val)
    expl_val = np.array([1.0 if v > 0.0 else 0.0 for v in expl_val])
    # expl_val = (expl_val - np.min(expl_val)) / (np.max(expl_val) - np.min(expl_val))
    print(expl_val)
    print(np.unique(expl_val, return_counts=True))
    print(expl_val.shape)

    sv = np.sum(np.reshape(expl_val, img_size), axis=2)
    sv01 = np.zeros(sv.shape)
    sv01[np.where(sv > 0.0)] = 1.0
    # np.array([1.0 if v > 0.0 else 0.0 for v in expl_val])
    sv = sv01
    print(sv)
    print(sv.shape)

    max_val = np.nanpercentile(np.abs(sv), 99.9)
    # plt.imshow(x)
    plt.imshow(sv, cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    plt.show()
    # shap.image_plot(expl_val, x)

    gt_val = get_pixel_importance_explanation(x, sic)
    print(gt_val.shape)
    max_val = np.nanpercentile(np.abs(gt_val), 99.9)
    # plt.imshow(x)
    plt.imshow(np.reshape(gt_val, img_size[:2]),
               cmap='RdYlBu',
               vmin=-max_val,
               vmax=max_val,
               alpha=0.7)
    plt.show()

    print(np.unique(gt_val, return_counts=True))
    print(np.unique(sv.ravel(), return_counts=True))
    print(pixel_based_similarity(sv.ravel(), gt_val))
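pixel_based_similarity is also project code. Judging from the binarized masks fed to it and the (f1, precision, recall) triple unpacked from it in the next example, a plausible stand-in:

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

def pixel_based_similarity(expl_mask, gt_mask, ret_pre_rec=False):
    # Binarize both masks and compare which pixels are flagged as important.
    expl = (np.asarray(expl_mask).ravel() != 0).astype(int)
    gt = (np.asarray(gt_mask).ravel() != 0).astype(int)
    f1 = f1_score(gt, expl, zero_division=0)
    if ret_pre_rec:
        return (f1, precision_score(gt, expl, zero_division=0),
                recall_score(gt, expl, zero_division=0))
    return f1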
Code Example #6
def main():
    n_features = (20, 20)
    img_size = (32, 32, 3)
    cell_size = (4, 4)
    colors_p = np.array([0.15, 0.7, 0.15])
    p_border = 1.0

    # img_draft = np.array([ # ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'k', 'g', 'r', 'k'],
    #     ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'g'],
    #     ['k', 'g', 'k', 'k', 'k', 'b', 'k', 'k'],
    #     ['k', 'g', 'k', 'k', 'g', 'g', 'k', 'b'],
    #     ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'g'],
    #     ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
    #     ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'k'],
    #
    # ])
    # img = generate_img_defined(img_draft, img_size=img_size, cell_size=cell_size)
    # plt.imshow(img)
    # plt.xticks(())
    # plt.yticks(())
    # # plt.savefig('../fig/pattern.png', format='png', bbox_inches='tight')
    # plt.show()

    pattern_draft = np.array([  # ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'b', 'k'],
        ['k', 'k', 'g', 'g', 'k'],
        ['k', 'k', 'g', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k'],
    ])

    pattern = generate_img_defined(pattern_draft,
                                   img_size=(20, 20, 3),
                                   cell_size=cell_size)

    sic = generate_synthetic_image_classifier(img_size=img_size,
                                              cell_size=cell_size,
                                              n_features=n_features,
                                              p_border=p_border,
                                              pattern=pattern)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    plt.imshow(pattern)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/pattern.png', format='png', bbox_inches='tight')
    plt.show()

    X_test = generate_random_img_dataset(pattern,
                                         nbr_images=1000,
                                         pattern_ratio=0.4,
                                         img_size=img_size,
                                         cell_size=cell_size,
                                         min_nbr_cells=0.1,
                                         max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test = predict(X_test)
    idx = np.where(Y_test == 1)[0][0]

    # x = X_test[idx]
    img_draft = np.array([  # ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k', 'g', 'r', 'k'],
        ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'g'],
        ['k', 'g', 'k', 'k', 'k', 'b', 'k', 'k'],
        ['k', 'g', 'k', 'k', 'g', 'g', 'k', 'b'],
        ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'g'],
        ['g', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'k', 'k', 'k', 'k'],
        ['k', 'k', 'k', 'k', 'g', 'k', 'k', 'k'],
    ])
    x = generate_img_defined(img_draft, img_size=img_size, cell_size=cell_size)
    plt.imshow(x)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/image.png', format='png', bbox_inches='tight')
    plt.show()

    gt_val = get_pixel_importance_explanation(x, sic)
    max_val = np.nanpercentile(np.abs(gt_val), 99.9)
    plt.imshow(np.reshape(gt_val, img_size[:2]),
               cmap='RdYlBu',
               vmin=-max_val,
               vmax=max_val,
               alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    # plt.savefig('../fig/saliencymap.png', format='png', bbox_inches='tight')
    plt.show()

    # plt.imshow(x)
    # plt.imshow(np.reshape(gt_val, img_size[:2]), cmap='RdYlBu', vmin=-max_val, vmax=max_val, alpha=0.7)
    # plt.xticks(())
    # plt.yticks(())
    # plt.savefig('../fig/saliencymap2.png', format='png', bbox_inches='tight')
    # plt.show()

    lime_explainer = LimeImageExplainer()
    segmenter = SegmentationAlgorithm('quickshift',
                                      kernel_size=1,
                                      max_dist=10,
                                      ratio=0.5)
    tot_num_features = img_size[0] * img_size[1]

    lime_exp = lime_explainer.explain_instance(x,
                                               predict_proba,
                                               top_labels=2,
                                               hide_color=0,
                                               num_samples=10000,
                                               segmentation_fn=segmenter)
    _, lime_expl_val = lime_exp.get_image_and_mask(
        1,
        positive_only=True,
        num_features=tot_num_features,
        hide_rest=False,
        min_weight=0.0)
    max_val = np.nanpercentile(np.abs(lime_expl_val), 99.9)
    plt.imshow(lime_expl_val,
               cmap='RdYlBu',
               vmin=-max_val,
               vmax=max_val,
               alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('lime', fontsize=20)
    plt.savefig('../fig/saliencymap_lime.png',
                format='png',
                bbox_inches='tight')
    plt.show()

    background = np.array([np.zeros(img_size).ravel()] * 10)
    shap_explainer = KernelExplainer(predict_proba, background)

    shap_expl_val = shap_explainer.shap_values(x.ravel(), l1_reg='bic')[1]
    shap_expl_val = np.sum(np.reshape(shap_expl_val, img_size), axis=2)
    tmp = np.zeros(shap_expl_val.shape)
    tmp[np.where(shap_expl_val > 0.0)] = 1.0
    shap_expl_val = tmp
    max_val = np.nanpercentile(np.abs(shap_expl_val), 99.9)
    plt.imshow(shap_expl_val,
               cmap='RdYlBu',
               vmin=-max_val,
               vmax=max_val,
               alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('shap', fontsize=20)
    plt.savefig('../fig/saliencymap_shap.png',
                format='png',
                bbox_inches='tight')
    plt.show()

    nbr_records = 10
    Xm_test = np.array([x.ravel() for x in X_test[:nbr_records]])
    maple_explainer = MAPLE(Xm_test,
                            Y_test[:nbr_records],
                            Xm_test,
                            Y_test[:nbr_records],
                            n_estimators=5,
                            max_features=0.5,
                            min_samples_leaf=5)

    maple_exp = maple_explainer.explain(x)
    maple_expl_val = maple_exp['coefs'][:-1]
    maple_expl_val = np.sum(np.reshape(maple_expl_val, img_size), axis=2)
    tmp = np.zeros(maple_expl_val.shape)
    tmp[np.where(maple_expl_val > 0.0)] = 1.0
    maple_expl_val = tmp
    max_val = np.nanpercentile(np.abs(maple_expl_val), 99.9)
    plt.imshow(maple_expl_val,
               cmap='RdYlBu',
               vmin=-max_val,
               vmax=max_val,
               alpha=0.7)
    plt.xticks(())
    plt.yticks(())
    plt.title('maple', fontsize=20)
    plt.savefig('../fig/saliencymap_maple.png',
                format='png',
                bbox_inches='tight')
    plt.show()

    lime_f1, lime_pre, lime_rec = pixel_based_similarity(lime_expl_val.ravel(),
                                                         gt_val,
                                                         ret_pre_rec=True)
    shap_f1, shap_pre, shap_rec = pixel_based_similarity(shap_expl_val.ravel(),
                                                         gt_val,
                                                         ret_pre_rec=True)
    maple_f1, maple_pre, maple_rec = pixel_based_similarity(
        maple_expl_val.ravel(), gt_val, ret_pre_rec=True)

    print(lime_f1, lime_pre, lime_rec)
    print(shap_f1, shap_pre, shap_rec)
    print(maple_f1, maple_pre, maple_rec)
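Note that KernelExplainer is built on flattened inputs (the background rows and x.ravel()) while LimeImageExplainer hands whole 32x32x3 images to the same predict_proba, so the synthetic classifier's predict_proba evidently accepts both layouts. If a classifier accepted only images, SHAP would need a small adapter; a hypothetical helper:

import numpy as np

def predict_proba_flat(X_flat, img_size=(32, 32, 3)):
    # Hypothetical adapter: reshape KernelExplainer's flattened rows back into
    # images before calling an image-only classifier (predict_proba is closed
    # over from the surrounding scope).
    X_img = np.reshape(np.atleast_2d(X_flat), (-1,) + img_size)
    return predict_proba(X_img)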
Code Example #7
def run(black_box, n_records, img_size, cell_size, n_features, p_border,
        colors_p, random_state, filename):

    sic = generate_synthetic_image_classifier(img_size=img_size,
                                              cell_size=cell_size,
                                              n_features=n_features,
                                              p_border=p_border,
                                              random_state=random_state)

    pattern = sic['pattern']
    predict = sic['predict']
    predict_proba = sic['predict_proba']

    X_test = generate_random_img_dataset(pattern,
                                         nbr_images=n_records,
                                         pattern_ratio=0.5,
                                         img_size=img_size,
                                         cell_size=cell_size,
                                         min_nbr_cells=0.1,
                                         max_nbr_cells=0.3,
                                         colors_p=colors_p)

    Y_test_proba = predict_proba(X_test)
    Y_test = predict(X_test)

    lime_explainer = LimeImageExplainer()
    segmenter = SegmentationAlgorithm('quickshift',
                                      kernel_size=1,
                                      max_dist=10,
                                      ratio=0.5)
    tot_num_features = img_size[0] * img_size[1]

    background = np.array([np.zeros(img_size).ravel()] * 10)
    shap_explainer = KernelExplainer(predict_proba, background)

    nbr_records_explainer = 10
    idx_records_train_expl = np.random.choice(range(len(X_test)),
                                              size=nbr_records_explainer,
                                              replace=False)
    idx_records_test_expl = np.random.choice(range(len(X_test)),
                                             size=nbr_records_explainer,
                                             replace=False)

    Xm_train = np.array([x.ravel() for x in X_test[idx_records_train_expl]])
    Xm_test = np.array([x.ravel() for x in X_test[idx_records_test_expl]])

    print(datetime.datetime.now(), 'build maple')
    maple_explainer = MAPLE(Xm_train,
                            Y_test_proba[idx_records_train_expl][:, 1],
                            Xm_test,
                            Y_test_proba[idx_records_test_expl][:, 1],
                            n_estimators=100,
                            max_features=0.5,
                            min_samples_leaf=2)
    print(datetime.datetime.now(), 'build maple done')

    idx = 0
    results = list()
    for x, y in zip(X_test, Y_test):
        print(datetime.datetime.now(),
              'seneca - image',
              'black_box %s' % black_box,
              'n_features %s' % str(n_features),
              'rs %s' % random_state,
              '%s/%s' % (idx, n_records),
              end=' ')

        gt_val = get_pixel_importance_explanation(x, sic)

        lime_exp = lime_explainer.explain_instance(x,
                                                   predict_proba,
                                                   top_labels=2,
                                                   hide_color=0,
                                                   num_samples=10000,
                                                   segmentation_fn=segmenter)
        _, lime_expl_val = lime_exp.get_image_and_mask(
            y,
            positive_only=True,
            num_features=tot_num_features,
            hide_rest=False,
            min_weight=0.0)

        shap_expl_val = shap_explainer.shap_values(x.ravel(), l1_reg='bic')[1]
        shap_expl_val = np.sum(np.reshape(shap_expl_val, img_size), axis=2)
        tmp = np.zeros(shap_expl_val.shape)
        tmp[np.where(shap_expl_val > 0.0)] = 1.0
        shap_expl_val = tmp

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]
        maple_expl_val = np.sum(np.reshape(maple_expl_val, img_size), axis=2)
        tmp = np.zeros(maple_expl_val.shape)
        tmp[np.where(maple_expl_val > 0.0)] = 1.0
        maple_expl_val = tmp

        lime_f1, lime_pre, lime_rec = pixel_based_similarity(
            lime_expl_val.ravel(), gt_val, ret_pre_rec=True)
        shap_f1, shap_pre, shap_rec = pixel_based_similarity(
            shap_expl_val.ravel(), gt_val, ret_pre_rec=True)
        maple_f1, maple_pre, maple_rec = pixel_based_similarity(
            maple_expl_val.ravel(), gt_val, ret_pre_rec=True)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'img_size': '"%s"' % str(img_size),
            'cell_size': '"%s"' % str(cell_size),
            'n_features': '"%s"' % str(n_features),
            'random_state': random_state,
            'idx': idx,
            'lime_f1': lime_f1,
            'lime_pre': lime_pre,
            'lime_rec': lime_rec,
            'shap_f1': shap_f1,
            'shap_pre': shap_pre,
            'shap_rec': shap_rec,
            'maple_f1': maple_f1,
            'maple_pre': maple_pre,
            'maple_rec': maple_rec,
            'p_border': p_border
        }
        results.append(res)
        print('lime %.2f' % lime_f1, 'shap %.2f' % shap_f1,
              'maple %.2f' % maple_f1)

        idx += 1

    df = pd.DataFrame(data=results)
    df = df[[
        'black_box', 'n_records', 'img_size', 'cell_size', 'n_features',
        'random_state', 'idx', 'lime_f1', 'lime_pre', 'lime_rec', 'shap_f1',
        'shap_pre', 'shap_rec', 'maple_f1', 'maple_pre', 'maple_rec',
        'p_border'
    ]]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
Code Example #8
def run(black_box, n_records, n_features, random_state, filename):

    X_train = fetch_20newsgroups(subset='train',
                                 remove=('headers', 'footers', 'quotes'),
                                 categories=None).data
    X_train = preprocess_data(X_train)

    stc = generate_synthetic_text_classifier(X_train,
                                             n_features=n_features,
                                             random_state=random_state)

    predict_proba = stc['predict_proba']
    vectorizer = stc['vectorizer']
    nbr_terms = stc['nbr_terms']

    X_test = fetch_20newsgroups(subset='test',
                                remove=('headers', 'footers', 'quotes'),
                                categories=None).data
    X_test = preprocess_data(X_test)
    X_test_nbrs = vectorizer.transform(X_test).toarray()
    Y_test = predict_proba(X_test_nbrs)

    lime_explainer = LimeTextExplainer(class_names=[0, 1])

    print(datetime.datetime.now(), 'build shap')
    reference = get_reference4shap(X_test_nbrs,
                                   stc['words_vec'],
                                   nbr_terms,
                                   nbr_references=10)
    shap_explainer = KernelExplainer(predict_proba, reference)
    print(datetime.datetime.now(), 'build shap done')

    # print(idx_records_train_expl)
    # print(X_test_nbrs[idx_records_train_expl])
    # print(Y_test[idx_records_train_expl][:, 1])
    # print(np.any(np.isnan(X_test_nbrs[idx_records_train_expl])))
    # print(np.any(np.isnan(X_test_nbrs[idx_records_test_expl])))
    # print(np.any(np.isnan(Y_test[idx_records_train_expl][:, 1])))
    # print(np.any(np.isnan(Y_test[idx_records_test_expl][:, 1])))

    nbr_records_explainer = 100
    idx_records_train_expl = np.random.choice(range(len(X_test)),
                                              size=nbr_records_explainer,
                                              replace=False)
    idx_records_test_expl = np.random.choice(range(len(X_test)),
                                             size=nbr_records_explainer,
                                             replace=False)

    print(datetime.datetime.now(), 'build maple')
    maple_explainer = MAPLE(X_test_nbrs[idx_records_train_expl],
                            Y_test[idx_records_train_expl][:, 1],
                            X_test_nbrs[idx_records_test_expl],
                            Y_test[idx_records_test_expl][:, 1],
                            n_estimators=100,
                            max_features=0.5,
                            min_samples_leaf=2)
    print(datetime.datetime.now(), 'build maple done')

    results = list()
    explained = 0
    for idx, x in enumerate(X_test):
        x_nbrs = X_test_nbrs[idx]

        print(datetime.datetime.now(),
              'seneca - text',
              'black_box %s' % black_box,
              'n_features %s' % n_features,
              'rs %s' % random_state,
              '%s/%s' % (idx, n_records),
              end=' ')

        gt_val_text = get_word_importance_explanation_text(x, stc)
        gt_val = get_word_importance_explanation(x_nbrs, stc)

        try:
            lime_exp = lime_explainer.explain_instance(x,
                                                       predict_proba,
                                                       num_features=n_features)
            lime_expl_val = {e[0]: e[1] for e in lime_exp.as_list()}

            shap_expl_val = shap_explainer.shap_values(x_nbrs, l1_reg='bic')[1]

            maple_exp = maple_explainer.explain(x_nbrs)
            maple_expl_val = maple_exp['coefs'][:-1]
        except ValueError:
            print(datetime.datetime.now(), 'Error in explanation')
            continue

        lime_cs = word_based_similarity_text(lime_expl_val,
                                             gt_val_text,
                                             use_values=True)
        lime_f1, lime_pre, lime_rec = word_based_similarity_text(
            lime_expl_val, gt_val_text, use_values=False, ret_pre_rec=True)

        shap_cs = word_based_similarity(shap_expl_val, gt_val, use_values=True)
        shap_f1, shap_pre, shap_rec = word_based_similarity(shap_expl_val,
                                                            gt_val,
                                                            use_values=False,
                                                            ret_pre_rec=True)

        maple_cs = word_based_similarity(maple_expl_val,
                                         gt_val,
                                         use_values=True)
        maple_f1, maple_pre, maple_rec = word_based_similarity(
            maple_expl_val, gt_val, use_values=False, ret_pre_rec=True)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'nbr_terms': nbr_terms,
            'n_features': n_features,
            'random_state': random_state,
            'idx': idx,
            'lime_cs': lime_cs,
            'lime_f1': lime_f1,
            'lime_pre': lime_pre,
            'lime_rec': lime_rec,
            'shap_cs': shap_cs,
            'shap_f1': shap_f1,
            'shap_pre': shap_pre,
            'shap_rec': shap_rec,
            'maple_cs': maple_cs,
            'maple_f1': maple_f1,
            'maple_pre': maple_pre,
            'maple_rec': maple_rec,
        }
        results.append(res)
        print('lime %.2f %.2f' % (lime_cs, lime_f1),
              'shap %.2f %.2f' % (shap_cs, shap_f1),
              'maple %.2f %.2f' % (maple_cs, maple_f1))

        explained += 1
        if explained >= n_records:
            break

    df = pd.DataFrame(data=results)
    df = df[[
        'black_box',
        'n_records',
        'nbr_terms',
        'n_features',
        'random_state',
        'idx',
        'lime_cs',
        'lime_f1',
        'lime_pre',
        'lime_rec',
        'shap_cs',
        'shap_f1',
        'shap_pre',
        'shap_rec',
        'maple_cs',
        'maple_f1',
        'maple_pre',
        'maple_rec',
    ]]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
Code Example #9
def run(black_box, n_records, n_all_features, n_features, random_state, filename):

    n = n_records
    m = n_all_features

    p_binary = 0.7
    p_parenthesis = 0.3

    slc = generate_synthetic_linear_classifier(expr=None, n_features=n_features, n_all_features=m,
                                               random_state=random_state,
                                               p_binary=p_binary, p_parenthesis=p_parenthesis)
    expr = slc['expr']

    X = slc['X']
    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    # predict = slc['predict']

    X_test = np.random.uniform(np.min(X), np.max(X), size=(n, m))
    Y_test = predict_proba(X_test)[:, 1]

    lime_explainer = LimeTabularExplainer(X_test, feature_names=feature_names, class_names=class_values,
                                          discretize_continuous=False, discretizer='entropy')

    reference = np.zeros(m)
    shap_explainer = KernelExplainer(predict_proba, np.reshape(reference, (1, len(reference))))

    maple_explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    results = list()
    for idx, x in enumerate(X_test):
        gt_val = get_feature_importance_explanation(x, slc, n_features, get_values=True)

        lime_exp = lime_explainer.explain_instance(x, predict_proba, num_features=m)
        # lime_exp_as_dict = {e[0]: e[1] for e in lime_exp.as_list()}
        # lime_expl_val = np.asarray([lime_exp_as_dict.get('x%s' % i, .0) for i in range(m)])
        lime_expl_val = np.array([e[1] for e in lime_exp.as_list()])

        shap_expl_val = shap_explainer.shap_values(x, l1_reg='bic')[1]

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]

        lime_fis = feature_importance_similarity(lime_expl_val, gt_val)
        shap_fis = feature_importance_similarity(shap_expl_val, gt_val)
        maple_fis = feature_importance_similarity(maple_expl_val, gt_val)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'n_all_features': n_all_features,
            'n_features': n_features,
            'random_state': random_state,
            'idx': idx,
            'lime': lime_fis,
            'shap': shap_fis,
            'maple': maple_fis,
            'expr': expr,
        }
        results.append(res)
        print(datetime.datetime.now(), 'syege - tlsb', 'black_box %s' % black_box,
              'n_all_features %s' % n_all_features, 'n_features %s' % n_features, 'rs %s' % random_state,
              '%s %s' % (idx, n_records), expr,
              'lime %.2f' % lime_fis, 'shap %.2f' % shap_fis, 'maple %.2f' % maple_fis)

        if idx > 0 and idx % 10 == 0:
            df = pd.DataFrame(data=results)
            df = df[['black_box', 'n_records', 'n_all_features', 'n_features', 'random_state', 'expr',
                     'idx', 'lime', 'shap', 'maple']]
            # print(df.head())

            if not os.path.isfile(filename):
                df.to_csv(filename, index=False)
            else:
                df.to_csv(filename, mode='a', index=False, header=False)
            results = list()

    df = pd.DataFrame(data=results)
    df = df[['black_box', 'n_records', 'n_all_features', 'n_features', 'random_state', 'expr',
             'idx', 'lime', 'shap', 'maple']]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)
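The flush-every-10-records pattern above (write the CSV header only if the file does not exist yet, then append) also closes the other run variants in this collection; factored out as a refactoring sketch rather than project code:

import os
import pandas as pd

def append_results(results, filename, columns):
    # Write the header only on the file's first write; afterwards append rows.
    df = pd.DataFrame(data=results)[columns]
    df.to_csv(filename, mode='a', index=False,
              header=not os.path.isfile(filename))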
Code Example #10
File: run.py Project: mtaufeeq/MAPLE
def run(args):
    # Hyperparameters
    num_perturbations = 5
    
    # Fixes an issue where threads inherit the same rng state
    scipy.random.seed()
    
    # Arguments
    dataset = args[0]
    trial = args[1]
    
    # Output
    out = {}
    file = open("Trials/" + dataset + "_" + str(trial) + ".json", "w")

    # Load data
    X_train, y_train, X_valid, y_valid, X_test, y_test, train_mean, train_stddev = load_normalize_data(
        "../Datasets/" + dataset + ".csv")
    n = X_test.shape[0]
    d = X_test.shape[1]
    
    scales = [0.1, 0.25]
    scales_len = len(scales)
        
    # Fit model
    model = fit_svr(X_train, y_train, X_test, y_test)
    out["model_rmse"] = np.sqrt(np.mean((y_test - model.predict(X_test))**2))
        
    # Fit LIME and MAPLE explainers to the model
    exp_lime = lime_tabular.LimeTabularExplainer(X_train, discretize_continuous=False, mode="regression")
    exp_maple = MAPLE(X_train, model.predict(X_train), X_valid, model.predict(X_valid))
        
    # Evaluate model faithfulness on the test set
    lime_rmse = np.zeros((scales_len))
    maple_rmse = np.zeros((scales_len))
    
    for i in range(n):
        x = X_test[i, :]
        
        coefs_lime = unpack_coefs(exp_lime, x, model.predict, d, X_train)  # Allow full number of features
    
        e_maple = exp_maple.explain(x)
        coefs_maple = e_maple["coefs"]
        
        for j in range(num_perturbations):

            noise = np.random.normal(loc=0.0, scale=1.0, size=d)

            for k in range(scales_len):
                scale = scales[k]

                x_pert = x + scale * noise

                model_pred = model.predict(x_pert.reshape(1, -1))
                lime_pred = np.dot(np.insert(x_pert, 0, 1), coefs_lime)
                maple_pred = np.dot(np.insert(x_pert, 0, 1), coefs_maple)

                lime_rmse[k] += (lime_pred - model_pred)**2
                maple_rmse[k] += (maple_pred - model_pred)**2

    lime_rmse /= n * num_perturbations
    maple_rmse /= n * num_perturbations

    lime_rmse = np.sqrt(lime_rmse)
    maple_rmse = np.sqrt(maple_rmse)

    out["lime_rmse_0.1"] = lime_rmse[0]
    out["maple_rmse_0.1"] = maple_rmse[0]
    out["lime_rmse_0.25"] = lime_rmse[1]
    out["maple_rmse_0.25"] = maple_rmse[1]

    json.dump(out, file)
    file.close()
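fit_svr is defined elsewhere in the project (it also receives the test split, presumably for reporting); a minimal stand-in consistent with its use above, assuming an RBF-kernel SVR tuned over a small grid:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

def fit_svr(X_train, y_train, X_test, y_test):
    # Hedged sketch: X_test/y_test are accepted only for signature
    # compatibility; the real helper may tune and report differently.
    grid = GridSearchCV(SVR(kernel="rbf"),
                        param_grid={"C": [0.1, 1.0, 10.0],
                                    "gamma": ["scale", 0.01, 0.1]},
                        cv=3)
    grid.fit(X_train, y_train)
    return grid.best_estimator_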
Code Example #11
def run(black_box, n_records, n_all_features, n_features, n_coefficients,
        random_state, filename):

    n = n_records
    m = n_all_features

    slc = generate_synthetic_linear_classifier2(n_features=n_features,
                                                n_all_features=n_all_features,
                                                n_coefficients=n_coefficients,
                                                random_state=random_state)

    feature_names = slc['feature_names']
    class_values = slc['class_values']
    predict_proba = slc['predict_proba']
    # predict = slc['predict']

    X_test = np.random.uniform(size=(n, n_all_features))
    Xz = list()
    for x in X_test:
        nz = np.random.randint(0, n_features)
        zeros_idx = np.random.choice(np.arange(n_features),
                                     size=nz,
                                     replace=False)
        x[zeros_idx] = 0.0
        Xz.append(x)
    X_test = np.array(Xz)

    Y_test = predict_proba(X_test)[:, 1]

    lime_explainer = LimeTabularExplainer(X_test,
                                          feature_names=feature_names,
                                          class_names=class_values,
                                          discretize_continuous=False,
                                          discretizer='entropy')

    reference = np.zeros(m)
    shap_explainer = KernelExplainer(
        predict_proba, np.reshape(reference, (1, len(reference))))

    maple_explainer = MAPLE(X_test, Y_test, X_test, Y_test)

    results = list()
    for idx, x in enumerate(X_test):
        gt_val = get_feature_importance_explanation2(x,
                                                     slc,
                                                     n_features,
                                                     n_all_features,
                                                     get_values=True)
        gt_val_bin = get_feature_importance_explanation2(x,
                                                         slc,
                                                         n_features,
                                                         n_all_features,
                                                         get_values=False)

        lime_exp = lime_explainer.explain_instance(x,
                                                   predict_proba,
                                                   num_features=m)
        lime_expl_val = np.array([e[1] for e in lime_exp.as_list()])
        tmp = np.zeros(lime_expl_val.shape)
        tmp[np.where(lime_expl_val != 0.0)] = 1.0
        lime_expl_val_bin = tmp

        shap_expl_val = shap_explainer.shap_values(x, l1_reg='bic')[1]
        tmp = np.zeros(shap_expl_val.shape)
        tmp[np.where(shap_expl_val != 0.0)] = 1.0
        shap_expl_val_bin = tmp

        maple_exp = maple_explainer.explain(x)
        maple_expl_val = maple_exp['coefs'][:-1]
        tmp = np.zeros(maple_expl_val.shape)
        tmp[np.where(maple_expl_val != 0.0)] = 1.0
        maple_expl_val_bin = tmp

        lime_fis = feature_importance_similarity(lime_expl_val, gt_val)
        shap_fis = feature_importance_similarity(shap_expl_val, gt_val)
        maple_fis = feature_importance_similarity(maple_expl_val, gt_val)

        lime_rbs = rule_based_similarity(lime_expl_val_bin, gt_val_bin)
        shap_rbs = rule_based_similarity(shap_expl_val_bin, gt_val_bin)
        maple_rbs = rule_based_similarity(maple_expl_val_bin, gt_val_bin)

        # print(gt_val)
        # print(lime_expl_val)
        # print(shap_expl_val)
        # print(maple_expl_val)

        res = {
            'black_box': black_box,
            'n_records': n_records,
            'n_all_features': n_all_features,
            'n_features': n_features,
            'n_coefficients': n_coefficients,
            'random_state': random_state,
            'idx': idx,
            'lime_cs': lime_fis,
            'shap_cs': shap_fis,
            'maple_cs': maple_fis,
            'lime_f1': lime_rbs,
            'shap_f1': shap_rbs,
            'maple_f1': maple_rbs,
        }
        results.append(res)
        print(datetime.datetime.now(), 'syege - tlsb2',
              'black_box %s' % black_box, 'n_all_features %s' % n_all_features,
              'n_features %s' % n_features,
              'n_coefficients %s' % n_coefficients, 'rs %s' % random_state,
              '%s %s' % (idx, n_records),
              'lime %.2f %.2f' % (lime_fis, lime_rbs),
              'shap %.2f %.2f' % (shap_fis, shap_rbs),
              'maple %.2f %.2f' % (maple_fis, maple_rbs))

    df = pd.DataFrame(data=results)
    df = df[[
        'black_box', 'n_records', 'n_all_features', 'n_features',
        'n_coefficients', 'random_state', 'idx', 'lime_cs', 'shap_cs',
        'maple_cs', 'lime_f1', 'shap_f1', 'maple_f1'
    ]]
    # print(df.head())

    if not os.path.isfile(filename):
        df.to_csv(filename, index=False)
    else:
        df.to_csv(filename, mode='a', index=False, header=False)