Example #1
def run_pred_combi(dataset_name, save_dir, train_dev_tokens, test_tokens,
                   test_labels, d_pred, combinations, k_list, second):
    # For every pair of models, compare their top-k explanations (Jaccard
    # similarity), split into instances where the two models make different
    # vs. the same prediction.
    for idx, combi in enumerate(combinations):
        model1, model2 = combi[0], combi[1]
        print('{}: {} vs. {}'.format(dataset_name, model1, model2))
        dicts1, dict_keys1 = als.create_model_d(save_dir, model1, test_labels)
        dicts2, dict_keys2 = als.create_model_d(save_dir, model2, test_labels)
        keys_different, keys_same, keys_different_err, keys_same_err = [], [], [], []
        for key in dict_keys1:
            different, different_err, same, same_err = als.get_k_combi_pred(
                train_dev_tokens, test_tokens, test_labels, dicts1, dicts2,
                key, model1, model2, k_list, save_dir, d_pred)
            keys_different.append(different)
            keys_same.append(same)
            keys_different_err.append(different_err)
            keys_same_err.append(same_err)
        y_data = [keys_different, keys_same]
        y_err = [keys_different_err, keys_same_err]

        # Pad the observed range by 0.05 and clamp the y-axis to [0, 1].
        diff_min_val, diff_max_val = als.get_min_max(keys_different)
        same_min_val, same_max_val = als.get_min_max(keys_same)
        y_min = max(0, min(diff_min_val, same_min_val) - 0.05)
        y_max = min(1, max(diff_max_val, same_max_val) + 0.05)

        file_name = '{}_{}_{}'.format(dataset_name, model1, model2)
        als.show_pred_plot(k_list, y_data, 'Number of important features (k)',
                           'Jaccard Similarity', file_name, (12, 12), '',
                           y_err=y_err, y_min=y_min, y_max=y_max,
                           combi_index=idx, second=second)
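A minimal call sketch for this example, assuming the data loading and model-pair helpers used in the other snippets (utils.load_data, als.get_model_combinations); the dataset name, save directory, k values and the d_pred lookup are placeholders, not values from the project.

# Hypothetical invocation; all literal values below are placeholders.
train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data('my_dataset')
combinations = als.get_model_combinations()   # model pairs, as used in Example #3
d_pred = {}                                   # assumption: per-model prediction lookup built elsewhere
run_pred_combi('my_dataset', './results/', train_dev_tokens, test_tokens,
               test_labels, d_pred, combinations, k_list=[1, 2, 5, 10],
               second=False)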
Example #2
import numpy as np  # als, utils, generate_simi_change and show_simi_plot are project-local helpers (imports omitted here)


def run_comp_builtin_posthoc(dataset_name,
                             save_dir,
                             k_list,
                             models,
                             feature_types,
                             second=False):
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
    train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    y_data, y_err_data = [], []
    min_vals, max_vals = [], []
    # Collect one similarity curve over k (with error bars) per feature type,
    # comparing built-in vs. post-hoc explanations across model combinations.
    for feature_type in feature_types:
        all_combi_data, y_err = generate_simi_change(save_dir, '', test_tokens, test_labels,
                                                     dataset_name, k_list,
                                                     explainer_name=feature_type, y_err=True,
                                                     second=second, if_comp_model=True)
        assert len(all_combi_data) == len(y_err)
        tmp_min_val, tmp_max_val = als.get_min_max(all_combi_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(all_combi_data)
        y_err_data.append(y_err)

    # Pad the observed range by 0.05; the lower bound is clamped at 0.
    y_min_val = max(0, np.min(min_vals) - 0.05)
    y_max_val = np.max(max_vals) + 0.05
    show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Jaccard Similarity', '',
                   (13, 12), '', y_err=y_err_data, x_min=np.min(k_list) - 0.5,
                   x_max=np.max(k_list) + 0.5, y_min=y_min_val, y_max=y_max_val,
                   if_model=False, dataset_name=dataset_name, model_combi=True,
                   second=second, if_builtin_posthoc=True)
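A brief, purely illustrative call sketch; the model and feature-type names below are placeholders, not identifiers from the project.

run_comp_builtin_posthoc('my_dataset', './results/',
                         k_list=[1, 2, 5, 10],
                         models=['model_a', 'model_b'],        # placeholders
                         feature_types=['feature_a', 'feature_b'],  # placeholders
                         second=False)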
Example #3
import numpy as np  # als, utils and simi are project-local helper modules (imports omitted here)


def run_simi_length(dataset_name,
                    save_dir,
                    k_list,
                    models,
                    feature_types,
                    folder_name,
                    var,
                    second=False):
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
    train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    # Per-instance variable to correlate against: token length or token ratio.
    if var == 'len':
        variable_l = als.get_tokens_length(test_tokens)
    else:
        variable_l = als.get_tokens_ratio(test_tokens)
    # Generate the per-model line plot of Spearman correlation over k.
    y_data, min_vals, max_vals = [], [], []
    combinations = als.get_model_combinations()
    for model_name in models:
        dicts, dict_keys = als.create_model_d(save_dir, model_name,
                                              test_labels)
        all_combi_data = als.get_rho(test_tokens, dicts, combinations, k_list,
                                     variable_l)
        tmp_min_val, tmp_max_val = als.get_min_max(all_combi_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(all_combi_data)
    # Pad the observed range and keep the zero line inside the plotted range.
    y_min_val = min(np.min(min_vals) - 0.05, -0.05)
    y_max_val = max(np.max(max_vals) + 0.05, 0.05)
    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Spearman correlation', '',
                        (13, 12), '', x_min=np.min(k_list) - 0.5, x_max=np.max(k_list) + 0.5,
                        y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_builtin_posthoc=True)
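# Usage sketch for run_simi_length (argument values are placeholders):
# var='len' selects token length (als.get_tokens_length) as the variable,
# any other value selects the token-ratio branch (als.get_tokens_ratio).
# Note that feature_types and folder_name are accepted but not used above.
# run_simi_length('my_dataset', './results/', [1, 2, 5, 10],
#                 models=['model_a', 'model_b'], feature_types=[],
#                 folder_name='plots', var='len', second=False)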


def run_entropy(dataset_name,
                save_dir,
                k_list,
                models,
                feature_types,
                second=False):
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
    train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)

    # One entropy curve over k per model.
    y_data, min_vals, max_vals = [], [], []
    for model_name in models:
        dicts, d_keys = als.create_model_d(save_dir,
                                           model_name,
                                           test_labels=test_labels)
        tmp_y_data = als.get_entropy(test_tokens, dicts, d_keys, k_list)
        assert len(tmp_y_data) == len(d_keys)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(tmp_y_data)
    # Pad the observed range by 0.25; the lower bound is clamped at 0.
    y_min_val = max(0, np.min(min_vals) - 0.25)
    y_max_val = np.max(max_vals) + 0.25
    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Entropy', '',
                        (13, 12), '', x_min=np.min(k_list) - 0.5, x_max=np.max(k_list) + 0.5,
                        y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_combi=False, if_builtin_posthoc=True)


def run_js_pos(dataset_name,
               data_dir,
               save_dir,
               models,
               feature_types,
               k_list,
               second=False):
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
    train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    train_pos, dev_pos, train_dev_pos, test_pos = utils.get_pos(
        dataset_name, data_dir)
    token_pos_d = als.get_token_pos_d(test_tokens, test_pos)
    # Compare the POS distribution of the top-k features against the background
    # distribution (Jensen-Shannon), one curve per model.
    y_data, min_vals, max_vals = [], [], []
    for model_name in models:
        dicts, d_keys = als.create_model_d(save_dir,
                                           model_name,
                                           test_labels=test_labels)
        tmp_y_data = get_jensen_shannon(test_tokens, dicts, d_keys, k_list, 'background',
                                        combinations=d_keys, token_pos_d=token_pos_d)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(tmp_y_data)
    y_min_val = np.min(min_vals) - 0.05
    y_max_val = np.max(max_vals) + 0.05
    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Jensen-Shannon Score', '',
                        (13, 12), '', y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_combi=False, if_background=True, if_builtin_posthoc=True)


def run_pos_percent(dataset_name, data_dir, save_dir, models, feature_types,
                    k):
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
    train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    train_pos, dev_pos, train_dev_pos, test_pos = utils.get_pos(
        dataset_name, data_dir)

    pos_types = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET']
    token_pos_d = als.get_token_pos_d(test_tokens, test_pos)
    vocab_size = len(test_tokens) * k
    # One bar plot per feature type: percentage of each POS tag among the
    # top-k features, with the background corpus as the first group of bars.
    for feature_type in feature_types:
        dicts, d_keys = als.create_explainer_d(save_dir,
                                               feature_type,
                                               len(test_labels),
                                               test_labels=test_labels)
        tmp_y_data = als.get_combi_pos(d_keys, dicts, test_tokens, k,
                                       token_pos_d, vocab_size)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)

        # Percentages: pad by 1 and clamp to [0, 100].
        y_min_val = max(tmp_min_val - 1, 0)
        y_max_val = min(tmp_max_val + 1, 100)

        y_data = als.format_pos_data(tmp_y_data, pos_types)
        assert len(y_data) == len(pos_types)
        display_model_names = als.get_explainer_combinations(combi=False)
        x_data = ['Background']
        for model_name in display_model_names:
            x_data.append(model_name)

        show_bar_plot(x_data, y_data, '', 'Percentage',
                      '', (15, 14), '', y_min=y_min_val,
                      y_max=y_max_val, labels=pos_types)
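A hedged usage sketch for the last two functions; the dataset name, directories, model and feature-type names are placeholders, and k=5 is an arbitrary choice.

# Hypothetical invocations; all literal values are placeholders.
run_js_pos('my_dataset', './data/', './results/',
           models=['model_a', 'model_b'],
           feature_types=['feature_a'],
           k_list=[1, 2, 5, 10], second=False)
run_pos_percent('my_dataset', './data/', './results/',
                models=['model_a', 'model_b'],
                feature_types=['feature_a'],
                k=5)  # run_pos_percent takes a single k, not a k_list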