def run_pred_combi(dataset_name, save_dir, train_dev_tokens, test_tokens,
                   test_labels, d_pred, combinations, k_list, second):
    """For each model pair, plot the Jaccard similarity of the top-k important
    features against k, split into the 'different' and 'same' groups returned
    by als.get_k_combi_pred."""
    for idx, combi in enumerate(combinations):
        model1, model2 = combi[0], combi[1]
        print('{}: {} vs. {}'.format(dataset_name, model1, model2))

        # Load the saved importance dictionaries for both models.
        dicts1, dict_keys1 = als.create_model_d(save_dir, model1, test_labels)
        dicts2, dict_keys2 = als.create_model_d(save_dir, model2, test_labels)

        keys_different, keys_same, keys_different_err, keys_same_err = [], [], [], []
        for key in dict_keys1:
            different, different_err, same, same_err = als.get_k_combi_pred(
                train_dev_tokens, test_tokens, test_labels, dicts1, dicts2, key,
                model1, model2, k_list, save_dir, d_pred)
            keys_different.append(different)
            keys_same.append(same)
            keys_different_err.append(different_err)
            keys_same_err.append(same_err)

        y_data = [keys_different, keys_same]
        y_err = [keys_different_err, keys_same_err]

        # Pad the y-axis range slightly and clamp it to [0, 1].
        diff_min_val, diff_max_val = als.get_min_max(keys_different)
        same_min_val, same_max_val = als.get_min_max(keys_same)
        y_min = max(0, min(diff_min_val, same_min_val) - 0.05)
        y_max = min(1, max(diff_max_val, same_max_val) + 0.05)

        file_name = '{}_{}_{}'.format(dataset_name, model1, model2)
        als.show_pred_plot(k_list, y_data, 'Number of important features (k)',
                           'Jaccard Similarity', file_name, (12, 12), '',
                           y_err=y_err, y_min=y_min, y_max=y_max,
                           combi_index=idx, second=second)
def run_comp_builtin_posthoc(dataset_name, save_dir, k_list, models, feature_types, second=False):
    """Compare built-in and post-hoc explainers: collect Jaccard-similarity curves
    over k for each feature type via generate_simi_change and plot them together."""
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
        train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)

    y_data, y_err_data = [], []
    min_vals, max_vals, y_min_val, y_max_val = [], [], 0, 0
    for feature_type in feature_types:
        all_combi_data, y_err = generate_simi_change(save_dir, '', test_tokens, test_labels,
                                                     dataset_name, k_list,
                                                     explainer_name=feature_type, y_err=True,
                                                     second=second, if_comp_model=True)
        assert len(all_combi_data) == len(y_err)
        tmp_min_val, tmp_max_val = als.get_min_max(all_combi_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(all_combi_data)
        y_err_data.append(y_err)

    # Pad the y-axis range slightly and keep the lower bound at or above 0.
    y_min_val = max(0, np.min(min_vals) - 0.05)
    y_max_val = np.max(max_vals) + 0.05

    show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Jaccard Similarity', '',
                   (13, 12), '', y_err=y_err_data,
                   x_min=np.min(k_list) - 0.5, x_max=np.max(k_list) + 0.5,
                   y_min=y_min_val, y_max=y_max_val, if_model=False, dataset_name=dataset_name,
                   model_combi=True, second=second, if_builtin_posthoc=True)
def run_simi_length(dataset_name, save_dir, k_list, models, feature_types, folder_name, var, second=False):
    """For each model, compute Spearman correlations (via als.get_rho) against an
    instance-level variable (token length if var == 'len', otherwise token ratio)
    and plot them against k."""
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
        train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)

    if var == 'len':
        variable_l = als.get_tokens_length(test_tokens)
    else:
        variable_l = als.get_tokens_ratio(test_tokens)

    # generate models line plot
    y_data, min_vals, max_vals, y_min_val, y_max_val = [], [], [], 0, 0
    for model_name in models:
        combinations = als.get_model_combinations()
        dicts, dict_keys = als.create_model_d(save_dir, model_name, test_labels)
        all_combi_data = als.get_rho(test_tokens, dicts, combinations, k_list, variable_l)
        tmp_min_val, tmp_max_val = als.get_min_max(all_combi_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(all_combi_data)

    # Pad the y-axis range slightly and make sure it extends past zero in both
    # directions, since correlations can be negative.
    y_min_val = min(np.min(min_vals) - 0.05, -0.05)
    y_max_val = max(np.max(max_vals) + 0.05, 0.05)

    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Spearman correlation', '',
                        (13, 12), '', x_min=np.min(k_list) - 0.5, x_max=np.max(k_list) + 0.5,
                        y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_builtin_posthoc=True)
def run_entropy(dataset_name, save_dir, k_list, models, feature_types, second=False):
    """For each model, compute entropy curves over k via als.get_entropy and plot them."""
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
        train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)

    y_data, min_vals, max_vals, y_min_val, y_max_val = [], [], [], 0, 0
    for model_name in models:
        dicts, d_keys = als.create_model_d(save_dir, model_name, test_labels=test_labels)
        tmp_y_data = als.get_entropy(test_tokens, dicts, d_keys, k_list)
        assert len(tmp_y_data) == len(d_keys)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(tmp_y_data)

    # Pad the y-axis range and keep the lower bound at or above 0.
    y_min_val = max(0, np.min(min_vals) - 0.25)
    y_max_val = np.max(max_vals) + 0.25

    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Entropy', '',
                        (13, 12), '', x_min=np.min(k_list) - 0.5, x_max=np.max(k_list) + 0.5,
                        y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_combi=False, if_builtin_posthoc=True)
def run_js_pos(dataset_name, data_dir, save_dir, models, feature_types, k_list, second=False):
    """For each model, compare the POS-tag distribution of the top-k features with
    the background distribution (via get_jensen_shannon) and plot the scores against k."""
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
        train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    train_pos, dev_pos, train_dev_pos, test_pos = utils.get_pos(dataset_name, data_dir)
    token_pos_d = als.get_token_pos_d(test_tokens, test_pos)

    # compare with background
    y_data, min_vals, max_vals, y_min_val, y_max_val = [], [], [], 0, 0
    for model_name in models:
        dicts, d_keys = als.create_model_d(save_dir, model_name, test_labels=test_labels)
        tmp_y_data = get_jensen_shannon(test_tokens, dicts, d_keys, k_list, 'background',
                                        combinations=d_keys, token_pos_d=token_pos_d)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)
        min_vals.append(tmp_min_val)
        max_vals.append(tmp_max_val)
        y_data.append(tmp_y_data)

    # Pad the y-axis range slightly.
    y_min_val = np.min(min_vals) - 0.05
    y_max_val = np.max(max_vals) + 0.05

    simi.show_simi_plot(k_list, y_data, 'Number of important features (k)', 'Jensen-Shannon Score', '',
                        (13, 12), '', y_min=y_min_val, y_max=y_max_val, if_model=True, second=second,
                        if_combi=False, if_background=True, if_builtin_posthoc=True)
def run_pos_percent(dataset_name, data_dir, save_dir, models, feature_types, k):
    """For each explainer, compute the percentage of POS tags among the top-k
    features (via als.get_combi_pos) and draw a bar plot alongside the background."""
    train_tokens, dev_tokens, train_dev_tokens, test_tokens, \
        train_labels, dev_labels, train_dev_labels, test_labels = utils.load_data(dataset_name)
    train_pos, dev_pos, train_dev_pos, test_pos = utils.get_pos(dataset_name, data_dir)

    pos_types = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON', 'DET']
    token_pos_d = als.get_token_pos_d(test_tokens, test_pos)
    vocab_size = len(test_tokens) * k

    min_vals, max_vals, y_min_val, y_max_val = [], [], 0, 0
    for idx, feature_type in enumerate(feature_types):
        dicts, d_keys = als.create_explainer_d(save_dir, feature_type, len(test_labels),
                                               test_labels=test_labels)
        tmp_y_data = als.get_combi_pos(d_keys, dicts, test_tokens, k, token_pos_d, vocab_size)
        tmp_min_val, tmp_max_val = als.get_min_max(tmp_y_data)
        # Pad the y-axis range slightly and clamp it to [0, 100].
        y_min_val = max(tmp_min_val - 1, 0)
        y_max_val = min(tmp_max_val + 1, 100)

        y_data = als.format_pos_data(tmp_y_data, pos_types)
        assert len(y_data) == len(pos_types)

        # x-axis labels: 'Background' followed by one label per returned name.
        display_model_names = als.get_explainer_combinations(combi=False)
        display_feature_names = als.get_model_combinations(combi=False)
        x_data = ['Background']
        for model_name in display_model_names:
            x_data.append('{}'.format(model_name))

        show_bar_plot(x_data, y_data, '', 'Percentage',
                      '', (15, 14), '', y_min=y_min_val,
                      y_max=y_max_val, labels=pos_types)
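# Minimal driver sketch showing how these runners might be invoked. The dataset
# name, directories, model names, explainer names, and k values below are
# illustrative assumptions, not taken from the original project configuration.
if __name__ == '__main__':
    dataset_name = 'sst'                           # hypothetical dataset identifier
    data_dir = './data'                            # hypothetical raw-data directory
    save_dir = './saved_scores'                    # hypothetical directory of saved importance scores
    models = ['lstm', 'bert']                      # hypothetical model names
    feature_types = ['lime', 'shap', 'attention']  # hypothetical explainer names
    k_list = [1, 3, 5, 10]

    run_comp_builtin_posthoc(dataset_name, save_dir, k_list, models, feature_types)
    run_simi_length(dataset_name, save_dir, k_list, models, feature_types,
                    folder_name='plots', var='len')
    run_entropy(dataset_name, save_dir, k_list, models, feature_types)
    run_js_pos(dataset_name, data_dir, save_dir, models, feature_types, k_list)
    run_pos_percent(dataset_name, data_dir, save_dir, models, feature_types, k=5)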